diff --git a/samples/cpp/text_generation/CMakeLists.txt b/samples/cpp/text_generation/CMakeLists.txt index ebaf32c7f4..f97cd4356c 100644 --- a/samples/cpp/text_generation/CMakeLists.txt +++ b/samples/cpp/text_generation/CMakeLists.txt @@ -9,7 +9,7 @@ find_package(OpenVINOGenAI REQUIRED ) function(add_sample_executable target_name) - add_executable(${target_name} ${target_name}.cpp) + add_executable(${target_name} ${target_name}.cpp read_prompt_from_file.cpp) target_link_libraries(${target_name} PRIVATE openvino::genai) set_target_properties(${target_name} PROPERTIES # Ensure out-of-box LC_RPATH on macOS with SIP @@ -29,7 +29,8 @@ set (SAMPLE_LIST lora_greedy_causal_lm multinomial_causal_lm prompt_lookup_decoding_lm - speculative_decoding_lm) + speculative_decoding_lm + eagle_speculative_lm) foreach(sample IN LISTS SAMPLE_LIST) add_sample_executable(${sample}) diff --git a/samples/cpp/text_generation/README.md b/samples/cpp/text_generation/README.md index 88e6739242..0e8d6e9717 100644 --- a/samples/cpp/text_generation/README.md +++ b/samples/cpp/text_generation/README.md @@ -230,6 +230,37 @@ Recommended models: `meta-llama/Llama-3.2-1B-Instruct`, `meta-llama/Llama-3.2-8B **Note:** Structured output enforcement ensures valid JSON formatting, but does not guarantee factual accuracy or meaningfulness. The model may generate plausible-looking JSON with incorrect or nonsensical data (e.g., `{"explanation": "John", "output": 200000}` or `{"final_answer": "AbrakaKadabra9999######4242"}`). For best results, use the latest or fine-tuned models to improve output quality and relevance. +### 6. Eagle Speculative LM (`eagle_speculative_lm`) +- **Description:** +EAGLE is a lossless acceleration algorithm for LLM inference. + +- **Convert model** +If you have your own draft model, you can refer to https://jira.devtools.intel.com/browse/CVS-171947 to convert the model. 
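+For reference, the new sample takes five positional arguments: the main model directory, the EAGLE draft model directory, `max_new_tokens`, the draft tree `depth`, and a prompt (or a path to a prompt file). An example invocation with placeholder paths and values, matching the argument parsing in `eagle_speculative_lm.cpp` (see also the Run Command section below):
+``` bash
+./eagle_speculative_lm ./llama-3.1-8b-instruct-ov-int4 ./EAGLE3-LLaMA3.1-instruct-8B-ov-int4 128 4 "Why is the Sun yellow?"
+```
+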
+We currently have a set of converted models which you can download (password: openvino): +``` bash +scp -r openvino-ci-97@10.67.108.171:~/bell/speculative_decoding/eagle3/llama-3.1-8b-instruct-ov-int4/ your_path_to_main/ +scp -r openvino-ci-97@10.67.108.171:~/bell/speculative_decoding/eagle3/EAGLE3-LLaMA3.1-instruct-8B-ov-int4/ your_path_to_draft/ +``` + +- **Run Command:** +Linux: + ```bash + source /setupvars.sh + ./eagle_speculative_lm "" + ``` +Windows: + ```bash + /setupvars.bat + eagle_speculative_lm.exe "" + ``` + +- **Benchmark Tools** +``` bash +scp openvino-ci-97@10.67.108.171:~/xufang/run_eagle_base.py your_path_to_tool +scp -r openvino-ci-97@10.67.108.171:~/xufang/data your_path_to_test_datasets +python run_eagle_base.py +``` + ## Troubleshooting ### Unicode characters encoding error on Windows diff --git a/samples/cpp/text_generation/beam_search_causal_lm.cpp b/samples/cpp/text_generation/beam_search_causal_lm.cpp index 2f50100ac6..8ae3aaf29e 100644 --- a/samples/cpp/text_generation/beam_search_causal_lm.cpp +++ b/samples/cpp/text_generation/beam_search_causal_lm.cpp @@ -15,9 +15,9 @@ int main(int argc, char* argv[]) try { ov::genai::GenerationConfig config; config.max_new_tokens = 20; - config.num_beam_groups = 3; - config.num_beams = 15; - config.diversity_penalty = 1.0f; + config.num_beam_groups = 1; + config.num_beams = 2; + //config.diversity_penalty = 1.0f; config.num_return_sequences = config.num_beams; auto beams = pipe.generate(prompts, config); diff --git a/samples/cpp/text_generation/eagle_speculative_lm.cpp b/samples/cpp/text_generation/eagle_speculative_lm.cpp new file mode 100644 index 0000000000..a0bd30e74b --- /dev/null +++ b/samples/cpp/text_generation/eagle_speculative_lm.cpp @@ -0,0 +1,125 @@ +// Copyright (C) 2023-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/speculative_decoding/perf_metrics.hpp" +#include "read_prompt_from_file.h" + +template +void print_perf_metrics(T& perf_metrics, std::string model_name) { + std::cout << "\n" << model_name << std::endl; + auto generation_duration = perf_metrics.get_generate_duration().mean; + std::cout << " Generate time: " << generation_duration << " ms" << std::endl; + std::cout << " TTFT: " << perf_metrics.get_ttft().mean << " ± " << perf_metrics.get_ttft().std << " ms" + << std::endl; + std::cout << " TPOT: " << perf_metrics.get_tpot().mean << " ± " << perf_metrics.get_tpot().std << " ms/token" + << std::endl; + std::cout << " Num generated token: " << perf_metrics.get_num_generated_tokens() << " tokens" << std::endl; + if (model_name == "Total") { + std::cout << " Total iteration number: " << perf_metrics.raw_metrics.m_new_token_times.size() << std::endl; + } else { + std::cout << " Total iteration number: " << perf_metrics.raw_metrics.m_durations.size() << std::endl; + } + if (perf_metrics.get_num_input_tokens() > 0) { + std::cout << " Input token size: " << perf_metrics.get_num_input_tokens() << std::endl; + } +} + +int main(int argc, char* argv[]) try { + if (6 != argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); + } + + std::string main_model_path = argv[1]; + std::string eagle_model_path = argv[2]; + int max_new_tokens = atoi(argv[3]); + int depth = atoi(argv[4]); + std::string prompt = argv[5]; + if (std::filesystem::is_regular_file(prompt)) { + std::string prompt_file = prompt; + prompt = utils::read_prompt(prompt_file); + } + + // Configure devices - can run main and 
eagle models on different devices + std::string main_device = "GPU", eagle_device = "GPU"; // currently only GPU is used during developing + + // Eagle Speculative settings + ov::genai::GenerationConfig config = ov::genai::greedy(); + config.max_new_tokens = max_new_tokens; + // Eagle specific parameters + config.eagle_tree_params.branching_factor = 1; // Number of candidate tokens to consider at each level + config.eagle_tree_params.tree_depth = depth; // How deep to explore the token tree + config.eagle_tree_params.total_tokens = depth + 2; // Total number of tokens to generate in eagle tree + config.num_return_sequences = 1; // only support 1 + + //config.eagle_tree_width = 3; // Number of candidate tokens to consider at each level + //config.eagle_tree_depth = 4; // How deep to explore the token tree + + // Create pipeline with eagle speculative enabled + ov::genai::LLMPipeline pipe( + main_model_path, + main_device, + ov::genai::draft_model(eagle_model_path, eagle_device), + std::pair("eagle_mode", ov::Any("EAGLE3")) // Specify eagle3 mode for draft model + ); + // Setup performance measurement + auto start_time = std::chrono::high_resolution_clock::now(); + + // Optional: Create a streaming callback for real-time token display + auto streamer = [](std::string subword) { + std::cout << subword << std::flush; + return ov::genai::StreamingStatus::RUNNING; + }; + + // Run generation with eagle speculative decoding + std::cout << "Generating with Eagle Speculative decoding:" << std::endl; + auto result = pipe.generate(prompt, config, streamer); + std::cout << std::endl; + + // Calculate and display performance metrics + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time); + std::cout << "\nGeneration completed in " << duration.count() << " ms" << std::endl; + + auto sd_perf_metrics = std::dynamic_pointer_cast(result.extended_perf_metrics); + if (sd_perf_metrics) { + print_perf_metrics(result.perf_metrics, "Total"); + print_perf_metrics(sd_perf_metrics->main_model_metrics, "MAIN MODEL"); + std::cout << " accepted token: " << sd_perf_metrics->get_num_accepted_tokens() << " tokens" << std::endl; + std::cout << " compress rate: " + << sd_perf_metrics->main_model_metrics.get_num_generated_tokens() * 1.0f / + sd_perf_metrics->main_model_metrics.raw_metrics.m_durations.size() + << std::endl; + print_perf_metrics(sd_perf_metrics->draft_model_metrics, "DRAFT MODEL"); + } + std::cout << std::endl; + + // Run without Eagle for comparison + std::cout << "\n-----------------------------" << std::endl; + std::cout << "Generating without Eagle Speculative decoding:" << std::endl; + + // Disable Eagle mode + /*config.eagle_model = false; + + start_time = std::chrono::high_resolution_clock::now(); + pipe.generate(prompt, config, streamer); + std::cout << std::endl; + */ + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time); + std::cout << "\nStandard generation completed in " << duration.count() << " ms" << std::endl; + +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} \ No newline at end of file diff --git a/samples/cpp/text_generation/greedy_causal_lm.cpp b/samples/cpp/text_generation/greedy_causal_lm.cpp index ca5e193da1..0e0a361254 100644 --- a/samples/cpp/text_generation/greedy_causal_lm.cpp +++ b/samples/cpp/text_generation/greedy_causal_lm.cpp @@ -10,11 +10,18 @@ int main(int argc, char* argv[]) try { std::string models_path = argv[1]; std::string prompt = argv[2]; std::string device = "CPU"; // GPU can be used as well - ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; config.max_new_tokens = 100; - std::string result = pipe.generate(prompt, config); + auto start_time = std::chrono::high_resolution_clock::now(); + auto streamer = [](std::string subword) { + std::cout << subword << std::flush; + return ov::genai::StreamingStatus::RUNNING; + }; + std::string result = pipe.generate(prompt, config, streamer); + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time); + std::cout << "\nGeneration completed in " << duration.count() << " ms" << std::endl; std::cout << result << std::endl; } catch (const std::exception& error) { try { diff --git a/samples/cpp/text_generation/speculative_decoding_lm.cpp b/samples/cpp/text_generation/speculative_decoding_lm.cpp index c659a75fbd..cf155d5646 100644 --- a/samples/cpp/text_generation/speculative_decoding_lm.cpp +++ b/samples/cpp/text_generation/speculative_decoding_lm.cpp @@ -11,21 +11,21 @@ int main(int argc, char* argv[]) try { throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); } - ov::genai::GenerationConfig config; + ov::genai::GenerationConfig config = ov::genai::multinomial(); config.max_new_tokens = 100; // Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually excluded // add parameter to enable speculative decoding to generate `num_assistant_tokens` candidates by draft_model per iteration config.num_assistant_tokens = 5; // add parameter to enable speculative decoding to generate candidates by draft_model while candidate probability is higher than `assistant_confidence_threshold` // config.assistant_confidence_threshold = 0.4; - + config.num_return_sequences = 1; std::string main_model_path = argv[1]; std::string draft_model_path = argv[2]; std::string prompt = argv[3]; // User can run main and draft model on different devices. // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft. 
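+    // For example, the main model can stay on GPU while the smaller draft model runs on CPU
+    // (a sketch only; pick whichever devices are available on the target machine):
+    //   std::string main_device = "GPU", draft_device = "CPU";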
- std::string main_device = "CPU", draft_device = "CPU"; + std::string main_device = "GPU", draft_device = "GPU"; ov::genai::LLMPipeline pipe( main_model_path, diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index 1a84192ead..4361bb7214 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -65,13 +65,17 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { class ContinuousBatchingImpl; class ContinuousBatchingForSpeculativeDecodingImpl; + class ContinuousBatchingForEagleDecodingImpl; class ContinuousBatchingForPromptLookupImpl; class SpeculativeDecodingImpl; + class EagleDecodingImpl; class PromptLookupImpl; friend class ContinuousBatchingForSpeculativeDecodingImpl; + friend class ContinuousBatchingForEagleDecodingImpl; friend class ContinuousBatchingForPromptLookupImpl; friend class SpeculativeDecodingImpl; + friend class EagleDecodingImpl; friend class PromptLookupImpl; std::shared_ptr m_impl; diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 3020be34bc..a006f67a1e 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -326,6 +326,16 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { size_t num_assistant_tokens = 0; size_t max_ngram_size = 0; + // eagle parameters for assisting generation + struct eagle_params { + // eagle/model/cnets.py + // total_tokens = self.total_tokens + // depth = self.depth + // top_k = self.top_k + size_t branching_factor = 1; // top-k + size_t tree_depth = 0; // How deep to look ahead, eagle tree depth, draft will run depth + 1(tree init) levels + size_t total_tokens = 1; // Total number of tokens to generate in eagle tree + } eagle_tree_params; // Structured output parameters std::optional structured_output_config; @@ -346,6 +356,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { bool is_multinomial() const; bool is_assisting_generation() const; bool is_prompt_lookup() const; + bool is_eagle_tree() const; bool is_structured_output_generation() const; OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2026.0.0 release") diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index eea94591c3..38f34c49db 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -339,7 +339,17 @@ static constexpr ov::Property scheduler_config{"scheduler_confi static constexpr ov::Property prompt_lookup{"prompt_lookup"}; /** -* @brief enable enable_save_ov_model property serves to serialize ov model (xml/bin) generated from gguf model on disk for re-use. +* @brief enable eagle_mode property serves to activate eagle decoding. +* for eagle2 now +* And create LLMPipeline instance with this config. +*/ +enum class EagleMode { + OFF = 0, // Default mode, no eagle2 optimizations + EAGLE2 = 1 // Enable eagle2 optimizations +}; +static constexpr ov::Property eagle_mode{"eagle_mode"}; + +/* @brief enable enable_save_ov_model property serves to serialize ov model (xml/bin) generated from gguf model on disk for re-use. * Set `true` to activate this mode. * And create LLMPipeline instance with this config. 
*/ diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index 763067e946..565a069803 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -134,7 +134,7 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics { MeanStdPair detokenization_duration = {-1.0f, -1.0f}; size_t num_generated_tokens; - size_t num_input_tokens; + size_t num_input_tokens = 0; float get_load_time(); // Load time in ms. size_t get_num_generated_tokens(); diff --git a/src/cpp/src/continuous_batching/block_manager.hpp b/src/cpp/src/continuous_batching/block_manager.hpp index 75e5ba2bfe..1334a2b550 100644 --- a/src/cpp/src/continuous_batching/block_manager.hpp +++ b/src/cpp/src/continuous_batching/block_manager.hpp @@ -967,7 +967,14 @@ class BlockManager { continue; last_block_ids.insert(last_block_id); - size_t needed_blocks_per_sequence = seq_group->get_num_logical_blocks() - num_physical_blocks; + // here we need to expand to 2 stages: 1 generation stage for causal LLM and num_validate_tokens stage for spec decode + size_t logical_blocks = seq_group->get_num_logical_blocks(); + size_t logical_blocks_with_1_generation_only = logical_blocks; + if (seq_group->get_num_tokens_to_validate() > 0) { + logical_blocks_with_1_generation_only = seq_group->get_num_logical_blocks_for_1_generation(); + } + + size_t needed_blocks_per_sequence = (logical_blocks == logical_blocks_with_1_generation_only ? logical_blocks : logical_blocks_with_1_generation_only) - num_physical_blocks; KVCacheBlock::Ptr last_block = block_table.back(); if (last_block->copy_on_write()) { @@ -981,11 +988,15 @@ class BlockManager { else { blocks_count += needed_blocks_per_sequence * references_count; } - } - else { + } else { // block is used only by one sequence blocks_count += needed_blocks_per_sequence; } + if (seq_group->get_num_tokens_to_validate() > 0) { + // now we need to allocate blocks for num_tokens_to_validate + size_t needed_blocks_extra = logical_blocks - logical_blocks_with_1_generation_only; + blocks_count += needed_blocks_extra * last_block->get_references_count(); + } } return blocks_count; } @@ -1009,7 +1020,29 @@ class BlockManager { } } } + void allocate_slots_for_validation(SequenceGroup::Ptr seq_group) { + std::lock_guard lock(m_cached_blocks_map_mutex); + size_t num_logical_blocks = seq_group->get_num_logical_blocks(); + std::vector running_sequences = seq_group->get_running_sequences(); + + for (size_t i = 0; i < running_sequences.size(); ++i) { + Sequence::Ptr sequence = running_sequences[i]; + auto seq_id = sequence->get_id(); + size_t num_physical_blocks = 0; + if (m_block_table.find(seq_id) != m_block_table.end()) + { + num_physical_blocks = m_block_table[seq_id][0].size(); + } + + if (num_logical_blocks > num_physical_blocks) { + OPENVINO_ASSERT(can_allocate_blocks(num_logical_blocks - num_physical_blocks)); + allocate(sequence, num_logical_blocks - num_physical_blocks, seq_group->get_prompt_len()); + } else { + OPENVINO_ASSERT(num_logical_blocks == num_physical_blocks, "A number of physical and logic blocks must be the same in this code path"); + } + } + } /** * Allocates just enough physical KV cache blocks to a sequence group to be enough for the sequences in it. 
If the sequences @@ -1023,7 +1056,7 @@ class BlockManager { std::lock_guard lock(m_cached_blocks_map_mutex); // Will always allocate the identical number of new blocks (if any) to each of the "layers" to keep the // number of blocks occupied by each "layer" identical at all times. - size_t num_logical_blocks = seq_group->get_num_logical_blocks(); + size_t num_logical_blocks = seq_group->get_num_tokens_to_validate() > 0 ? seq_group->get_num_logical_blocks_for_1_generation() :seq_group->get_num_logical_blocks(); std::vector running_sequences = seq_group->get_running_sequences(); std::map> copy_blocks_map; diff --git a/src/cpp/src/continuous_batching/cache_manager.hpp b/src/cpp/src/continuous_batching/cache_manager.hpp index 8bb651792e..c288ba6d02 100644 --- a/src/cpp/src/continuous_batching/cache_manager.hpp +++ b/src/cpp/src/continuous_batching/cache_manager.hpp @@ -234,8 +234,9 @@ class CacheManager { for (const auto & blocks_pair : block_copy_map) { size_t src_block_id = blocks_pair.first; const std::list& dst_block_ids = blocks_pair.second; + size_t decoder_layer_id = 0; for (size_t dst_block_id : dst_block_ids) { - for (size_t decoder_layer_id = 0; decoder_layer_id < m_num_decoder_layers; ++decoder_layer_id) { + //for (size_t decoder_layer_id = 0; decoder_layer_id < m_num_decoder_layers; ++decoder_layer_id) { ov::Shape key_shape = set_kv_blocks(m_key_shapes[decoder_layer_id], m_num_allocated_kv_blocks); ov::Shape value_shape = set_kv_blocks(m_value_shapes[decoder_layer_id], m_num_allocated_kv_blocks); ov::Coordinate key_src_start_roi(key_shape.size(), 0); @@ -284,7 +285,9 @@ class CacheManager { ov::Tensor value_dst_cache_roi(m_value_cache[decoder_layer_id], value_dst_start_roi, value_dst_end_roi); value_src_cache_roi.copy_to(value_dst_cache_roi); } - } + if (decoder_layer_id++ == m_num_decoder_layers - 1) + decoder_layer_id = 0; // reset to 0 if we have more blocks than layers + //} } } } diff --git a/src/cpp/src/continuous_batching/model_runner.hpp b/src/cpp/src/continuous_batching/model_runner.hpp index 1c757da35a..e18f3a2b19 100644 --- a/src/cpp/src/continuous_batching/model_runner.hpp +++ b/src/cpp/src/continuous_batching/model_runner.hpp @@ -35,7 +35,6 @@ class ModelRunner { size_t m_num_decoder_layers; bool m_collect_attention_scores; bool m_is_use_per_layer_cache_control; - bool m_is_use_rotation_inputs; std::vector>> m_rotated_block_logical_indices_per_sequence_for_each_layer; std::vector m_cache_rotation_deltas_for_each_layer; @@ -48,7 +47,12 @@ class ModelRunner { // Input shape: [N, conversation length]. // Output shape: [1, conversation length, hidden_size]. EmbeddingsModel::Ptr m_embedding; - + bool m_is_hidden_state_export_needed = false; // need to export hidden state after inference + bool m_is_hidden_state_import_needed = false; // need to import hidden state from another model runner + bool m_is_hidden_state_internal_needed = false; // need to use internal hidden state, e.g, eagle2 + std::map, std::pair> m_sequence_hidden_state_mapping; // pre-requisite: main/draft have same seq group and running seq grouped id + // a container which use sequence group id and request id as key to store hidden states + std::map m_initial_hidden_states; // shape: [N, seq_len, hidden_size] public: /** * Constructs the ModelRunner. 
@@ -95,6 +99,18 @@ class ModelRunner { return m_request; } + void set_hidden_state_export_needed(bool is_needed) { + m_is_hidden_state_export_needed = is_needed; + } + + void set_hidden_state_import_needed(bool is_needed) { + m_is_hidden_state_import_needed = is_needed; + } + + void set_hidden_state_internal_needed(bool is_needed) { + m_is_hidden_state_internal_needed = is_needed; + } + void set_embedding_model(const EmbeddingsModel::Ptr& embedder) { m_embedding = embedder; } @@ -121,6 +137,47 @@ class ModelRunner { m_cache_rotation_deltas_for_each_layer = std::move(rotation_deltas_for_each_layer); } + ov::Tensor get_hidden_state(size_t request_id, size_t seq_grouped_id) const { + if (m_hidden_states.get_size() == 0) { + return ov::Tensor(); + } + + auto key = std::make_pair(request_id, seq_grouped_id); + auto it = m_sequence_hidden_state_mapping.find(key); + if (it == m_sequence_hidden_state_mapping.end()) { + return ov::Tensor(); + } + + size_t start_idx = it->second.first; + size_t length = it->second.second; + + auto shape = m_hidden_states.get_shape(); + if (shape.size() < 2) { + return ov::Tensor(); + } + + size_t hidden_size = shape[shape.size() - 1]; + + ov::Coordinate start_coord(shape.size(), 0); + ov::Coordinate end_coord(shape.size(), 0); + + start_coord[0] = start_idx; + end_coord[0] = start_idx + length; + + for (size_t i = 1; i < shape.size(); ++i) { + start_coord[i] = 0; + end_coord[i] = shape[i]; + } + + return ov::Tensor(m_hidden_states, start_coord, end_coord); + } + + void set_initial_hidden_state(size_t request_id, const ov::Tensor& hidden_state) { + // m_initial_hidden_states.clear(); + //auto key = std::make_pair(request_id, seq_grouped_id); + m_initial_hidden_states[request_id] = hidden_state; + } + /** * Runs the forward inference call on the underlying LLM's ov::InferRequest, scheduling for inferencing tokens for given sequences * taking into account the supplied scheduler output struct. @@ -129,6 +186,8 @@ class ModelRunner { * @return An ov::Tensor with next-token logit scores for each sequence processed during this `forward` call. 
*/ ov::Tensor forward(const std::vector & sequence_groups, const Scheduler::Output& scheduler_output) { + m_sequence_hidden_state_mapping.clear(); + size_t num_sequence_groups = scheduler_output.m_scheduled_sequence_groups_ids.size(); size_t batch_size_in_sequences = 0; @@ -164,6 +223,29 @@ class ModelRunner { // block_indices are handled in a special fashion below block_indices_begins(ov::element::i32, {batch_size_in_sequences + 1}), max_context_len(ov::element::i32, {}); + ov::Tensor hidden_state_input; + float* hidden_state_data = nullptr; + if (m_is_hidden_state_import_needed || m_is_hidden_state_internal_needed) { + if (hidden_size == 0) { + for (const auto& entry : m_initial_hidden_states) { + const auto& stored_hidden_state = entry.second; + if (stored_hidden_state.get_size() > 0) { + auto shape = stored_hidden_state.get_shape(); + if (shape.size() >= 2) { + hidden_size = shape[shape.size() - 1]; + if (!m_is_hidden_state_import_needed) + hidden_size /= 3; + break; + } + } + } + } + if (hidden_size > 0) { + hidden_state_input = ov::Tensor(ov::element::f32, {total_num_tokens, 1, hidden_size}); + hidden_state_data = hidden_state_input.data(); + std::memset(hidden_state_data, 0, total_num_tokens * hidden_size * sizeof(float)); + } + } ov::Tensor score_aggregation_window(ov::element::i32, {batch_size_in_sequences}); @@ -205,6 +287,7 @@ class ModelRunner { matmul_gathering_is_available = true; } catch (const ov::Exception&) {} + size_t current_token_idx = 0; std::map> seq_id_to_skipped_blocks_map; for (size_t i = 0; i < num_sequence_groups; ++i) { @@ -236,6 +319,75 @@ class ModelRunner { output_seq_len = 0; Sequence::CPtr sequence = running_sequences[seq_idx]; + if (m_is_hidden_state_export_needed) { + size_t start_token_idx = current_token_idx; + size_t sequence_length = num_scheduled_tokens; + + auto key = std::make_pair(sequence_group->get_request_id(), sequence->get_grouped_id()); + m_sequence_hidden_state_mapping[key] = std::make_pair(start_token_idx, sequence_length); + } + if (m_is_hidden_state_import_needed && hidden_state_data && hidden_size > 0) { + //auto key = std::make_pair(sequence_group->get_request_id(), sequence->get_grouped_id()); + auto it = m_initial_hidden_states.find(sequence_group->get_request_id()); + + if (it != m_initial_hidden_states.end()) { + const auto& stored_hidden_state = it->second; + + if (stored_hidden_state.get_size() > 0) { + auto stored_shape = stored_hidden_state.get_shape(); + + if (stored_shape.size() >= 2) { + size_t stored_seq_len = stored_shape[0]; + size_t stored_hidden_size = stored_shape[stored_shape.size() - 1]; + + if (stored_hidden_size == hidden_size) { + if (stored_seq_len == total_num_tokens) { + hidden_state_input = stored_hidden_state; // all tokens from eagle is accepted + } else { + size_t copy_length = std::min(stored_seq_len, num_scheduled_tokens); + + size_t source_start_idx = + stored_seq_len >= copy_length ? 
stored_seq_len - copy_length : 0; + + const float* source_data = stored_hidden_state.data(); + float* target_data = hidden_state_data + current_token_idx * hidden_size; + + for (size_t token_offset = 0; token_offset < copy_length; ++token_offset) { + size_t source_offset = (source_start_idx + token_offset) * hidden_size; + size_t target_offset = token_offset * hidden_size; + + std::copy_n(source_data + source_offset, + hidden_size, + target_data + target_offset); + } + } + } + } + } + } + } else { + // fill hidden_state_data with m_hidden_states + if (hidden_state_data) { + std::memset(hidden_state_data + current_token_idx * hidden_size, + 0, + num_scheduled_tokens * hidden_size * sizeof(float)); + auto hidden_state = running_sequences[seq_idx]->get_hidden_state(); + if (hidden_state.get_size() > 0) { + auto shape = hidden_state.get_shape(); + if (shape.size() >= 2 && shape[shape.size() - 1] == hidden_size) { + size_t seq_len = shape[0]; + size_t copy_length = std::min(seq_len, num_scheduled_tokens); + const float* source_data = hidden_state.data(); + float* target_data = hidden_state_data + current_token_idx * hidden_size; + for (size_t token_offset = 0; token_offset < copy_length; ++token_offset) { + size_t source_offset = (seq_len - token_offset - 1) * hidden_size; + size_t target_offset = token_offset * hidden_size; + std::copy_n(source_data + source_offset, hidden_size, target_data + target_offset); + } + } + } + } + } for (size_t token_id = 0, position_id = group_position_id; token_id < num_scheduled_tokens; ++token_id, ++position_id, ++gathering_current_index) { // compute token for current sequence if (sequence_group_type == SequenceGroupType::TOKENS) { @@ -310,7 +462,7 @@ class ModelRunner { *score_aggregation_window_data = 1; } } - + current_token_idx += num_scheduled_tokens;; position_ids_data += num_scheduled_tokens; past_lens_data += 1; subsequence_begins_data += 1; @@ -329,7 +481,31 @@ class ModelRunner { m_request.set_tensor("token_type_ids", token_type_ids); } } - + if (hidden_state_input && hidden_state_input.get_size() > 0) { + if (m_is_hidden_state_import_needed) { + try { + m_request.set_tensor("target_hidden_state_input", hidden_state_input); + auto shape = hidden_state_input.get_shape(); + shape[-1] = shape [-1]/3; + ov::Tensor fake_tensor = ov::Tensor(hidden_state_input.get_element_type(), shape); + auto fake_data = fake_tensor.data(); + std::memset(fake_data, 0, fake_tensor.get_byte_size()); + m_request.set_tensor("internal_hidden_state_input", fake_tensor); + } catch (const ov::Exception& e) { + } + } else { + try { + m_request.set_tensor("internal_hidden_state_input", hidden_state_input); + auto shape = hidden_state_input.get_shape(); + shape[-1] = shape [-1] * 3; + ov::Tensor fake_tensor = ov::Tensor(hidden_state_input.get_element_type(), shape); + auto fake_data = fake_tensor.data(); + std::memset(fake_data, 0, fake_tensor.get_byte_size()); + m_request.set_tensor("target_hidden_state_input", fake_tensor); + } catch (const ov::Exception& e) { + } + } + } // typical LLM parameters m_request.set_tensor("position_ids", position_ids); @@ -370,8 +546,24 @@ class ModelRunner { if (m_collect_attention_scores) { _collect_attention_scores(sequence_groups, scheduler_output); } - _reset_cache_rotation_coefficients(); + if (m_is_hidden_state_export_needed) { + try { + m_hidden_states = m_request.get_tensor("last_hidden_state"); + for (size_t i = 0; i < num_sequence_groups; ++i) { + size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i]; + 
SequenceGroup::Ptr sequence_group = sequence_groups[seq_group_id]; + std::vector running_sequences = sequence_group->get_running_sequences(); + for (size_t seq_idx = 0; seq_idx < running_sequences.size(); ++seq_idx) { + Sequence::Ptr sequence = running_sequences[seq_idx]; + sequence->update_hidden_state( + get_hidden_state(sequence_group->get_request_id(), sequence->get_grouped_id())); + } + } + } catch (const ov::Exception&) { + m_hidden_states = ov::Tensor(); + } + } // return logits return m_request.get_tensor("logits"); @@ -434,6 +626,8 @@ class ModelRunner { } private: + ov::Tensor m_hidden_states; + // Fills indices for sequences in the order defined by scheduler_output void _fill_indices_from_block_tables( const std::vector& dst_tensor_names, diff --git a/src/cpp/src/continuous_batching/pipeline.cpp b/src/cpp/src/continuous_batching/pipeline.cpp index 41cbc0d07b..d833b82789 100644 --- a/src/cpp/src/continuous_batching/pipeline.cpp +++ b/src/cpp/src/continuous_batching/pipeline.cpp @@ -15,7 +15,7 @@ #include "continuous_batching/timer.hpp" #include "utils.hpp" #include "visual_language/inputs_embedder.hpp" - +#include "safe_tensor_wrapper.hpp" using namespace ov::genai; namespace { @@ -28,7 +28,14 @@ extract_draft_model_from_config(ov::AnyMap& config) { } return draft_model; } - +std::string extact_eagle_mode_from_config(ov::AnyMap& config) { + std::string eagle_mode; + if (config.find(utils::EAGLE_MODE) != config.end()) { + eagle_mode = config.at(utils::EAGLE_MODE).as(); + config.erase(ov::genai::eagle_mode.name()); + } + return eagle_mode; +} bool extract_prompt_lookup_from_config(ov::AnyMap& config) { bool res = false; @@ -60,7 +67,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model); auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties); auto generation_config = utils::from_config_json_if_exists(models_path); - + auto eagle_mode = extact_eagle_mode_from_config(properties_without_draft_model); std::shared_ptr embedder; if (std::filesystem::exists(models_path / "openvino_text_embeddings_model.xml")) { embedder = std::make_shared(models_path, device, vision_encoder_properties); @@ -70,6 +77,10 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive"); OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings"); m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties_without_draft_model_without_gguf, generation_config); + } else if (draft_model_desr.model != nullptr && !eagle_mode.empty()) { + OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings"); + auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config); + m_impl = std::make_shared(main_model_descr, draft_model_desr, eagle_mode); } else if (draft_model_desr.model != nullptr) { OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings"); auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config); @@ -99,7 +110,7 @@ 
ContinuousBatchingPipeline::ContinuousBatchingPipeline( auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model); auto generation_config = utils::from_config_json_if_exists(models_path); - + auto eagle_mode = extact_eagle_mode_from_config(properties_without_draft_model); std::shared_ptr embedder; if (std::filesystem::exists(models_path / "openvino_text_embeddings_model.xml")) { embedder = std::make_shared(models_path, device, properties_without_draft_model_without_gguf); @@ -109,6 +120,15 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive"); OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings"); m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties_without_draft_model_without_gguf, generation_config); + } else if (draft_model_desr.model != nullptr && !eagle_mode.empty()) { + OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings"); + auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config); + m_impl = std::make_shared(main_model_descr, draft_model_desr, eagle_mode); + if (eagle_mode == "EAGLE3") { + // parse d2t from safe tensors + ConstantMap constant_tensors = safetensor_to_constant_map(ov::read_tensor_data(models_path / "eagle3.safetensor")); + std::dynamic_pointer_cast(m_impl)->set_d2t_for_draft_decoding(constant_tensors["d2t"]); + } } else if (draft_model_desr.model != nullptr) { OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings"); auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config); diff --git a/src/cpp/src/continuous_batching/pipeline_impl.cpp b/src/cpp/src/continuous_batching/pipeline_impl.cpp index 1410623160..548f5c6524 100644 --- a/src/cpp/src/continuous_batching/pipeline_impl.cpp +++ b/src/cpp/src/continuous_batching/pipeline_impl.cpp @@ -214,7 +214,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::initialize_pipeline( /* is_aggregate_attention_scores = */ false, is_use_xattention); } - + m_scheduler->set_validation_mode(m_is_validation_mode_enabled); m_sampler = std::make_shared(m_tokenizer, sampler_num_threads); m_sampler->set_seed(m_generation_config.rng_seed); diff --git a/src/cpp/src/continuous_batching/pipeline_impl.hpp b/src/cpp/src/continuous_batching/pipeline_impl.hpp index 9e6cebdd99..5f5210718f 100644 --- a/src/cpp/src/continuous_batching/pipeline_impl.hpp +++ b/src/cpp/src/continuous_batching/pipeline_impl.hpp @@ -130,7 +130,6 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc bool has_non_finished_requests() override; void step() override; - std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, @@ -141,7 +140,6 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc * Updates LoRA adapters for current generation call */ void set_adapters(const std::optional& adapters); - std::vector get_awaiting_requests(); }; } // namespace ov::genai diff --git a/src/cpp/src/continuous_batching/scheduler.hpp b/src/cpp/src/continuous_batching/scheduler.hpp index 3e565a9d5d..01f47ec597 100644 --- 
a/src/cpp/src/continuous_batching/scheduler.hpp +++ b/src/cpp/src/continuous_batching/scheduler.hpp @@ -34,6 +34,7 @@ class Scheduler { std::shared_ptr m_cache_manager; size_t m_snapkv_window_size = 1; + bool m_validation_mode_enabled = false; public: struct Output { // IDs of scheduled groups @@ -75,6 +76,10 @@ class Scheduler { m_block_manager.reset(); } + void set_validation_mode(bool is_validation_mode_enabled) { + m_validation_mode_enabled = is_validation_mode_enabled; + } + Output schedule(std::vector& sequence_groups) { Output scheduler_output; // map of src -> dst blocks copies, which need to be performed by CacheManager @@ -354,7 +359,7 @@ class Scheduler { // Question: do we need to schedule preeempted first as it's done in vLLM? // Answer: preempted sequences have low priority, so they should be after "running" ones. So, here we // keep latencies for sequence groups of high priority - if (sequence_group->can_generate_tokens() && !sequence_group->is_waiting() && !sequence_group->handle_stopped() && !sequence_group->handle_cancelled()) { + if (sequence_group->can_generate_tokens() && (!sequence_group->is_waiting() || sequence_group->is_caching()) && !sequence_group->handle_stopped() && !sequence_group->handle_cancelled()) { OPENVINO_ASSERT(!sequence_group->has_finished()); size_t num_running_seqs = sequence_group->num_running_seqs(); OPENVINO_ASSERT(num_running_seqs); @@ -368,8 +373,11 @@ class Scheduler { // Note: current function can return more than 1 token even for generation phase in case of some tokens // of current sequence group were evicted before size_t num_available_tokens_per_seq = sequence_group->get_num_available_tokens_for_batching(); - + // if validation mode, we want to make sure the batch validation is done in one go + if (m_validation_mode_enabled && available_tokens_per_seq_in_megabatch < num_available_tokens_per_seq) + break; size_t num_scheduled_tokens_per_seq = std::min(available_tokens_per_seq_in_megabatch, num_available_tokens_per_seq); + sequence_group->schedule_tokens(num_scheduled_tokens_per_seq); while (!m_block_manager->can_append_slots(sequence_group)){ @@ -389,6 +397,9 @@ class Scheduler { // allocate new slots std::map> copy_blocks_map = m_block_manager->append_slots(sequence_group); + // extra steps for validation mode to make sure we have enough blocks to hold all the scheduled tokens + if (sequence_group->get_num_tokens_to_validate()) + m_block_manager->allocate_slots_for_validation(sequence_group); // add information to scheduler_output { auto request_id = sequence_group->get_request_id(); diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index db7d621ed5..e677c06d90 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -235,17 +235,19 @@ size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const { return max_length - prompt_length; } } - +bool GenerationConfig::is_eagle_tree() const { + return eagle_tree_params.tree_depth > 0; +} bool GenerationConfig::is_greedy_decoding() const { - return !do_sample && !is_beam_search(); + return !do_sample && !is_beam_search() && !is_eagle_tree(); } bool GenerationConfig::is_beam_search() const { - return num_beams > 1; + return num_beams > 1 && !is_eagle_tree(); } bool GenerationConfig::is_multinomial() const { - return do_sample; + return do_sample && !is_eagle_tree(); } bool GenerationConfig::is_speculative_decoding() const { @@ -338,7 +340,7 @@ void GenerationConfig::validate() const { // assistant generation if 
(is_assisting_generation()) { - OPENVINO_ASSERT(!is_beam_search() && num_return_sequences == 1, "Beam search and parallel sampling are not compatible with assistant generation"); + //OPENVINO_ASSERT(!is_beam_search() && num_return_sequences == 1, "Beam search and parallel sampling are not compatible with assistant generation"); OPENVINO_ASSERT(assistant_confidence_threshold == 0.0f || num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`"); } diff --git a/src/cpp/src/llm/pipeline.cpp b/src/cpp/src/llm/pipeline.cpp index 76d1fe24dc..3ec2944397 100644 --- a/src/cpp/src/llm/pipeline.cpp +++ b/src/cpp/src/llm/pipeline.cpp @@ -47,19 +47,6 @@ std::pair draft_model( return { utils::DRAFT_MODEL_ARG_NAME, Any::make(model, tokenizer, device, plugin_config, scheduler_config, generation_config) }; } -std::pair draft_model( - std::string& model_str, - ov::Tensor& weights_tensor, - const ov::genai::Tokenizer& tokenizer, - const std::string& device, - const ov::AnyMap& properties, - const ov::genai::GenerationConfig& generation_config) { - auto [plugin_config, scheduler_config] = utils::extract_scheduler_config(properties); - - auto model = utils::singleton_core().read_model(model_str, weights_tensor); - return { utils::DRAFT_MODEL_ARG_NAME, Any::make(model, tokenizer, device, plugin_config, scheduler_config, generation_config) }; -} - // Public LLMPipeline ov::genai::LLMPipeline::LLMPipeline( diff --git a/src/cpp/src/lora/adapter.cpp b/src/cpp/src/lora/adapter.cpp index 2b186f3fad..1225912d46 100644 --- a/src/cpp/src/lora/adapter.cpp +++ b/src/cpp/src/lora/adapter.cpp @@ -40,13 +40,10 @@ #include "openvino/genai/lora_adapter.hpp" #include "utils.hpp" +#include "safe_tensor_wrapper.hpp" #include "lora/common.hpp" #include "lora/names_mapping.hpp" -extern "C" { - #include "safetensors.h" -} - // FIXME: Remove or move to a dedicated common header #ifdef NDEBUG #define DEBUG_PRINT(X) do {} while(false) @@ -69,65 +66,6 @@ using ConstantVector = std::vector>; using LoRANode = LoRAParts>; using LoRAPartsParser = LoRAParts(const std::string& name)>>; -// Converts Safetensors element type to OV element type. Only part of the types are supported. -ov::element::Type safetensors_to_ov_element_type (int dtype) { - switch(dtype) { - case SAFETENSORS_F32: - return ov::element::f32; - case SAFETENSORS_F16: - return ov::element::f16; - case SAFETENSORS_BF16: - return ov::element::bf16; - default: - OPENVINO_THROW("Not supported safetensors dtype: ", dtype); - } -} - -using ConstantMap = std::map>; - -// Safetensor file parser that deallocates temporary buffers automatically. -// Drop-in replacement for the third party safetensors_File struct. -struct AutoSafetensor: public safetensors_File { - ~AutoSafetensor () { - std::free(tensors); - std::free(metadata); - } -}; - -// The key in the map is a tensor name and the Constant uses a region of memory from the memory block. -// Each Constant holds a shared pointer to the block in the runtime info. -// The memory block will be deallocated when the last Constant is destroyed. -ConstantMap safetensor_to_constant_map(const ov::Tensor& safetensor) { - AutoSafetensor safe_tensors_file{}; - - OPENVINO_ASSERT(safetensors_file_init(safetensor.data(), safetensor.get_byte_size(), &safe_tensors_file) == nullptr, - "Cannot parse safetensor as a Safetensors file format. 
Safetensors file format is supported only" - ); - - ConstantMap tensors; - for (int i = 0; i < safe_tensors_file.num_tensors; i++) { - safetensors_TensorDescriptor tensor = safe_tensors_file.tensors[i]; - std::string name(tensor.name.ptr, tensor.name.ptr + tensor.name.len); - ov::Shape shape(tensor.shape, tensor.shape + tensor.n_dimensions); - void* ptr = tensor.ptr; // FIXME: needs a non-constant pointer because Tensor doesn't accept a constant pointer - - auto type = safetensors_to_ov_element_type(tensor.dtype); - auto constant = - std::make_shared(type, shape, ptr, nullptr); // wraps existing memory, no ownership - constant->get_rt_info()["__safetensors_buffer_holder"] = safetensor; // to automatically deallocate underlying memory buffer when last constant that holds it is destroyed - tensors[name] = constant; - } - return tensors; -} - -// Reads a file with a given filename expecting Safetensors file format. -// The file data is mmaped to tensor. -ConstantMap read_safetensors(const std::filesystem::path& filename) { - auto safetensor = ov::read_tensor_data(filename); - - return safetensor_to_constant_map(safetensor); -} - // Default LoRA tensor name patterns observed in the existing LoRA adapters, captures the prefix that should correspond // to a layer name in the base model LoRAPartsParser default_lora_patterns () { diff --git a/src/cpp/src/safe_tensor_wrapper.cpp b/src/cpp/src/safe_tensor_wrapper.cpp new file mode 100644 index 0000000000..2cf69d3283 --- /dev/null +++ b/src/cpp/src/safe_tensor_wrapper.cpp @@ -0,0 +1,47 @@ +#include "safe_tensor_wrapper.hpp" + +ov::element::Type safetensors_to_ov_element_type (int dtype) { + switch(dtype) { + case SAFETENSORS_F32: + return ov::element::f32; + case SAFETENSORS_F16: + return ov::element::f16; + case SAFETENSORS_BF16: + return ov::element::bf16; + case SAFETENSORS_I64: + return ov::element::i64; + case SAFETENSORS_BOOL: + return ov::element::boolean; + default: + OPENVINO_THROW("Not supported safetensors dtype: ", dtype); + } +} + +ConstantMap safetensor_to_constant_map(const ov::Tensor& safetensor) { + AutoSafetensor safe_tensors_file{}; + + OPENVINO_ASSERT(safetensors_file_init(safetensor.data(), safetensor.get_byte_size(), &safe_tensors_file) == nullptr, + "Cannot parse safetensor as a Safetensors file format. 
Safetensors file format is supported only" + ); + + ConstantMap tensors; + for (int i = 0; i < safe_tensors_file.num_tensors; i++) { + safetensors_TensorDescriptor tensor = safe_tensors_file.tensors[i]; + std::string name(tensor.name.ptr, tensor.name.ptr + tensor.name.len); + ov::Shape shape(tensor.shape, tensor.shape + tensor.n_dimensions); + void* ptr = tensor.ptr; // FIXME: needs a non-constant pointer because Tensor doesn't accept a constant pointer + + auto type = safetensors_to_ov_element_type(tensor.dtype); + auto constant = + std::make_shared(type, shape, ptr, nullptr); // wraps existing memory, no ownership + constant->get_rt_info()["__safetensors_buffer_holder"] = safetensor; // to automatically deallocate underlying memory buffer when last constant that holds it is destroyed + tensors[name] = constant; + } + return tensors; +} + +ConstantMap read_safetensors(const std::filesystem::path& filename) { + auto safetensor = ov::read_tensor_data(filename); + + return safetensor_to_constant_map(safetensor); +} diff --git a/src/cpp/src/safe_tensor_wrapper.hpp b/src/cpp/src/safe_tensor_wrapper.hpp new file mode 100644 index 0000000000..9120bc93fc --- /dev/null +++ b/src/cpp/src/safe_tensor_wrapper.hpp @@ -0,0 +1,30 @@ + +#include "openvino/runtime/core.hpp" +#include "openvino/op/constant.hpp" +extern "C" { + #include "safetensors.h" +} + +using namespace ov::op; +// Converts Safetensors element type to OV element type. Only part of the types are supported. +ov::element::Type safetensors_to_ov_element_type (int dtype); + +using ConstantMap = std::map>; + +// Safetensor file parser that deallocates temporary buffers automatically. +// Drop-in replacement for the third party safetensors_File struct. +struct AutoSafetensor: public safetensors_File { + ~AutoSafetensor () { + std::free(tensors); + std::free(metadata); + } +}; + +// The key in the map is a tensor name and the Constant uses a region of memory from the memory block. +// Each Constant holds a shared pointer to the block in the runtime info. +// The memory block will be deallocated when the last Constant is destroyed. +ConstantMap safetensor_to_constant_map(const ov::Tensor& safetensor); + +// Reads a file with a given filename expecting Safetensors file format. +// The file data is mmaped to tensor. 
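+// Example usage (illustrative; mirrors how pipeline.cpp extracts the EAGLE3 "d2t" table
+// for the draft decoder, the file name is just a placeholder):
+//   ConstantMap tensors = read_safetensors("eagle3.safetensor");
+//   auto d2t = tensors.at("d2t");  // shared_ptr to ov::op::v0::Constant wrapping the mmaped data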
+ConstantMap read_safetensors(const std::filesystem::path& filename); \ No newline at end of file diff --git a/src/cpp/src/lora/safetensors.c b/src/cpp/src/safetensors.c similarity index 100% rename from src/cpp/src/lora/safetensors.c rename to src/cpp/src/safetensors.c diff --git a/src/cpp/src/sampling/sampler.cpp b/src/cpp/src/sampling/sampler.cpp index f34a8e251f..2ff59c024d 100644 --- a/src/cpp/src/sampling/sampler.cpp +++ b/src/cpp/src/sampling/sampler.cpp @@ -493,18 +493,6 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, } } -Logits Sampler::_get_logit_vector(ov::Tensor logits, size_t batch_idx, size_t token_idx) { - ov::Shape logits_shape = logits.get_shape(); - size_t batch_size = logits_shape[0], seq_len = logits_shape[1], vocab_size = logits_shape[2]; - OPENVINO_ASSERT(batch_idx <= batch_size); - OPENVINO_ASSERT(token_idx < seq_len); - size_t batch_offset = batch_idx * seq_len * vocab_size; - size_t sequence_offset = (seq_len - token_idx - 1) * vocab_size; - float* logits_data = logits.data() + batch_offset + sequence_offset; - - return Logits{logits_data, vocab_size}; -} - Token Sampler::_greedy_sample(const Logits& logits, size_t top_logprobs) const { // For greedy sampling we do not expect sorting or shrinking considered tokens // so we can operate directly on the data buffer @@ -682,6 +670,880 @@ align_all_sequence_len(SequenceGroup::Ptr& sequence_group, logit_processor.update_generated_len(min_generated_tokens); } +void pad_sequence_lengths(SequenceGroup::Ptr& sequence_group) { + auto running_sequences = sequence_group->get_running_sequences(); + if (running_sequences.empty()) { + return; + } + + size_t max_length = 0; + for (const auto& seq : running_sequences) { + max_length = std::max(max_length, seq->get_generated_ids().size()); + } + + for (auto& seq : running_sequences) { + while (seq->get_generated_ids().size() < max_length) { + seq->append_token(-1, 0.0f); + } + } +} + +void adjust_sequence_to_match_path(Sequence::Ptr sequence, + const std::vector& target_path, + size_t common_prefix_len) { + /*const auto& current_generated_ids = sequence->get_generated_ids(); + + if (current_generated_ids.size() > target_path.size()) { + size_t tokens_to_remove = current_generated_ids.size() - target_path.size(); + sequence->remove_last_tokens(tokens_to_remove); + } + + if (target_path.size() > current_generated_ids.size()) { + for (size_t i = current_generated_ids.size(); i < target_path.size(); ++i) { + sequence->append_token(target_path[i], 0.0f); + } + } + + for (size_t i = common_prefix_len; i < target_path.size(); ++i) { + if (i < current_generated_ids.size()) { + sequence->update_generated_token(i, target_path[i]); + } else { + sequence->append_token(target_path[i], 0.0f); + } + }*/ +} +Token Sampler::_greedy_sample_with_batch_idx(ov::Tensor logits, size_t batch_idx, size_t token_idx) { + auto logit = _get_logit_vector(logits, batch_idx, token_idx); + return _greedy_sample(logit, 0); +} + +Eagle2ValidationResult Sampler::validate_eagle2_tree(const std::vector>& candidate_paths, + const std::vector>& candidate_log_probs, + const std::vector beam_id, + const ov::Tensor& main_model_logits, + LogitProcessor& logit_processor, + bool do_sample) { + Eagle2ValidationResult result; + + if (candidate_paths.empty()) { + return result; + } + auto logit_shape = main_model_logits.get_shape(); +#if 1 // optimize branch + std::map main_results; + size_t seq_idx = 0; + for (size_t b = 0; b < logit_shape[0]; b++) { + main_results[b] = 0; + } + + size_t max_count 
= 0; + size_t max_row = 0; + for (size_t s = logit_shape[1] -1; s > 0; --s) { + for (auto iter = main_results.begin(); iter != main_results.end();) { + size_t b = iter->first; + auto sample_token = _greedy_sample_with_batch_idx(main_model_logits, b, s); + if (sample_token.m_index != candidate_paths[b][logit_shape[1] - s - 1]) { + iter = main_results.erase(iter); + } else { + iter->second++; + iter++; + } + } + if (!main_results.empty()) { + max_row = main_results.begin()->first; + max_count = main_results.begin()->second; + } else { + break; + } + } + + result.accepted_path_length = max_count; + auto num_tokens_to_process = candidate_paths[0].size(); + result.extra_sampled_token = _greedy_sample_with_batch_idx(main_model_logits, beam_id[max_row], (num_tokens_to_process - max_count)); + result.accepted_path_id = beam_id[max_row]; + result.is_path_accepted = (result.accepted_path_length > 0); +#else + // find the tokens with maximum probability in the main model logits + std::vector> main_results(logit_shape[0], std::vector(0)); + for (size_t b = 0; b < logit_shape[0]; ++b) { + for (size_t s = logit_shape[1] - 1; s > 0; --s) { + main_results[b].push_back(_greedy_sample_with_batch_idx(main_model_logits, b, s).m_index); + } + } + auto create_validate_mask = [] ( + const std::vector>& a, + const std::vector>& b) { + + std::vector> mask(a.size()); + + for (size_t i = 0; i < a.size(); ++i) { + if (a[i].size() != b[i].size()) { + throw std::invalid_argument("Rows must have the same number of columns"); + } + + mask[i].resize(a[i].size()); + + for (size_t j = 0; j < a[i].size(); ++j) { + mask[i][j] = (a[i][j] == b[i][j]) ? 1 : 0; + } + } + + return mask; + }; + size_t max_count = 0; + size_t max_row = 0; + auto mask = create_validate_mask(candidate_paths, main_results); + for (size_t i = 0; i < mask.size(); ++i) { + size_t count = 0; + for (bool value : mask[i]) { + if (value) { + count++; + } else { + break; + } + } + if (count > max_count) { + max_count = count; // number of max accepted + max_row = i; // best beam id + } + } + result.accepted_path_length = max_count; + auto num_tokens_to_process = candidate_paths[0].size(); + result.extra_sampled_token = _greedy_sample_with_batch_idx(main_model_logits, beam_id[max_row], (num_tokens_to_process - max_count)); + result.accepted_path_id = beam_id[max_row]; + result.is_path_accepted = (result.accepted_path_length > 0); +#endif + return result; + // Find the longest common prefix among all candidate paths + /*size_t max_common_prefix = 0; + if (candidate_paths.size() > 1) { + max_common_prefix = find_common_prefix_length(candidate_paths); // the first one is generated from main + } else { + max_common_prefix = candidate_paths[0].size(); + } + auto num_tokens_to_process = candidate_paths[0].size(); + Logits sample_p(nullptr, 0); + // Validate tokens position by position + for (size_t pos = 0; pos < max_common_prefix; ++pos) { + // All paths should have the same token at this position due to common prefix + int64_t candidate_token = candidate_paths[0][pos]; + float candidate_log_prob = candidate_log_probs[0][pos]; + + // Get main model logits for this position + auto logit_vector = _get_logit_vector(main_model_logits, 0, num_tokens_to_process - pos); // since this token is common in all candidates, sample from first beam + logit_processor.apply(logit_vector); + sample_p = logit_vector; // save the logits for further sampling + // Find the candidate token in main model distribution + auto token_prob_pair = find_token_probability(logit_vector, 
candidate_token); + if (!token_prob_pair.first) { + // Token not found in main model distribution + result.rejected_at_position = pos; + break; + } + + float main_model_log_prob = token_prob_pair.second; + + // Apply acceptance criteria + bool is_accepted = false; + if (do_sample) { + // Speculative sampling: probability ratio test + float main_model_prob = std::exp(main_model_log_prob); + float draft_model_prob = 1.0; //std::exp(candidate_log_prob); + float probability_ratio = std::min(1.0f, main_model_prob / draft_model_prob); + + auto dist = std::uniform_real_distribution(0.0f, 1.0f); + float r = dist(rng_engine); + is_accepted = r <= probability_ratio; + } else { + // Greedy validation: exact token match with highest probability + auto highest_prob_token = get_highest_probability_token(logit_vector); + is_accepted = (candidate_token == highest_prob_token.m_index); + } + + if (is_accepted) { + result.accepted_tokens.push_back(candidate_token); + result.updated_log_probs.push_back(main_model_log_prob); + result.accepted_path_length++; + } else { + result.rejected_at_position = pos; + break; + } + } + size_t best_path_idx = 0; + // If we validated the common prefix successfully, try to extend with one of the paths + if (result.accepted_path_length == max_common_prefix && candidate_paths.size() > 1) { + // Select the best path to continue validation beyond common prefix + // at this point, we either have sample_p for common prefix of beam 0 or none + best_path_idx = select_best_continuation_path(candidate_paths, + candidate_log_probs, + beam_id, + max_common_prefix, + sample_p, + main_model_logits, + logit_processor); + + // Continue validation on the selected path + const auto& selected_path = candidate_paths[best_path_idx]; + const auto& selected_log_probs = candidate_log_probs[best_path_idx]; + + for (size_t pos = max_common_prefix; pos < selected_path.size(); ++pos) { + int64_t candidate_token = selected_path[pos]; + float candidate_log_prob = selected_log_probs[pos]; + + auto logit_vector = _get_logit_vector(main_model_logits, beam_id[best_path_idx], num_tokens_to_process - pos); + logit_processor.apply(logit_vector); + auto token_prob_pair = find_token_probability(logit_vector, candidate_token); + if (!token_prob_pair.first) { + result.rejected_at_position = pos; + sample_p = logit_vector; + break; + } + + float main_model_log_prob = token_prob_pair.second; + + bool is_accepted = false; + if (do_sample) { + float main_model_prob = std::exp(main_model_log_prob); + float draft_model_prob = 1.0; //std::exp(candidate_log_prob); + float probability_ratio = std::min(1.0f, main_model_prob / draft_model_prob); + + auto dist = std::uniform_real_distribution(0.0f, 1.0f); + float r = dist(rng_engine); + is_accepted = r <= probability_ratio; + } else { + auto highest_prob_token = get_highest_probability_token(logit_vector); + is_accepted = (candidate_token == highest_prob_token.m_index); + } + + if (is_accepted) { + result.accepted_path_id = beam_id[best_path_idx]; + result.accepted_tokens.push_back(candidate_token); + result.updated_log_probs.push_back(main_model_log_prob); + result.accepted_path_length++; + } else { + result.rejected_at_position = pos; + sample_p = logit_vector; + // reference code update gtp of candidate token to 0? 
+ break; + } + } + } + + // Generate one additional token if no rejection occurred + if (result.accepted_path_length == num_tokens_to_process) { + size_t next_pos = result.accepted_path_length; + auto logit_vector = _get_logit_vector(main_model_logits, beam_id[best_path_idx], 0); + logit_processor.apply(logit_vector); + + if (do_sample) { + auto sampled_tokens = _multinomial_sample(logit_vector, 1); + result.extra_sampled_token = sampled_tokens[0]; + } else { + result.extra_sampled_token = _greedy_sample(logit_vector, 0); + } + } else if (sample_p.m_data) { + if (do_sample) { + auto sampled_tokens = _multinomial_sample(sample_p, 1); + result.extra_sampled_token = sampled_tokens[0]; + } else { + result.extra_sampled_token = _greedy_sample(sample_p, 0); + } + } else { + OPENVINO_THROW("should not reach here"); + } + result.accepted_path_id = beam_id[best_path_idx]; + result.is_path_accepted = (result.accepted_path_length > 0); + return result;*/ +} + +// Helper function to find common prefix length among multiple paths +size_t Sampler::find_common_prefix_length(const std::vector>& paths) { + if (paths.empty()) + return 0; + + size_t min_length = paths[0].size(); + for (const auto& path : paths) { + min_length = std::min(min_length, path.size()); + } + + size_t common_length = 0; + for (size_t pos = 0; pos < min_length; ++pos) { + int64_t first_token = paths[0][pos]; + bool all_match = true; + + for (size_t path_idx = 1; path_idx < paths.size(); ++path_idx) { + if (paths[path_idx][pos] != first_token) { + all_match = false; + break; + } + } + + if (all_match) { + common_length++; + } else { + break; + } + } + + return common_length; +} + +// Helper function to find token probability in logit distribution +std::pair Sampler::find_token_probability(const Logits& logits, int64_t token_id) { + if (logits.is_vector_initialized()) { + for (const auto& token : logits.m_vector) { + if (token.m_index == token_id) { + return {true, token.m_log_prob}; + } + } + } else { + if (token_id >= 0 && token_id < static_cast(logits.m_size)) { + // Apply log softmax to get proper log probability + float max_logit = *std::max_element(logits.m_data, logits.m_data + logits.m_size); + float log_sum = std::log(std::accumulate(logits.m_data, + logits.m_data + logits.m_size, + 0.0f, + [max_logit](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_logit); + })); + float log_prob = logits.m_data[token_id] - max_logit - log_sum; + return {true, log_prob}; + } + } + return {false, 0.0f}; +} + +// Helper function to get highest probability token +Token Sampler::get_highest_probability_token(const Logits& logits) { + if (logits.is_vector_initialized()) { + auto max_it = + std::max_element(logits.m_vector.begin(), logits.m_vector.end(), [](const Token& a, const Token& b) { + return a.m_log_prob < b.m_log_prob; + }); + return *max_it; + } else { + auto max_it = std::max_element(logits.m_data, logits.m_data + logits.m_size); + size_t max_idx = std::distance(logits.m_data, max_it); + + // Apply log softmax + float max_logit = *max_it; + float log_sum = std::log(std::accumulate(logits.m_data, + logits.m_data + logits.m_size, + 0.0f, + [max_logit](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_logit); + })); + float log_prob = max_logit - max_logit - log_sum; + + return Token(log_prob, max_idx); + } +} + +// Helper function to select best continuation path beyond common prefix +size_t Sampler::select_best_continuation_path(const std::vector>& candidate_paths, + const 
std::vector>& candidate_log_probs, + const std::vector& beam_id, + const size_t& common_prefix_length, + Logits& logits, + const ov::Tensor& main_model_logits, + LogitProcessor& logit_processor) { + if (candidate_paths.size() <= 1) + return 0; + + float best_score = -std::numeric_limits::infinity(); + size_t best_path_idx = 0; + auto num_tokens_to_process = candidate_paths[0].size(); + for (size_t path_idx = 0; path_idx < candidate_paths.size(); ++path_idx) { + const auto& path = candidate_paths[path_idx]; + const auto& log_probs = candidate_log_probs[path_idx]; + + if (path.size() <= common_prefix_length) + continue; + + // Score based on next token probability from main model + int64_t next_token = path[common_prefix_length]; + Logits logit_vector(nullptr, 0); + if (beam_id[path_idx] == 0 && logits.m_data) { + logit_vector = logits; // use logits from main model for beam 0 + } else { + logit_vector = _get_logit_vector(main_model_logits, beam_id[path_idx] , num_tokens_to_process - common_prefix_length); + logit_processor.apply(logit_vector); + } + + auto token_prob_pair = find_token_probability(logit_vector, next_token); + if (token_prob_pair.first) { + float score = token_prob_pair.second; // Use main model log probability as score + if (score > best_score) { + best_score = score; + best_path_idx = path_idx; + } + } + } + + return best_path_idx; +} +// Enhanced validation function for EAGLE2 that integrates with existing sampler +int Sampler::validate_eagle2_candidates(SequenceGroup::Ptr seq_group, + const ov::Tensor& main_model_logits, + LogitProcessor& logit_processor, + size_t& generated_tokens_count, + size_t& max_removed_tokens, + size_t& num_tokens_to_process, + bool do_sample) { + std::vector> candidate_tokens; + std::vector> candidate_log_probs; + std::vector beam_idxs; + for (auto& running_sequence : seq_group->get_running_sequences()) { + auto generated_ids = running_sequence->get_generated_ids(); + size_t start_idx = generated_ids.size() > num_tokens_to_process ? generated_ids.size() - num_tokens_to_process : 0; + // Extract the tokens to validate + std::vector tokens_to_validate(generated_ids.begin() + start_idx, generated_ids.end()); + std::vector log_probs_to_validate(running_sequence->get_generated_log_probs().begin() + start_idx, + running_sequence->get_generated_log_probs().end()); + candidate_tokens.push_back(tokens_to_validate); + candidate_log_probs.push_back(log_probs_to_validate); + beam_idxs.push_back([&] (uint64_t seq_id) -> size_t { + std::vector running_seqs = seq_group->get_running_sequences(); + for (size_t seq_global_index = 0; seq_global_index < running_seqs.size(); ++seq_global_index) { + if (seq_id == running_seqs[seq_global_index]->get_id()) + return seq_global_index; + } + OPENVINO_THROW("should not be here"); + } (running_sequence->get_id())); + } + // std::cout << "candidate_tokens number : " << candidate_tokens.size() << "candidate token length: " << candidate_tokens[0].size() <get_request_id() << " accepted: " << validation_result.accepted_path_length << std::endl; + + if (!validation_result.is_path_accepted) { + // return false; + // nothing passed validation, to be further handled, shall we stop the draft pipeline? 
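+        // NOTE: execution currently falls through here even when nothing is accepted:
+        // accepted_path_id defaults to the first beam, all of its speculated tokens are removed below,
+        // and only the bonus token sampled from the main model is kept - the usual speculative-decoding
+        // fallback when every draft token is rejected.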
+    }
+
+    auto selected_sequence = seq_group->get_running_sequences()[validation_result.accepted_path_id];
+    // Update the accepted sequence with the validated tokens.
+
+    // Remove any existing generated tokens that weren't accepted
+    size_t current_generated_len = selected_sequence->get_generated_len();
+    auto num_tokens_to_validate_org = seq_group->get_num_tokens_to_validate();
+    auto start_idx = current_generated_len > num_tokens_to_process ? current_generated_len - num_tokens_to_process : 0;
+    const auto generated_token_ids = selected_sequence->get_generated_ids();
+    if (num_tokens_to_process > validation_result.accepted_path_length) {
+        size_t tokens_to_remove = num_tokens_to_process - validation_result.accepted_path_length;
+        selected_sequence->remove_last_tokens(tokens_to_remove);
+        //logit_processor.update_generated_len(current_generated_len - tokens_to_remove);
+        /*for (size_t i = validation_result.accepted_path_length; i < current_generated_len; ++i) {
+            logit_processor.decrease_generated_token_occurance(generated_token_ids[i]);
+        }*/
+        max_removed_tokens = std::max(max_removed_tokens, tokens_to_remove);
+    } else if (num_tokens_to_process < num_tokens_to_validate_org) {
+        // when validation scheduling is limited by a lack of KV cache blocks or by the max batch size
+        size_t tokens_to_remove = num_tokens_to_validate_org - validation_result.accepted_path_length;
+        selected_sequence->remove_last_tokens(tokens_to_remove);
+        // record max_removed_tokens counting only the tokens removed during the validation stage
+        max_removed_tokens = std::max(max_removed_tokens, num_tokens_to_process - validation_result.accepted_path_length);
+    }
+    // Add the bonus token with updated probabilities
+    selected_sequence->append_token(validation_result.extra_sampled_token.m_index, validation_result.extra_sampled_token.m_log_prob);
+    logit_processor.register_new_generated_token(validation_result.extra_sampled_token.m_index);
+    generated_tokens_count++;
+    return validation_result.accepted_path_id;
+}
+
+// The selector must be cleared once its request is finished
+void Sampler::clear_top_k_selector(uint64_t request_id) {
+    auto it = m_top_k_selector_info.find(request_id);
+    if (it != m_top_k_selector_info.end()) {
+        m_top_k_selector_info.erase(it);
+    }
+}
+
+Sampler::TopKSelector::TopKSelector(SequenceGroup::Ptr sequence_group, ov::Tensor d2t)
+    : m_sequence_group(sequence_group),
+      m_parameters{m_sequence_group->get_sampling_parameters()},
+      m_d2t(d2t ? 
d2t.data() : nullptr) { + OPENVINO_ASSERT(m_sequence_group->num_running_seqs() == 1); // for eagle, support 1 running seq at the very beginning + tree_reset(m_sequence_group); +} + +void Sampler::TopKSelector::tree_reset(SequenceGroup::Ptr& sequence_group) { + m_beams.reserve(m_parameters.eagle_tree_params.branching_factor); + Beam root_beam((*m_sequence_group)[0]); + root_beam.m_score = 0.0f; + m_eagle2_candidate_graph = std::make_shared(root_beam, + m_parameters.eagle_tree_params.total_tokens - 1, + m_parameters.eagle_tree_params.tree_depth); + m_beams.push_back(root_beam); + +} + +void Sampler::TopKSelector::finalize_eagle2_candidates(SamplerOutput& sampler_output) { + auto final_candidates = + m_eagle2_candidate_graph->get_top_k_candidates(); // currently draft model output wrong candidates + auto leaf_nodes = m_eagle2_candidate_graph->get_leaf_nodes_from_candidates(final_candidates); + std::vector> retrieve_indices; + retrieve_indices.reserve(leaf_nodes.size()); + std::vector child_beams; + std::map parent_2_num_childs_map; + for (const Beam& leaf : leaf_nodes) { + if (leaf.m_tree_layer == m_parameters.eagle_tree_params.tree_depth + 1) { + parent_2_num_childs_map[leaf.m_sequence->get_id()] += 1; + child_beams.push_back(leaf); // update fork info when needed, at this point, the first and last layer are ready + } else { + // Get the path from root to this leaf + std::vector path = m_eagle2_candidate_graph->get_path_to_node(leaf.m_node_id); + retrieve_indices.push_back(path); + } + } + for (Beam& child_beam : child_beams) { + uint64_t parent_sequence_id = child_beam.m_sequence->get_id(); + uint64_t& num_childs = parent_2_num_childs_map[parent_sequence_id]; + + // if current beam is forked multiple times + if (num_childs > 1) { + child_beam.m_sequence = m_sequence_group->fork_sequence(child_beam.m_sequence); + child_beam.m_sequence->set_status(SequenceStatus::RUNNING); + child_beam.m_sequence->append_token(child_beam.m_token_id, child_beam.m_log_prob); + + // reduce forks count, since fork already happened and next loop iteration + // will go by the second branch (num_childs == 1) + --num_childs; + + // fill out sampler output + sampler_output.m_forked_sequences[parent_sequence_id].push_back(child_beam.m_sequence->get_id()); + } else if (num_childs == 1) { + // keep current sequence going and add a new token + child_beam.m_sequence->set_status(SequenceStatus::RUNNING); + child_beam.m_sequence->append_token(child_beam.m_token_id, child_beam.m_log_prob); + } + } + + // now we have all leaf nodes and their paths, we can update sequences + // search for existing sequences in sequence group + auto all_sequences = m_sequence_group->get_caching_sequences(); + std::vector available_sequences = all_sequences; + std::vector used_sequences; + + std::set used_sequence_ids; + std::vector> remaining_retrieve_indices; + auto max_num_generated_token = m_sequence_group->get_num_processed_tokens() - m_sequence_group->get_prompt_len() + 1; + for (size_t i = 0; i < retrieve_indices.size(); ++i) { + const std::vector& path = retrieve_indices[i]; + bool found_exact_match = false; + + for (auto it = available_sequences.begin(); it != available_sequences.end(); ++it) { + Sequence::Ptr seq = *it; + + if (used_sequence_ids.count(seq->get_id())) { + continue; + } + const auto& generated_ids = seq->get_generated_ids(); + if (generated_ids.size() < path.size()) { + continue; // cannot match if generated ids are shorter than path + } + auto adjust_len = max_num_generated_token - seq->get_generated_len(); + int 
start_idx = generated_ids.size() - (m_parameters.eagle_tree_params.tree_depth) + adjust_len; + if (start_idx < 0) { + start_idx = 0; // ensure we do not go out of bounds + } + auto iter = std::search(generated_ids.begin() + start_idx, generated_ids.end(), path.begin(), path.end()); + if (iter != generated_ids.end()) { + size_t tokens_to_remove = std::distance(iter, generated_ids.end()) - path.size(); + seq->remove_last_tokens(tokens_to_remove); + seq->set_status(SequenceStatus::RUNNING); + used_sequences.push_back(seq); + used_sequence_ids.insert(seq->get_id()); + + available_sequences.erase(it); + found_exact_match = true; + break; + } + } + + if (!found_exact_match) { + remaining_retrieve_indices.push_back(path); + } + } + + std::map child_2_parent_map;; + for (int i = 0; i < remaining_retrieve_indices.size(); ++i) { + const auto& path = remaining_retrieve_indices[i]; + // find best matching sequence in the remaining caching sequences + Sequence::Ptr best_match = nullptr; + size_t max_common_length = 0; + size_t best_match_start_pos = 0; + for (auto& seq : available_sequences) { + const auto& generated_ids = seq->get_generated_ids(); + if (generated_ids.empty()) { + continue; + } + auto adjust_len = max_num_generated_token - seq->get_generated_len(); + int start_idx = generated_ids.size() - (m_parameters.eagle_tree_params.tree_depth) + adjust_len; + if (start_idx < 0) { + start_idx = 0; // ensure we do not go out of bounds + } + for (size_t start_pos = start_idx; start_pos <= generated_ids.size() - 1; ++start_pos) { + size_t common_length = 0; + + for (size_t i = 0; i < path.size() && (start_pos + i) < generated_ids.size(); ++i) { + if (path[i] == generated_ids[start_pos + i]) { + common_length++; + } else { + break; // Break on first mismatch (we want contiguous matches) + } + } + + // If this is the best match so far, update our tracking variables + if (common_length > max_common_length) { + max_common_length = common_length; + best_match = seq; + best_match_start_pos = start_pos; + } + } + } + auto need_to_fork = [&] () { + // check the remaining_retrive_indices from i+1 to find if same common prefix exists + for (size_t j = i + 1; j < remaining_retrieve_indices.size(); ++j) { + const auto& next_path = remaining_retrieve_indices[j]; + if (next_path.size() > max_common_length && + std::equal(path.begin(), path.begin() + max_common_length, next_path.begin())) { + return true; + } + } + return false; + }; + if (best_match && max_common_length > 0) { + if (need_to_fork()) { + // Fork the sequence if needed + // loop the child_2_parent_map to find the very parent + auto forked_sequence = m_sequence_group->fork_sequence(best_match); + available_sequences.push_back(forked_sequence); + auto parent_id = best_match->get_id(); + child_2_parent_map[forked_sequence->get_id()] = parent_id; + while (child_2_parent_map.find(parent_id) != child_2_parent_map.end()) { + parent_id = child_2_parent_map[parent_id]; + } + sampler_output.m_forked_sequences[parent_id].push_back(forked_sequence->get_id()); + } + // Mark the sequence as running and used + best_match->set_status(SequenceStatus::RUNNING); + used_sequences.push_back(best_match); + used_sequence_ids.insert(best_match->get_id()); + // Remove this sequence from available sequences + available_sequences.erase(std::remove(available_sequences.begin(), + available_sequences.end(), + best_match), + available_sequences.end()); + + + // For now, let's just ensure the token sequence matches the path + // by removing any extra tokens and adding missing ones + 
size_t current_length = best_match->get_generated_ids().size(); + + // Remove tokens if needed (if sequence is longer than path) + if (current_length - best_match_start_pos > path.size()) { + size_t tokens_to_remove = current_length - path.size(); + best_match->remove_last_tokens(tokens_to_remove); + } + + // append tokens to match the path + for (size_t i = 0; i < path.size() - max_common_length; ++i) { + // Append missing tokens + best_match->append_token(path[i + max_common_length], 0.0f); // Using 0.0f as default log_prob, as I do not care the prob later + } + } else { + std::cout << "No matching sequence found for a path of length " << path.size() << std::endl; + } + } + + pad_sequence_lengths(m_sequence_group); + // drop all waiting sequences + auto seqs = m_sequence_group->get_sequences(); + for (auto& seq : seqs) { + if (seq->is_caching()) { // remaining cached sequences can be now released + sampler_output.m_dropped_sequences.push_back(seq->get_id()); + m_sequence_group->remove_sequence(seq->get_id()); + } + } +} + +void Sampler::TopKSelector::select_top_k(const ov::Tensor& logits, SamplerOutput& sampler_output) { + // parent sequence ID -> number of child sequences + std::map parent_2_num_childs_map; + ov::Shape shape = logits.get_shape(); + OPENVINO_ASSERT(shape.size() == 3); + size_t batch = shape[0], seq_len = shape[1], vocab_size = shape[2]; + + if (m_tree_layer_counter == 0 && m_beams.empty()) { + tree_reset(m_sequence_group); + } + + for (Beam& beam : m_beams) { + sampler_output.num_generated_tokens++; + uint64_t parent_seq_id = beam.m_sequence->get_id(); + + // here we need to map index of sequence in beam search group(s) and sequence group + beam.m_global_beam_idx = [this](uint64_t seq_id) -> size_t { + std::vector running_seqs = m_sequence_group->get_running_sequences(); + for (size_t seq_global_index = 0; seq_global_index < running_seqs.size(); ++seq_global_index) { + if (seq_id == running_seqs[seq_global_index]->get_id()) + return seq_global_index; + } + OPENVINO_THROW("Internal error in beam search: should not be here"); + }(parent_seq_id); + OPENVINO_ASSERT(beam.m_global_beam_idx < batch, "Logits batch size doesn't match the number of beams"); + + // zero out all parent forks counts + parent_2_num_childs_map[parent_seq_id] = 0; + } + + std::vector candidates; + std::vector child_beams; // beams for next execution in step() + candidates.reserve(m_parameters.eagle_tree_params.branching_factor * m_beams.size()); // num_beams for each beam + m_tree_layer_counter++; + for (const Beam& beam : m_beams) { +#if 1 // optimize branch + size_t batch_offset = beam.m_global_beam_idx * seq_len * vocab_size; + size_t sequence_offset = (seq_len - 1) * vocab_size; + const float* beam_logits = logits.data() + batch_offset + sequence_offset; + float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size); + float log_sum = std::log(std::accumulate( + beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_logit); + })); + + // sort and find the topK + using Pair = std::pair; + auto cmp = [](const Pair& a, const Pair& b) { return a.first > b.first; }; + std::priority_queue, decltype(cmp)> minHeap(cmp); + + for (size_t i = 0; i < vocab_size; ++i) { + if (minHeap.size() < m_parameters.eagle_tree_params.branching_factor) { + minHeap.emplace(beam_logits[i], i); + } else if (beam_logits[i] > minHeap.top().first) { + minHeap.pop(); + minHeap.emplace(beam_logits[i], i); + } + } + // output topK of the 
logits (Ascending order) + std::vector result; + while (!minHeap.empty()) { + result.push_back(minHeap.top()); + minHeap.pop(); + } + // calculate topK's log_prob and token_id + for (auto it = result.rbegin(); it != result.rend(); ++it) { + Beam new_candidate = beam; + new_candidate.m_log_prob = it->first - max_logit - log_sum; + new_candidate.m_score += new_candidate.m_log_prob; + new_candidate.m_token_id = (it->second + (m_d2t? m_d2t[it->second] : 0)); + m_eagle2_candidate_graph->add_candidate(new_candidate, beam.m_node_id); + candidates.push_back(new_candidate); + } +#else + // do not need log softmax to match paper? + std::vector tokens = log_softmax(logits, beam.m_global_beam_idx); + + // sort tokens + std::partial_sort(tokens.begin(), + tokens.begin() + m_parameters.eagle_tree_params.branching_factor, + tokens.end(), + [](const Token& a, const Token& b) { + return a.m_log_prob > b.m_log_prob; + }); + + size_t add_count = 0; + for (Token token : tokens) { + Beam new_candidate = beam; + new_candidate.m_score += new_candidate.m_log_prob = token.m_log_prob; + new_candidate.m_token_id = (token.m_index + (m_d2t? m_d2t[token.m_index] : 0)); + m_eagle2_candidate_graph->add_candidate(new_candidate, beam.m_node_id); + candidates.push_back(new_candidate); + if (++add_count == m_parameters.eagle_tree_params.branching_factor) { + break; + } + } +#endif + } + + // Sample 2 * group_size highest score tokens to get at least 1 non EOS token per beam + // OPENVINO_ASSERT(candidates.size() >= 2 * group_size, "No beams left to search"); + + std::partial_sort(candidates.begin(), + candidates.begin() + m_parameters.eagle_tree_params.branching_factor, + candidates.end(), + greater); // select top k of cumulative probs + // leave the last cycle of beam selection to candidate finalization stage + if (m_tree_layer_counter < m_parameters.eagle_tree_params.tree_depth + 1) { + for (size_t cand_idx = 0; cand_idx < m_parameters.eagle_tree_params.branching_factor; ++cand_idx) { + Beam& candidate = candidates[cand_idx]; + + parent_2_num_childs_map[candidate.m_sequence->get_id()] += 1; + child_beams.push_back(candidate); // select top beams + } + + for (Beam& child_beam : child_beams) { + uint64_t parent_sequence_id = child_beam.m_sequence->get_id(); + uint64_t& num_childs = parent_2_num_childs_map[parent_sequence_id]; + + // if current beam is forked multiple times + if (num_childs > 1) { + child_beam.m_sequence = m_sequence_group->fork_sequence(child_beam.m_sequence); + child_beam.m_sequence->append_token(child_beam.m_token_id, child_beam.m_log_prob); + + // reduce forks count, since fork already happened and next loop iteration + // will go by the second branch (num_childs == 1) + --num_childs; + + // fill out sampler output + sampler_output.m_forked_sequences[parent_sequence_id].push_back(child_beam.m_sequence->get_id()); + } else if (num_childs == 1) { + // keep current sequence going and add a new token + child_beam.m_sequence->append_token(child_beam.m_token_id, child_beam.m_log_prob); + } + } + + //drop beams which are de-selected during top-k selection + for (const Beam& beam : m_beams) { + size_t num_childs = parent_2_num_childs_map[beam.m_sequence->get_id()]; + if (num_childs == 0) { + // do not drop, keep for further trace back + beam.m_sequence->set_status(SequenceStatus::CACHING); + } + } + + // child become parents + m_beams = child_beams; + } else { // at this point, we already have the full candidate tree + // now we start the finalization of candidates and last cycle of beam selection and sequence 
forking + for (auto& iter : m_sequence_group->get_running_sequences()) { // at this point, we should have running sequence num = branching factor + iter->set_status(SequenceStatus::CACHING); + } + finalize_eagle2_candidates(sampler_output); + m_tree_layer_counter = 0; // reset counter + m_beams.clear(); + return; + } +} + +Logits Sampler::_get_logit_vector(ov::Tensor logits, size_t batch_idx, size_t token_idx) { + ov::Shape logits_shape = logits.get_shape(); + size_t batch_size = logits_shape[0], seq_len = logits_shape[1], vocab_size = logits_shape[2]; + OPENVINO_ASSERT(batch_idx <= batch_size); + OPENVINO_ASSERT(token_idx < seq_len); + size_t batch_offset = batch_idx * seq_len * vocab_size; + size_t sequence_offset = (seq_len - token_idx - 1) * vocab_size; + float* logits_data = logits.data() + batch_offset + sequence_offset; + + return Logits{logits_data, vocab_size}; +} + bool Sampler::validate_candidate( Sequence::Ptr running_sequence, size_t& token_idx, @@ -791,105 +1653,172 @@ SequenceGroupSamplingInfo Sampler::sample_from_sequence_group(SequenceGroup::Ptr if (sampling_params.is_greedy_decoding() || sampling_params.is_multinomial()) { std::vector running_sequences = sequence_group->get_running_sequences(); size_t num_running_sequences = sequence_group->num_running_seqs(); - if (sampling_params.is_greedy_decoding()) { + if (sampling_params.is_greedy_decoding() && sequence_group->get_num_tokens_to_validate() == 0) { OPENVINO_ASSERT(num_running_sequences == 1); } - for (size_t running_sequence_id = 0; running_sequence_id < num_running_sequences; ++running_sequence_id) { - auto& running_sequence = running_sequences[running_sequence_id]; - bool is_validation_passed = true; - // make `num_tokens_to_process` iteration to validate a candidate generated by `draft_model` + 1 iteration to generate one more token by `main_model` - for (size_t i = 0; i <= num_tokens_to_process; ++i) { - if (running_sequence->has_finished()) - break; - sg_sampling_info.sampler_output.num_generated_tokens++; - // calculate token offset from the end of logit - size_t logit_token_offset = num_tokens_to_process - i; - size_t generated_seq_token_offset = num_generated_tokens_to_validate - i; - // max counter of needed to be sampled tokens - OPENVINO_ASSERT(running_sequence->get_generated_len() >= generated_seq_token_offset); - size_t generated_and_verified_len = running_sequence->get_generated_len() - generated_seq_token_offset; - OPENVINO_ASSERT(sequence_group->get_max_new_tokens() >= generated_and_verified_len); - size_t max_num_sampled_token = sequence_group->get_max_new_tokens() - generated_and_verified_len; - if (max_num_sampled_token == 0) { - stop_sample_tokens(running_sequence, generated_seq_token_offset, max_num_sampled_token, assisting_pipeline_info.max_removed_tokens_per_request); - break; - } - // do sampling only for token validation/generation. 
- // continue in case of extending draft model sequences by main model generated tokens which - // should be taken to KV cache without validation - if (!is_validation_mode_enabled && generated_seq_token_offset > 0) { - continue; + if (is_validation_mode_enabled && num_generated_tokens_to_validate > 0 ) { + // trigger group validation for eagle mode + auto selected_path = validate_eagle2_candidates(sequence_group, + sequence_group_logits, + logit_processor, + sg_sampling_info.sampler_output.num_generated_tokens, + assisting_pipeline_info.max_removed_tokens_per_request, + num_tokens_to_process, + sampling_params.do_sample); + // drop other sequences + auto running_sequences = sequence_group->get_running_sequences(); + for (size_t i = 0; i < running_sequences.size(); ++i) { + if (i != selected_path) { + auto& running_sequence = running_sequences[i]; + running_sequence->set_status(SequenceStatus::FINISHED); + running_sequence->set_finish_reason(GenerationFinishReason::STOP); + sg_sampling_info.sampler_output.m_dropped_sequences.push_back(running_sequence->get_id()); + sequence_group->remove_sequence(running_sequence->get_id()); } + } + assisting_pipeline_info.min_generated_len = std::min(assisting_pipeline_info.min_generated_len, sequence_group->get_running_sequences().front()->get_generated_len()); + auto sampling_params = sequence_group->get_sampling_parameters(); + auto running_sequence = sequence_group->get_running_sequences()[0]; + auto sampled_token = running_sequence->get_generated_ids().back(); + for (const auto& dropped_seq_id : _try_finish_generation(sequence_group)) { + sg_sampling_info.sampler_output.m_dropped_sequences.push_back(dropped_seq_id); + } + /*if (is_stop_token_id_hit(sampled_token, sampling_params.stop_token_ids) && + !sampling_params.ignore_eos) { + running_sequence->set_status(SequenceStatus::FINISHED); + running_sequence->set_finish_reason(GenerationFinishReason::STOP); + sg_sampling_info.sampler_output.m_dropped_sequences.push_back(running_sequence->get_id()); + }*/ + } else { + for (size_t running_sequence_id = 0; running_sequence_id < num_running_sequences; ++running_sequence_id) { + auto& running_sequence = running_sequences[running_sequence_id]; + bool is_validation_passed = true; + // make `num_tokens_to_process` iteration to validate a candidate generated by `draft_model` + 1 + // iteration to generate one more token by `main_model` + for (size_t i = 0; i <= num_tokens_to_process; ++i) { + if (running_sequence->has_finished()) + break; + sg_sampling_info.sampler_output.num_generated_tokens++; + // calculate token offset from the end of logit + size_t token_offset = num_tokens_to_process - i; + // max counter of needed to be sampled tokens + OPENVINO_ASSERT(running_sequence->get_generated_len() >= token_offset); + size_t generated_and_verified_len = running_sequence->get_generated_len() - token_offset; + OPENVINO_ASSERT(sequence_group->get_max_new_tokens() >= generated_and_verified_len); + size_t max_num_sampled_token = sequence_group->get_max_new_tokens() - generated_and_verified_len; + if (max_num_sampled_token == 0) { + stop_sample_tokens(running_sequence, + token_offset, + max_num_sampled_token, + assisting_pipeline_info.max_removed_tokens_per_request); + break; + } - auto logit_vector = _get_logit_vector(sequence_group_logits, running_sequence_id, logit_token_offset); - logit_processor.apply(logit_vector); - - Token sampled_token; - bool is_generate_n_tokens = false; - if (sampling_params.is_greedy_decoding()) { - sampled_token = { 
_greedy_sample(logit_vector, sampling_params.logprobs) }; - } else { - // is_multinomial() - is_generate_n_tokens = sequence_group->num_total_seqs() == 1; - const size_t num_tokens_per_sequence = is_generate_n_tokens ? sampling_params.num_return_sequences : 1; - is_generate_n_tokens &= (num_tokens_per_sequence > 1); - auto sampled_token_ids = _multinomial_sample(logit_vector, num_tokens_per_sequence); - OPENVINO_ASSERT(sampled_token_ids.size(), num_tokens_per_sequence); - // to create n sequence just in case of `sequence_group->num_total_seqs() == 1` and `sampling_params.num_return_sequences > 1` - if (is_generate_n_tokens) { - const auto forked_seq_ids = create_n_forked_sequences(sequence_group, logit_processor, sampled_token_ids); - sg_sampling_info.sampler_output.m_forked_sequences.insert({running_sequences[0]->get_id(), forked_seq_ids}); + // do sampling only for token validation/generation. + // continue in case of extending draft model sequences by main model generated tokens which + // should be taken to KV cache without validation + if (!is_validation_mode_enabled && token_offset > 0) { + continue; } - sampled_token = sampled_token_ids.front(); - // make `_speculative_sampling` in case of previous token was not accepted in speculative decoding - if (!is_validation_passed) { - float p_prime = get_p_prime(running_sequence, sampled_token, generated_seq_token_offset + 1); - assisting_pipeline_info.max_removed_tokens_per_request = std::max(assisting_pipeline_info.max_removed_tokens_per_request, generated_seq_token_offset); - // update prob only in case candidate prob > sampled token prob - if (p_prime > 0.f) { - auto prob = std::exp(sampled_token.m_log_prob); - prob /= p_prime; - sampled_token.m_log_prob = std::log(prob); + + auto logit_vector = _get_logit_vector(sequence_group_logits, running_sequence_id, token_offset); + logit_processor.apply(logit_vector); + + Token sampled_token; + bool is_generate_n_tokens = false; + if (sampling_params.is_greedy_decoding()) { + sampled_token = {_greedy_sample(logit_vector, sampling_params.logprobs)}; + } else { + // is_multinomial() + is_generate_n_tokens = sequence_group->num_total_seqs() == 1; + const size_t num_tokens_per_sequence = + is_generate_n_tokens ? 
sampling_params.num_return_sequences : 1; + is_generate_n_tokens &= (num_tokens_per_sequence > 1); + auto sampled_token_ids = _multinomial_sample(logit_vector, num_tokens_per_sequence); + OPENVINO_ASSERT(sampled_token_ids.size(), num_tokens_per_sequence); + // to create n sequence just in case of `sequence_group->num_total_seqs() == 1` and + // `sampling_params.num_return_sequences > 1` + if (is_generate_n_tokens) { + const auto forked_seq_ids = + create_n_forked_sequences(sequence_group, logit_processor, sampled_token_ids); + sg_sampling_info.sampler_output.m_forked_sequences.insert( + {running_sequences[0]->get_id(), forked_seq_ids}); + } + sampled_token = sampled_token_ids.front(); + // make `_speculative_sampling` in case of previous token was not accepted in speculative + // decoding + if (!is_validation_passed) { + float p_prime = get_p_prime(running_sequence, sampled_token, token_offset + 1); + assisting_pipeline_info.max_removed_tokens_per_request = + std::max(assisting_pipeline_info.max_removed_tokens_per_request, token_offset); + // update prob only in case candidate prob > sampled token prob + if (p_prime > 0.f) { + auto prob = std::exp(sampled_token.m_log_prob); + prob /= p_prime; + sampled_token.m_log_prob = std::log(prob); + } } } - } - // flag to add sampled token to generated sequence or extend logit processors only - bool is_extend_sequence = logit_token_offset == 0 || is_generate_n_tokens || !is_validation_passed; - if (is_validation_mode_enabled && !is_extend_sequence) { - is_validation_passed = validate_candidate(running_sequences[running_sequence_id], generated_seq_token_offset, - sampled_token, is_extend_sequence, assisting_pipeline_info.max_removed_tokens_per_request, - sampling_params.do_sample, !sampling_params.is_prompt_lookup()); - - // doing resample in case of non accepted tokens in speculative sampling - if (!is_validation_passed && sampling_params.do_sample && !sampling_params.is_prompt_lookup()) { - continue; - } - // update log prob just while validation process - if (!is_extend_sequence) { - OPENVINO_ASSERT(generated_and_verified_len < running_sequences[running_sequence_id]->get_generated_len()); - running_sequence->update_generated_log_prob(generated_and_verified_len, sampled_token.m_log_prob); + // flag to add sampled token to generated sequence or extend logit processors only + bool is_extend_sequence = token_offset == 0 || is_generate_n_tokens || !is_validation_passed; + if (is_validation_mode_enabled && !is_extend_sequence) { + is_validation_passed = + validate_candidate(running_sequences[running_sequence_id], + token_offset, + sampled_token, + is_extend_sequence, + assisting_pipeline_info.max_removed_tokens_per_request, + sampling_params.do_sample, !sampling_params.is_prompt_lookup()); + // doing resample in case of non accepted tokens in specualtive sampling + if (!is_validation_passed && sampling_params.do_sample) { + continue; + } + // update log prob just while validation process + if (!is_extend_sequence) { + OPENVINO_ASSERT(generated_and_verified_len < + running_sequences[running_sequence_id]->get_generated_len()); + running_sequence->update_generated_log_prob(generated_and_verified_len, + sampled_token.m_log_prob); + } } - } - register_new_token(sampled_token, running_sequences[running_sequence_id], logit_processor, is_extend_sequence, is_validation_mode_enabled); - - // to exit from sampling in case of failed token validation - if (!is_validation_passed) { - break; - } else { - auto sampling_params = sequence_group->get_sampling_parameters(); 
- if (is_stop_token_id_hit(sampled_token.m_index, sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { - running_sequence->set_status(SequenceStatus::FINISHED); - running_sequence->set_finish_reason(GenerationFinishReason::STOP); - sg_sampling_info.sampler_output.m_dropped_sequences.push_back(running_sequence->get_id()); + register_new_token(sampled_token, + running_sequences[running_sequence_id], + logit_processor, + is_extend_sequence, + is_validation_mode_enabled); + // to exit from sampling in case of failed token validation + if (!is_validation_passed) { + break; + } else { + auto sampling_params = sequence_group->get_sampling_parameters(); + if (is_stop_token_id_hit(sampled_token.m_index, sampling_params.stop_token_ids) && + !sampling_params.ignore_eos) { + running_sequence->set_status(SequenceStatus::FINISHED); + running_sequence->set_finish_reason(GenerationFinishReason::STOP); + sg_sampling_info.sampler_output.m_dropped_sequences.push_back(running_sequence->get_id()); + } } } + assisting_pipeline_info.min_generated_len = + std::min(assisting_pipeline_info.min_generated_len, running_sequence->get_generated_len()); + } + align_all_sequence_len(sequence_group, assisting_pipeline_info.min_generated_len, logit_processor); + for (const auto& dropped_seq_id : _try_finish_generation(sequence_group)) { + sg_sampling_info.sampler_output.m_dropped_sequences.push_back(dropped_seq_id); } - assisting_pipeline_info.min_generated_len = std::min(assisting_pipeline_info.min_generated_len, running_sequence->get_generated_len()); } - align_all_sequence_len(sequence_group, assisting_pipeline_info.min_generated_len, logit_processor); - for (const auto& dropped_seq_id : _try_finish_generation(sequence_group)) { - sg_sampling_info.sampler_output.m_dropped_sequences.push_back(dropped_seq_id); + } else if (sampling_params.is_eagle_tree()) { + TopKSelector* topk_searcher; + { + uint64_t request_id = sequence_group->get_request_id(); + std::lock_guard lock(m_beam_search_info_mutex); + if (m_top_k_selector_info.find(request_id) == m_top_k_selector_info.end()) { + ov::Tensor empty_tensor; + m_top_k_selector_info.emplace(request_id, TopKSelector(sequence_group, (m_d2t ? 
m_d2t->get_tensor_view() : empty_tensor))); + } + topk_searcher = &m_top_k_selector_info.at(request_id); } + topk_searcher->select_top_k(sequence_group_logits, sg_sampling_info.sampler_output); } else if (sampling_params.is_beam_search()) { uint64_t request_id = sequence_group->get_request_id(); diff --git a/src/cpp/src/sampling/sampler.hpp b/src/cpp/src/sampling/sampler.hpp index ffbbcac3e3..b764eeb4b1 100644 --- a/src/cpp/src/sampling/sampler.hpp +++ b/src/cpp/src/sampling/sampler.hpp @@ -44,6 +44,15 @@ inline bool is_stop_token_id_hit_in_sequence_group(SequenceGroup::Ptr sequence_g } std::vector log_softmax(const ov::Tensor& logits, size_t batch_idx); +struct Eagle2ValidationResult { + size_t accepted_path_id = -1; + size_t accepted_path_length = 0; + std::vector accepted_tokens; + std::vector updated_log_probs; + bool is_path_accepted = false; + size_t rejected_at_position = 0; + Token extra_sampled_token; // sampled in main model from the rejection pos +}; struct SamplerOutput { // IDs of sequences that need to be dropped @@ -72,14 +81,40 @@ struct SequenceGroupSamplingInfo { class Sampler { class GroupBeamSearcher; + class TopKSelector; Logits _get_logit_vector(ov::Tensor logits, size_t batch_idx, size_t token_idx); Token _greedy_sample(const Logits& logits, size_t top_logprobs) const; + Token _greedy_sample_with_batch_idx(ov::Tensor logits, size_t batch_idx, size_t token_idx); std::vector _multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence); std::vector _try_finish_generation(SequenceGroup::Ptr & sequence_group); bool validate_candidate(Sequence::Ptr running_sequence, size_t& token_idx, Token& sampled_token, bool& is_extend_sequence, size_t& max_removed_tokens, bool do_sample, bool has_real_probolities); + // EAGLE2 tree validation functions + Eagle2ValidationResult validate_eagle2_tree( + const std::vector>& candidate_paths, + const std::vector>& candidate_log_probs, + const std::vector beam_id, + const ov::Tensor& main_model_logits, + LogitProcessor& logit_processor, + bool do_sample = false); + + // Helper functions for EAGLE2 validation + size_t find_common_prefix_length(const std::vector>& paths); + + std::pair find_token_probability(const Logits& logits, int64_t token_id); + + Token get_highest_probability_token(const Logits& logits); + + size_t select_best_continuation_path( + const std::vector>& candidate_paths, + const std::vector>& candidate_log_probs, + const std::vector& beam_id, + const size_t& common_prefix_length, + Logits& logit, + const ov::Tensor& main_model_logits, + LogitProcessor& logit_processor); SequenceGroupSamplingInfo sample_from_sequence_group(SequenceGroup::Ptr sequence_group, ov::Tensor sequence_group_logits, LogitProcessor& logit_processor, const std::pair>& stop_strings, @@ -87,6 +122,8 @@ class Sampler { // request ID => beam search tracking information std::map m_beam_search_info; + + std::map m_top_k_selector_info; std::mutex m_beam_search_info_mutex; std::mt19937 rng_engine; @@ -99,6 +136,9 @@ class Sampler { Tokenizer m_tokenizer; ThreadPool m_thread_pool; + std::shared_ptr m_structured_output_controller; + std::shared_ptr m_d2t; // Tensor to store d2t mapping for eagle model + public: Sampler(const Sampler& rhs) = delete; Sampler(Sampler&& rhs) = delete; @@ -121,10 +161,31 @@ class Sampler { LogitProcessor& get_logit_processor(uint64_t request_id); void create_logit_processor(uint64_t request_id, const GenerationConfig& sampling_parameters, const TokenIds& prompt); + int validate_eagle2_candidates( + SequenceGroup::Ptr 
seq_group, + const ov::Tensor& main_model_logits, + LogitProcessor& logit_processor, + size_t& generated_tokens_count, + size_t& max_removed_tokens, + size_t& num_tokens_to_process, + bool do_sample = false); + + void clear_top_k_selector(uint64_t request_id); + + void set_d2t_for_decoding(std::shared_ptr& d2t) { + m_d2t = d2t; + }; + std::map get_beam_idxs(SequenceGroup::CPtr sequence_group); // pair with map with backend name and corresponding compiler init time, and vector of compile times for each concrete grammar std::pair, std::vector> get_structured_output_times(); void clear_structured_output_compile_times(); + TopKSelector* get_top_k_selector(uint64_t request_id) { + /*if (m_top_k_selector_info.find(request_id) == m_top_k_selector_info.end()) { + OPENVINO_ASSERT(false, "TopKSelector for request_id " + std::to_string(request_id) + " is not initialized"); + }*/ + return &m_top_k_selector_info.at(request_id); + }; }; class Sampler::GroupBeamSearcher { @@ -171,4 +232,240 @@ class Sampler::GroupBeamSearcher { void finalize(SamplerOutput& sampler_output); std::map get_beam_idxs(); }; -} + +class Sampler::TopKSelector { + struct Beam { + Sequence::Ptr m_sequence; + size_t m_global_beam_idx = 0; + + // beam is made on top of sequence + float m_log_prob = 0.0f; + int64_t m_token_id = -1; + int m_tree_layer = 0; // layer in the tree structure + int64_t m_node_id = 0; // unique ID for the beam in the tree structure + // cumulative log probabilities + float m_score = -std::numeric_limits::infinity(); + + Beam(Sequence::Ptr sequence) : m_sequence(std::move(sequence)) {} + + size_t get_generated_len() const { + return m_sequence->get_generated_len(); + } + }; + void tree_reset(SequenceGroup::Ptr& sequence_group); + static bool greater(const Beam& left, const Beam& right) { + return left.m_score > right.m_score; + } + struct CandidateNode { + Beam candidate_beam; + std::vector> children; + std::weak_ptr parent; + + uint64_t get_id() const { + return candidate_beam.m_node_id; + } + CandidateNode(Beam beam) : candidate_beam(std::move(beam)) {} + }; + class Eagle2CandidateGraph { + public: + Eagle2CandidateGraph(Beam root_beam, int k = 0, int max_depth = 0) + : total_tokens(k), + max_depth(max_depth), + current_depth(0), + next_node_id(1) { + root_beam.m_node_id = 0; // Root always has ID 0 + root_beam.m_tree_layer = 0; + root = std::make_shared(std::move(root_beam)); + + node_map[0] = root; + layer_to_nodes[0].push_back(root); // for access to nodes on same layer + } + + void add_candidate(Beam& new_beam, uint64_t parent_node_id) { + if (new_beam.m_tree_layer > max_depth) { + return; + } + + auto parent_it = node_map.find(parent_node_id); + if (parent_it == node_map.end()) { + OPENVINO_THROW("Parent node not found in candidate graph"); + } + + auto parent_node = parent_it->second; + + // Assign new node ID and update beam + new_beam.m_node_id = next_node_id++; + new_beam.m_tree_layer = parent_node->candidate_beam.m_tree_layer + 1; + + auto new_node = std::make_shared(new_beam); + new_node->parent = parent_node; + + // Add to parent's children + parent_node->children.push_back(new_node); + + // Update mappings + node_map[new_node->get_id()] = new_node; + layer_to_nodes[new_node->candidate_beam.m_tree_layer].push_back(new_node); + + // Update current depth + current_depth = std::max(current_depth, new_node->candidate_beam.m_tree_layer); + } + + std::vector get_top_k_candidates() { + if (total_tokens <= 0) + return {}; + + // Use min-heap to efficiently get top-k candidates + auto cmp = [](const 
std::shared_ptr& a, const std::shared_ptr& b) { + return a->candidate_beam.m_score > b->candidate_beam.m_score; // min-heap + }; + + std::priority_queue, + std::vector>, + decltype(cmp)> + min_heap(cmp); + + // BFS traversal to find all candidates (excluding root) + std::queue> bfs_queue; + bfs_queue.push(root); + + while (!bfs_queue.empty()) { + auto node = bfs_queue.front(); + bfs_queue.pop(); + + // Add non-root nodes to heap + if (node != root) { + if (min_heap.size() < static_cast(total_tokens)) { + min_heap.push(node); + } else if (node->candidate_beam.m_score > min_heap.top()->candidate_beam.m_score) { + min_heap.pop(); + min_heap.push(node); + } + } + + // Add children to BFS queue + for (const auto& child : node->children) { + bfs_queue.push(child); + } + } + + // Extract results and sort by score (descending) + std::vector result; + result.reserve(min_heap.size()); + + while (!min_heap.empty()) { + result.push_back(min_heap.top()->candidate_beam); + min_heap.pop(); + } + + std::sort(result.begin(), result.end(), [](const Beam& a, const Beam& b) { + return a.m_score > b.m_score; + }); + + return result; + } + + std::vector get_leaf_nodes_from_candidates(const std::vector& candidates) { + std::vector leaf_nodes; + std::unordered_set candidate_ids; + + // Build set of candidate node IDs + for (const auto& beam : candidates) { + candidate_ids.insert(beam.m_node_id); + } + + // Check each candidate to see if it's a leaf in the selected set + for (const auto& candidate_beam : candidates) { + auto node_it = node_map.find(candidate_beam.m_node_id); + if (node_it == node_map.end()) + continue; + + auto node = node_it->second; + + // Check if this node has any children in the candidate set + bool has_candidate_child = false; + for (const auto& child : node->children) { + if (candidate_ids.count(child->get_id()) > 0) { + has_candidate_child = true; + break; + } + } + + if (!has_candidate_child) { + leaf_nodes.push_back(candidate_beam); + } + } + + return leaf_nodes; + } + + std::vector get_path_to_node(uint64_t node_id) { + auto node_it = node_map.find(node_id); + if (node_it == node_map.end()) { + return {}; + } + + std::vector path; + auto current_node = node_it->second; + + // Traverse from node to root, collecting token IDs + while (current_node && current_node != root) { + path.push_back(current_node->candidate_beam.m_token_id); + current_node = current_node->parent.lock(); + } + // path.push_back(root->candidate_beam.m_token_id); // Add root + // Reverse to get path from root to node + std::reverse(path.begin(), path.end()); + return path; + } + + void print_tree() { // for debugging purposes + std::cout << "Eagle2 Candidate Tree (Depth: " << current_depth << ")\n"; + print_node(root, 0); + } + + private: + std::shared_ptr root; + uint64_t next_node_id; // for new node + + std::unordered_map> node_map; + std::unordered_map>> layer_to_nodes; + + int total_tokens; + int max_depth; + int current_depth; + + void print_node(const std::shared_ptr& node, int depth) { + std::string indent(depth * 2, ' '); + + if (node == root) { + std::cout << indent << "[ROOT] ID: " << node->get_id() << "\n"; + } else { + std::cout << indent << "ID: " << node->get_id() << " Token: " << node->candidate_beam.m_token_id + << " Score: " << node->candidate_beam.m_score + << " Layer: " << node->candidate_beam.m_tree_layer << "\n"; + } + + for (const auto& child : node->children) { + print_node(child, depth + 1); + } + } + }; + size_t m_tree_layer_counter = 0; + SequenceGroup::Ptr m_sequence_group; + 
std::shared_ptr m_eagle2_candidate_graph; + std::vector m_beams; + ov::genai::GenerationConfig m_parameters; + int64_t* m_d2t; // Draft-to-target token ID offset + +public: + explicit TopKSelector(SequenceGroup::Ptr sequence_group, ov::Tensor d2t); + + void select_top_k(const ov::Tensor& logits, SamplerOutput& sampler_output); + void finalize_eagle2_candidates(SamplerOutput& sampler_output); + // float get_eagle2_layer_weight(size_t layer) { + // return std::ext(-m_parameter.eagle_layer_decay * (layer - 1)); + //} + void apply_eagle2_scoring() {} // to be implemented +}; +} // namespace ov::genai diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 664ac665cf..4201915f22 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -20,7 +20,8 @@ enum class SequenceStatus { RUNNING = 0, FINISHED = 1, OUT_OF_MEMORY = 2, - WAITING = 3 + WAITING = 3, + CACHING = 4 // Sequence is waiting for top-k to be finialized }; enum class SequenceGroupType { @@ -42,6 +43,7 @@ class Sequence { TokenIds m_generated_ids; LogProbs m_generated_log_probs; + ov::Tensor m_hidden_state; uint64_t m_grouped_id; uint64_t m_id = _get_next_global_sequence_id(); SequenceStatus m_status = SequenceStatus::RUNNING; @@ -66,6 +68,8 @@ class Sequence { Sequence(const Sequence& seq, const uint64_t id) : m_generated_ids(seq.m_generated_ids), + m_generated_log_probs(seq.m_generated_log_probs), + m_hidden_state(seq.m_hidden_state), m_grouped_id(id), m_status(seq.m_status), m_cumulative_log_prob(seq.m_cumulative_log_prob), @@ -115,6 +119,10 @@ class Sequence { return m_status == SequenceStatus::WAITING; } + bool is_caching() const { + return m_status == SequenceStatus::CACHING; + } + void set_status(SequenceStatus status) { m_status = status; } @@ -127,6 +135,14 @@ class Sequence { m_finish_reason = finish_reason; } + void update_hidden_state(ov::Tensor tensor) { + m_hidden_state = tensor; + } + + ov::Tensor& get_hidden_state() { + return m_hidden_state; + } + // appends new tokens to a generated part void append_token(int64_t token_id, float log_prob) { m_cumulative_log_prob += log_prob; @@ -479,6 +495,27 @@ class SequenceGroup : public std::enable_shared_from_this { return running_seqs; } + std::vector get_caching_sequences() { + std::vector caching_seqs; + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_caching()) { + caching_seqs.emplace_back(m_sequences[seq_id]); + } + } + + return caching_seqs; + } + + std::vector get_caching_sequences() const { + std::vector caching_seqs; + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_caching()) { + caching_seqs.emplace_back(m_sequences[seq_id]); + } + } + + return caching_seqs; + } uint64_t get_request_id() const { return m_request_id; } @@ -561,7 +598,7 @@ class SequenceGroup : public std::enable_shared_from_this { m_num_validation_tokens = k; } - size_t get_num_tokens_to_validate() { + size_t get_num_tokens_to_validate() const { return m_num_validation_tokens; } @@ -643,6 +680,19 @@ class SequenceGroup : public std::enable_shared_from_this { return (get_context_len() - get_num_evicted_tokens() + m_block_size - 1) / m_block_size; } + /** + * @return The number of logical KV cache blocks required to host 1 extra token in this sequence group, taking into account previous token evictions. 
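+     * Tokens still pending validation are excluded from the count, i.e.
+     * ceil((context_len - evicted_tokens - tokens_to_validate) / block_size).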
+ */ + size_t get_num_logical_blocks_for_1_generation() const { + return (get_context_len() - get_num_evicted_tokens() - get_num_tokens_to_validate() + m_block_size - 1) / m_block_size; + } + /** + * @return The number of logical KV cache blocks required to host validation tokens in this sequence group, taking into account previous token evictions. + */ + size_t get_num_logical_blocks_for_validation_tokens() const { + return (get_context_len() - get_num_evicted_tokens() - 1 + m_block_size - 1) / m_block_size; + } + // requires number of physical blocks for next generation size_t get_num_blocks() const { return get_num_logical_blocks(); @@ -687,6 +737,15 @@ class SequenceGroup : public std::enable_shared_from_this { return m_is_gen_paused; } + bool is_caching() const { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_caching()) { + return true; // in the middle of drafting stage + } + } + return m_is_gen_paused; + } + GenerationStream::Ptr get_generation_stream() { return m_generation_stream; } diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp index def8f88372..71c6561830 100644 --- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp @@ -16,6 +16,7 @@ ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::Contin m_generation_config = generation_config; m_is_validation_mode_enabled = is_validation_mode_enabled; initialize_pipeline(model, scheduler_config, device, plugin_config); + //m_candidate_graph = Eagle2CandidateGraph(m_generation_config.eagle_tree_width, m_generation_config.eagle_tree_depth); } void @@ -100,6 +101,41 @@ get_prefix_len( return { min_generated_tokens, min_candidate_len }; } +std::pair +get_prefix_len( + const std::vector& running_sequences, + const EagleGeneratedSequences& candidates) { + size_t min_generated_tokens = std::numeric_limits::max(), + min_candidate_len = std::numeric_limits::max(); + for (const auto& running_sequence : running_sequences) { + const auto& sequence_id = running_sequence->get_grouped_id(); + if (!candidates.count(sequence_id)) { + continue; + } + + const auto& candidate_sequence = candidates.at(sequence_id); + + const std::vector& candidate_token_ids = candidate_sequence.token_ids, + running_token_ids = running_sequence->get_generated_ids(); + + const size_t candidate_sequence_gen_len = candidate_token_ids.size(), + running_sequence_gen_len = running_sequence->get_generated_len(); + + // to find the len of prefix + size_t sequence_prefix_len = std::min(candidate_sequence_gen_len, running_sequence_gen_len); + for (size_t i = 0; i < sequence_prefix_len; ++i) { + if (candidate_token_ids[i] != running_token_ids[i]) { + sequence_prefix_len = i; + break; + } + } + + min_generated_tokens = std::min(sequence_prefix_len, min_generated_tokens); + min_candidate_len = std::min(candidate_sequence_gen_len, min_candidate_len); + } + return { min_generated_tokens, min_candidate_len }; +} + size_t remove_tokens_from_sequence(Sequence::Ptr& sequence, size_t min_generated_tokens, @@ -110,7 +146,7 @@ remove_tokens_from_sequence(Sequence::Ptr& sequence, size_t removed_token_cnt = sequence_generated_len - min_generated_tokens; for (size_t i = min_generated_tokens; i < sequence_generated_len; ++i) { - 
logit_proccessor.decrease_generated_token_occurance(generated_token_ids[i]); + logit_proccessor.decrease_generated_token_occurance(generated_token_ids[i]); } sequence->remove_last_tokens(removed_token_cnt); return (sequence_generated_len - min_generated_tokens); @@ -347,4 +383,415 @@ void ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::m } } } + +// Eagle impl +ContinuousBatchingPipeline::ContinuousBatchingForEagleDecodingImpl::ContinuousBatchingForEagleDecodingImpl( + const std::shared_ptr& model, + const Tokenizer& tokenizer, + const GenerationConfig& generation_config, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& plugin_config, + bool is_validation_mode_enabled) { + m_tokenizer = tokenizer; + m_generation_config = generation_config; + m_is_validation_mode_enabled = is_validation_mode_enabled; + initialize_pipeline(model, scheduler_config, device, plugin_config); +} + +void +ContinuousBatchingPipeline::ContinuousBatchingForEagleDecodingImpl::finish_request(SequenceGroup::Ptr request) { + for (const auto& sequence: request->get_sequences()) { + if (m_scheduler->has_block_table(sequence->get_id())) { + m_scheduler->free_sequence(sequence->get_id()); + } + } + m_sampler->clear_request_info(request->get_request_id()); + request->set_generation_status(GenerationStatus::STOP); +} + +void ContinuousBatchingPipeline::ContinuousBatchingForEagleDecodingImpl::finish_request(int64_t request_id) { + auto it = m_requests.begin(); + while (it != m_requests.end()) { + auto& request = *it; + if (request->get_request_id() != request_id && request_id != -1) { + it++; + continue; + } + finish_request(request); + m_requests.erase(it); + it = request_id == -1 ? m_requests.begin() : m_requests.end(); + } + if (request_id == -1) { + OPENVINO_ASSERT(m_requests.empty()); + } +} + +EagleGeneratedRequests +ContinuousBatchingPipeline::ContinuousBatchingForEagleDecodingImpl::get_generated_requests() { + + EagleGeneratedRequests result; + for (const auto& request : m_requests) { + const auto& request_id = request->get_request_id(); + if (!result.count(request_id)) { + result.insert({request_id, {{}}}); + } + auto& generated_request = result[request_id]; + for (const auto& sequence : request->get_running_sequences()) { + const auto& sequence_id = sequence->get_grouped_id(); + OPENVINO_ASSERT(!generated_request.count(sequence_id)); + generated_request.insert({{sequence_id, { sequence->get_generated_ids(), sequence->get_generated_log_probs(), sequence->get_hidden_state()} }}); + } + } + return result; +} + +UpdateRequestResult +ContinuousBatchingPipeline::ContinuousBatchingForEagleDecodingImpl::init_request_by_candidate( + uint64_t request_id, + const GeneratedSequences& candidates) { + for (auto& request : m_requests) { + if (request->get_request_id() != request_id) { + continue; + } + + UpdateRequestResult result; + m_sampler->create_logit_processor(request_id, request->get_sampling_parameters(), request->get_prompt_ids()); + auto& logit_processor = m_sampler->get_logit_processor(request_id); + result.inserted_tokens_cnt = init_request(request, candidates, logit_processor, true, true); + request->set_num_validated_tokens(result.inserted_tokens_cnt); + return result; + } + return {0, 0}; +} + +UpdateRequestResult ContinuousBatchingPipeline::ContinuousBatchingForEagleDecodingImpl::update_main_request( + uint64_t request_id, + const EagleGeneratedSequences& candidates) { + UpdateRequestResult result{0, 0}; + for (auto& request : m_requests) { + if 
(request_id != request->get_request_id()) { + continue; + } + + // handle update main request first, at this point, main should already have a logit processor created + std::vector running_sequences = + request->get_running_sequences(); // main model sequences, should be only one sequence + OPENVINO_ASSERT(running_sequences.size() > 0); + if (running_sequences.front()->get_generated_len() == 0 && !request->get_num_tokens_to_validate()) { + m_sampler->create_logit_processor(request_id, + request->get_sampling_parameters(), + request->get_prompt_ids()); + auto& logit_processor = m_sampler->get_logit_processor(request_id); + result.inserted_tokens_cnt = 0; + // min_generated_tokens = result.inserted_tokens_cnt; + // min_candidate_len = result.inserted_tokens_cnt; + } else { + // for main request, beam search is not supported, so we should have only one sequence in request at this + // time always, otherwise, the main request has not finished validation yet, skip it + if (running_sequences.size() == 1) { + auto first_sequence = running_sequences.front(); + auto previously_grouped_id = first_sequence->get_grouped_id(); + size_t generated_len = first_sequence->get_generated_len(); + + std::map existing_sequences; + for (auto& seq : running_sequences) { + existing_sequences[seq->get_grouped_id()] = seq; + } + + std::vector> sequences_to_fork; + std::vector> sequences_to_update; + + for (const auto& candidate_sequence : candidates) { + size_t candidate_group_id = candidate_sequence.first; + const auto& candidate_data = candidate_sequence.second; + + if (previously_grouped_id == candidate_group_id) { + sequences_to_update.push_back(candidate_sequence); + } else { + sequences_to_fork.push_back(candidate_sequence); + } + } + for (const auto& candidate_sequence : sequences_to_fork) { + size_t candidate_group_id = candidate_sequence.first; + const auto& candidate_data = candidate_sequence.second; + + Sequence::Ptr target_sequence = Sequence::fork(first_sequence, candidate_group_id); + m_scheduler->fork_sequence(first_sequence->get_id(), target_sequence->get_id()); + target_sequence->set_status(ov::genai::SequenceStatus::RUNNING); + request->add_sequence(target_sequence); + + auto token_ids = candidate_data.token_ids; + auto log_probs = candidate_data.log_probs; + size_t min_candidate_len = std::min(token_ids.size(), log_probs.size()); + token_ids.resize(min_candidate_len); + log_probs.resize(min_candidate_len); + + size_t current_generated_len = target_sequence->get_generated_len(); + for (size_t i = current_generated_len; i < min_candidate_len; ++i) { + target_sequence->append_token(token_ids[i], log_probs[i]); + } + } + for (const auto& candidate_sequence : sequences_to_update) { + size_t candidate_group_id = candidate_sequence.first; + const auto& candidate_data = candidate_sequence.second; + + auto token_ids = candidate_data.token_ids; + auto log_probs = candidate_data.log_probs; + size_t min_candidate_len = std::min(token_ids.size(), log_probs.size()); + token_ids.resize(min_candidate_len); + log_probs.resize(min_candidate_len); + + size_t current_generated_len = first_sequence->get_generated_len(); + for (size_t i = current_generated_len; i < min_candidate_len; ++i) { + first_sequence->append_token(token_ids[i], log_probs[i]); + } + } + auto it = std::find_if(sequences_to_update.begin(), + sequences_to_update.end(), + [previously_grouped_id](const std::pair& p) { + return p.first == previously_grouped_id; + }); + if (it == sequences_to_update.end()) { + // free as not further needed + 
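+                    // None of the candidate branches kept the group id of the previously tracked
+                    // sequence, so that sequence is finished and its KV cache blocks are released
+                    // in the scheduler; generation continues on the branches forked above.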
first_sequence->set_status(ov::genai::SequenceStatus::FINISHED); + request->remove_sequence(first_sequence->get_id()); + m_scheduler->free_sequence(first_sequence->get_id()); + } + + result.inserted_tokens_cnt = request->get_running_sequences().front()->get_generated_len() - + generated_len; // align sequence before validation + } + } + // update request context information to provide correct scheduling phase + if (result.inserted_tokens_cnt > 0 && result.removed_tokens_cnt == 0) { + request->set_num_validated_tokens(result.inserted_tokens_cnt); + } + break; + } + return result; +} + +UpdateRequestResult ContinuousBatchingPipeline::ContinuousBatchingForEagleDecodingImpl::update_draft_request( + uint64_t request_id, + const EagleGeneratedSequences& candidates) { + // hidden state + // m_model_runner->set_hidden_state(request_id, candidates.begin()->first, hidden_state); + UpdateRequestResult result{0, 0}; + size_t adjust_len = 0; + for (auto& request : m_requests) { + if (request_id != request->get_request_id()) { + continue; + } + + std::vector running_sequences = request->get_running_sequences(); + OPENVINO_ASSERT(running_sequences.size() > 0); + size_t min_generated_tokens, min_candidate_len; + size_t validate_length = 0; + bool pause_due_to_main_not_validated = false; + if (running_sequences.front()->get_generated_len() == 0 && !request->get_num_tokens_to_validate()) { + // for first token append stage + OPENVINO_ASSERT(running_sequences.size() == 1, + "draft model should have only one sequence in request at this point."); + m_sampler->create_logit_processor(request_id, + request->get_sampling_parameters(), + request->get_prompt_ids()); + // auto& logit_processor = m_sampler->get_logit_processor(request_id); + auto candidate = candidates.begin(); + auto sequence = running_sequences.front(); + m_model_runner->set_initial_hidden_state(request_id, + //sequence->get_grouped_id(), + candidate->second.feature_vector); + + auto token_ids = candidate->second.token_ids; + auto log_probs = candidate->second.log_probs; + + for (size_t i = 0; i < token_ids.size(); ++i) { + sequence->append_token(token_ids[i], log_probs[i]); + // logit_processor.register_new_generated_token(token_ids[i]); + // logit_processor.update_generated_len(sequence->get_generated_len()); + } + result.inserted_tokens_cnt = token_ids.size(); + min_generated_tokens = result.inserted_tokens_cnt; + min_candidate_len = result.inserted_tokens_cnt; + } else { + // for generation stage + // at this point, we should have one beam selected, now update draft request of same group id + // in CB mode, the draft may not been validated yet, skip in this case + // TBD: what if eagle tree only produces one candidate branch? 
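+            // The lambda below infers whether the main model has already validated the previous
+            // batch of candidates: any divergence between the draft's running sequences and the
+            // candidates handed to the main model (different sequence count, generated length or
+            // token ids) means validation has happened and the draft can be rewound to the common
+            // prefix; an exact match means the main model has not scheduled validation yet, so the
+            // draft request is paused instead of being updated. For example (hypothetical tokens):
+            // if the draft proposed [a, b, c, d] and the main model kept [a, b, e], the common
+            // prefix length is 2, so two draft tokens are removed before the accepted candidate
+            // tokens are re-inserted.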
+ auto main_validation_finished = [&] () { + if (running_sequences.size() != candidates.size()) { + return true; + } + for (const auto& running_sequence : running_sequences) { + size_t sequence_group_id = running_sequence->get_grouped_id(); + auto candidate_it = candidates.find(sequence_group_id); + + const auto& running_generated_ids = running_sequence->get_generated_ids(); + const auto& candidate_token_ids = candidate_it->second.token_ids; + + if (running_generated_ids.size() != candidate_token_ids.size()) { + return true; + } + + for (size_t i = 0; i < running_generated_ids.size(); ++i) { + if (running_generated_ids[i] != candidate_token_ids[i]) { + return true; + } + } + } + pause_due_to_main_not_validated = true; + return false; + }; + + if (main_validation_finished()) { // update draft only after main validation is done + auto selected_beam = candidates.begin(); + auto& logit_processor = m_sampler->get_logit_processor(request_id); + std::tie(min_generated_tokens, min_candidate_len) = get_prefix_len(running_sequences, candidates); + for (auto& running_sequence : running_sequences) { + if (running_sequence->get_grouped_id() != selected_beam->first) { + running_sequence->set_status(ov::genai::SequenceStatus::FINISHED); + request->remove_sequence(running_sequence->get_id()); + // drop the sequence, as it will not be used anymore + m_scheduler->free_sequence(running_sequence->get_id()); + continue; + } + const auto generated_token_ids = running_sequence->get_generated_ids(); + const auto sequence_generated_len = running_sequence->get_generated_ids().size(); + OPENVINO_ASSERT(sequence_generated_len >= min_generated_tokens); + + result.removed_tokens_cnt = sequence_generated_len - min_generated_tokens; + running_sequence->remove_last_tokens(result.removed_tokens_cnt); + // update feature_vector, remove last removed_tokens_cnt + auto& hidden_state = selected_beam->second.feature_vector; + // update ov::Tensor + ov::Tensor updated_hidden_state = + truncate_hidden_state_from_end(hidden_state, result.removed_tokens_cnt); + adjust_len = request->get_sampling_parameters().eagle_tree_params.tree_depth + 2 - hidden_state.get_shape()[0]; + m_model_runner->set_initial_hidden_state(request_id, + //running_sequence->get_grouped_id(), + updated_hidden_state); + validate_length = updated_hidden_state.get_shape().size() > 0 ? updated_hidden_state.get_shape()[0] : 0; + auto candidate_sequence = candidates.at(running_sequence->get_grouped_id()); + std::vector candidate_token_ids = candidate_sequence.token_ids; + std::vector candidate_token_log_probs = candidate_sequence.log_probs; + candidate_token_ids.resize(min_candidate_len); + candidate_token_log_probs.resize(min_candidate_len); + result.inserted_tokens_cnt = insert_tokens_to_sequence(running_sequence, + candidate_token_ids, + candidate_token_log_probs, + logit_processor, + false); + } + } + } + if (!pause_due_to_main_not_validated) { + // update request context information to provide correct scheduling phase + const size_t num_processed_tokens = request->get_num_processed_tokens(), prompt_len = request->get_prompt_len(), + updated_context_len = min_candidate_len + prompt_len, + max_new_tokens = request->get_max_new_tokens(); + size_t generated_len = request->get_context_len() >= request->get_prompt_len() + ? 
request->get_context_len() - request->get_prompt_len() + 1 + : 0; + if (generated_len > 0 && validate_length > 0) { + // processed token number in draft + request->update_processed_tokens_num(num_processed_tokens - (result.removed_tokens_cnt + adjust_len) + 1 - + validate_length + 1); + } + if (validate_length == 0 && result.inserted_tokens_cnt > 0 && result.removed_tokens_cnt == 0) { + request->set_num_validated_tokens(result.inserted_tokens_cnt); + } else if (validate_length > 0) { + request->set_num_validated_tokens(validate_length - 1); // in generation stage + } + // to pause `draft_model` generation in case of `generated_len >= max_new_tokens - 1` to generate last token by + // `main_model` + if (!m_is_validation_mode_enabled) { + bool pause_gen_status = false; + generated_len -= result.removed_tokens_cnt; + generated_len += result.inserted_tokens_cnt; + if (generated_len >= max_new_tokens - 1 || result.inserted_tokens_cnt == 0) { + pause_gen_status = true; + } + request->pause_generation(pause_gen_status); + } + } else { + request->pause_generation(true); // pause draft model generation, and keep draft as it is, as main has not scheduled validation yet + } + break; + } + + return result; +} + +bool ContinuousBatchingPipeline::ContinuousBatchingForEagleDecodingImpl::is_requests_empty() { + return m_requests.empty(); +} + +size_t ContinuousBatchingPipeline::ContinuousBatchingForEagleDecodingImpl::get_processed_tokens_per_iteration() { + return m_batch_size; +} + +void ContinuousBatchingPipeline::ContinuousBatchingForEagleDecodingImpl::pull_awaiting_requests(bool is_pause_request) { + std::lock_guard lock{m_awaiting_requests_mutex}; + if (is_pause_request) { + for (auto& awaiting_request : m_awaiting_requests) { + awaiting_request->pause_generation(true); + } + } + m_requests.insert(m_requests.end(), m_awaiting_requests.begin(), m_awaiting_requests.end()); + m_awaiting_requests.clear(); +} + +void ContinuousBatchingPipeline::ContinuousBatchingForEagleDecodingImpl::multistep() { + bool to_generate = true; + size_t generated_tokens_cnt = 0; + size_t step_count = 0; + auto depth = m_requests[0]->get_sampling_parameters().eagle_tree_params.tree_depth; + // cycle to generate several tokens per one iteration for speculative decoding case + while (to_generate) { + generated_tokens_cnt++; + step_count++; + ManualTimer multistep_timer("speculative_decoding: multistep()"); + multistep_timer.start(); + step(); + multistep_timer.end(); + + const auto num_generated_tokens = get_processed_tokens_per_iteration(); + auto pipeline_metrics = get_metrics(); + if (num_generated_tokens > 0) { + auto generation_duration = multistep_timer.get_duration_microsec(); + raw_perf_metrics.m_durations.emplace_back(generation_duration); + raw_perf_metrics.m_inference_durations[0] = MicroSeconds(pipeline_metrics.inference_duration); + raw_perf_metrics.m_batch_sizes.emplace_back(num_generated_tokens); + } + + m_model_runner->set_hidden_state_import_needed(false); + to_generate = false; + /*if (step_count >= depth + 1) { + for (auto& request : m_requests) { + auto top_k = m_sampler->get_top_k_selector(request->get_request_id()); + top_k->finalize_eagle2_candidates(); + } + }*/ + for (auto& request : m_requests) { + const auto& sampling_params = request->get_sampling_parameters(); + if (0) { //! 
sampling_params.is_assisting_generation()) { + // generate only one token in case of non speculative decoding + // request->pause_generation(true); + } else if (request->get_num_processed_tokens() == 0 && sampling_params.num_return_sequences > 1) { + request->pause_generation(true); + } else if (request->get_max_new_tokens() == 0) { + request->pause_generation(true); + } else if (request->get_num_processed_tokens() == request->get_prompt_len()) { + request->pause_generation(true); + } // else if (is_stop_token_id_hit_in_sequence_group(request, sampling_params.stop_token_ids)) { + // request->pause_generation(true); + else if (sampling_params.eagle_tree_params.tree_depth > 0 && step_count >= sampling_params.eagle_tree_params.tree_depth + 1) { + request->pause_generation(true); + } + to_generate |= request->can_generate_tokens(); + } + } + m_model_runner->set_hidden_state_import_needed(true); } +} // namespace ov::genai diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp index f81c7f2d37..afd883d037 100644 --- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp +++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp @@ -3,13 +3,13 @@ #pragma once -#include "openvino/genai/continuous_batching_pipeline.hpp" - #include "continuous_batching/pipeline_impl.hpp" +#include "openvino/genai/continuous_batching_pipeline.hpp" #include "speculative_decoding/update_request_structs.hpp" namespace ov::genai { -class ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl : public ContinuousBatchingPipeline::ContinuousBatchingImpl { +class ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl + : public ContinuousBatchingPipeline::ContinuousBatchingImpl { public: ContinuousBatchingForSpeculativeDecodingImpl() = default; @@ -39,4 +39,96 @@ class ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl : void finish_request(SequenceGroup::Ptr request); void _pull_awaiting_requests() override {}; }; -} + +class ContinuousBatchingPipeline::ContinuousBatchingForEagleDecodingImpl + : public ContinuousBatchingPipeline::ContinuousBatchingImpl { +public: + ContinuousBatchingForEagleDecodingImpl() = default; + + ContinuousBatchingForEagleDecodingImpl(const std::shared_ptr& model, + const Tokenizer& tokenizer, + const GenerationConfig& generation_config, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& plugin_config, + bool is_validation_mode_enabled); + + void multistep(); + void finish_request(int64_t request_id = -1); + void pull_awaiting_requests(bool is_pause_request = false); + EagleGeneratedRequests get_generated_requests(); + UpdateRequestResult update_main_request(uint64_t request_id, const EagleGeneratedSequences& candidates); + UpdateRequestResult update_draft_request(uint64_t request_id, const EagleGeneratedSequences& candidates); + void clear_sampler_top_k_selector(uint64_t request_id) { + if (m_sampler) { + m_sampler->clear_top_k_selector(request_id); + } + } + bool is_requests_empty(); + + void set_d2t_for_draft_decoding(std::shared_ptr& d2t) { + if (m_sampler) { + m_sampler->set_d2t_for_decoding(d2t); + } + } + void set_hidden_state_export_needed(bool is_needed) { + if (m_model_runner) { + m_model_runner->set_hidden_state_export_needed(is_needed); + } + } + + void set_hidden_state_import_needed(bool 
is_needed) { + if (m_model_runner) { + m_model_runner->set_hidden_state_import_needed(is_needed); + } + } + + void set_hidden_state_internal_needed(bool is_needed) { + if (m_model_runner) { + m_model_runner->set_hidden_state_internal_needed(is_needed); + } + } + size_t get_processed_tokens_per_iteration(); + + UpdateRequestResult init_request_by_candidate(uint64_t request_id, const GeneratedSequences& candidates); + RawPerfMetrics raw_perf_metrics; +protected: + void finish_request(SequenceGroup::Ptr request); + void _pull_awaiting_requests() override {}; + ov::Tensor truncate_hidden_state_from_end(const ov::Tensor& hidden_state, size_t tokens_to_remove) { + if (hidden_state.get_size() == 0 || tokens_to_remove == 0) { + return hidden_state; + } + + auto shape = hidden_state.get_shape(); + if (shape.size() < 2) { + return hidden_state; + } + + size_t seq_len_dim = 0; + size_t current_seq_len = shape[seq_len_dim]; + + if (tokens_to_remove >= current_seq_len) { + ov::Shape new_shape = shape; + new_shape[seq_len_dim] = 0; + return ov::Tensor(hidden_state.get_element_type(), new_shape); + } + + size_t new_seq_len = current_seq_len - tokens_to_remove; + + ov::Coordinate start_coord(shape.size(), 0); + ov::Coordinate end_coord(shape.size(), 0); + + for (size_t i = 0; i < shape.size(); ++i) { + start_coord[i] = 0; + if (i == seq_len_dim) { + end_coord[i] = new_seq_len; + } else { + end_coord[i] = shape[i]; + } + } + + return ov::Tensor(hidden_state, start_coord, end_coord); + } +}; +} // namespace ov::genai diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index c98e0d7542..ada63d1946 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -365,4 +365,761 @@ std::vector ContinuousBatchingPipeline::SpeculativeDecodingI OPENVINO_ASSERT(main_awaiting_requests.size() == draft_awaiting_requests.size()); return main_awaiting_requests; } +// end of speculative_decoding_impl + +void extract_hidden_state_generic(std::shared_ptr& model, + const std::string& eagle_version, + const std::string& model_type, + const std::string& custom_node_name = "") { + if (eagle_version == "EAGLE2" || model_type == "draft") { // for draft model, we always only need to extract last hidden state + std::cout << model_type << " model - last hidden state extraction" << std::endl; + ov::pass::Manager pm; + std::vector layers = {-1}; // -1 means last hidden layer + pm.register_pass(layers, eagle_version); + pm.run_passes(model); + } else if (eagle_version == "EAGLE3") { + std::cout << model_type << " model - Eagle 3 hidden state extraction" << std::endl; + ov::pass::Manager pm; + /*if idx==len(self.layers)-3 or idx==len(self.layers)//2 or idx==2: + all_hidden_states += (hidden_states,)*/ + std::vector layers = {2, 16, 29}; // need to add check, only support positive values + pm.register_pass(layers, eagle_version); + pm.run_passes(model); + } else { + std::cerr << "Error: " << model_type << " model - Unsupported eagle version: " << eagle_version << std::endl; + } +} + +EagleModelTransform::EagleModelTransform(const std::vector& layers, const std::string& eagle_version) : m_layer_ids(layers), m_eagle_version(eagle_version) { +} + +bool EagleModelTransform::run_on_model(const std::shared_ptr& model) { + //m_model = model; + m_new_parameters.clear(); + m_new_results.clear(); + if (m_layer_ids.size() == 1 && m_layer_ids[0] == -1) { + ov::pass::Manager 
manager; + manager.set_per_pass_validation(false); + manager.register_pass(m_new_parameters, m_new_results, m_eagle_version); + manager.run_passes(model); + + if (!m_new_results.empty()) { + model->add_results(m_new_results); + std::cout << "EagleModelTransform - Added last hidden output " << std::endl; + } + if (m_eagle_version == "EAGLE3") { + ov::pass::Manager manager; + manager.set_per_pass_validation(false); + m_new_parameters = model->get_parameters(); + manager.register_pass(m_new_parameters); + manager.run_passes(model); + + model->add_parameters({m_new_parameters.back()}); + + } + return true; + } else { + ov::pass::Manager manager; + manager.set_per_pass_validation(false); + manager.register_pass(m_layer_ids, m_hidden_layer_outputs); + manager.run_passes(model); + + if (!m_hidden_layer_outputs.empty()) { + std::cout << "EagleModelTransform - extracted intermediate hidden state outputs " << std::endl; + auto concat = std::make_shared(m_hidden_layer_outputs, -1); + concat->set_friendly_name("eagle3_hidden_states_concat"); + + auto result = std::make_shared(concat); + std::string output_name = "last_hidden_state"; + result->output(0).set_names({output_name}); + result->set_friendly_name(output_name); + model->add_results({result}); + + std::cout << "EagleModelTransform - Added concated eagle3 hidden state output" << std::endl; + return true; + } + } + + return false; +} + +EagleInputTransform::EagleInputTransform(std::vector>& params) { + register_matcher( + std::make_shared(ov::pass::pattern::wrap_type(), this->get_type_info().name), + ([¶ms, this](ov::pass::pattern::Matcher& m) { + auto node = m.get_match_root(); + try { + if (apply(node, params)) { + ++applied; // FIXME: For debugging purposes only + return true; + } + } catch (...) { + OPENVINO_ASSERT(false, "EagleTransform failed to apply"); + } + return false; + }) + ); +} +bool EagleInputTransform::apply(NodePtr node, std::vector>& params) { + if (ov::is_type(node)) { + auto matmul_node = ov::as_type_ptr(node); + if (matmul_node->get_friendly_name().find("__module.model.fc/ov_ext::linear/MatMul") == std::string::npos) { // hardcode for now + return false; + } + auto shape = node->get_output_partial_shape(0); + auto internal_hidden_state = std::make_shared(node->get_element_type(), node->get_output_partial_shape(0)); + internal_hidden_state->output(0).set_names({"internal_hidden_state_input"}); + internal_hidden_state->set_friendly_name("internal_hidden_state_input"); + // create new eltwise node to add output of MatMul node and + auto new_eltwise = std::make_shared(internal_hidden_state, matmul_node->output(0)); + ov::replace_node(matmul_node, new_eltwise); + params.push_back(internal_hidden_state); + std::cout << "EagleInputTransform - Added internal hidden state input parameter" << std::endl; + return true; + } + /*if (ov::is_type(node)) { + auto target_node = ov::as_type_ptr(node); + if (target_node->get_friendly_name().find("__module.model.fc/ov_ext::linear/MatMul") == std::string::npos) { // hardcode for now + return false; + } + auto extract_then_branch = [](const NodePtr& node) -> std::shared_ptr { + auto input_target_hidden = node->get_input_node_shared_ptr(0); + + + } + auto then_body = extract_then_branch(matched_node); + auto else_body = extract_else_branch(matched_node); + + auto if_op = std::make_shared(condition); + if_op->set_then_body(then_body); + if_op->set_else_body(else_body); + replace_node(target_node, if_op); + return true; + }*/ + +} + +EagleBaseTransform::EagleBaseTransform(std::vector>& params, 
std::vector>& results, const std::string& eagle_version) : m_eagle_version(eagle_version) { + register_matcher( + std::make_shared(ov::pass::pattern::wrap_type(), this->get_type_info().name), + ([¶ms, &results, this](ov::pass::pattern::Matcher& m) { + auto node = m.get_match_root(); + try { + if (apply(node, params, results)) { + ++applied; // FIXME: For debugging purposes only + return true; + } + } catch (...) { + OPENVINO_ASSERT(false, "EagleTransform failed to apply"); + } + return false; + }) + ); +} + +std::shared_ptr EagleBaseTransform::find_last_hidden_node(const std::shared_ptr& start_node, + std::set& visited_nodes) { + if (visited_nodes.count(start_node.get())) { + return nullptr; + } + + visited_nodes.insert(start_node.get()); + + if (ov::is_type(start_node)) { + // check the input nodes of MatMul, if found Gather node, return the gather node, otherwise ,retrun the matmul node + for (size_t i = 0; i < start_node->get_input_size(); ++i) { + auto input_node = start_node->get_input_node_shared_ptr(i); + if (!input_node) continue; + // rule out logit processing node + if (ov::as_type_ptr(input_node)) { + return input_node; + } + } + return start_node; + } + + for (size_t i = 0; i < start_node->get_input_size(); ++i) { + auto input_node = start_node->get_input_node_shared_ptr(i); + if (!input_node) continue; + + auto result = find_last_hidden_node(input_node, visited_nodes); + if (result) { + return result; + } + } + return nullptr; +} + +std::shared_ptr EagleBaseTransform::find_last_residual_node(const std::shared_ptr& start_node, + std::set& visited_nodes) { + if (visited_nodes.count(start_node.get())) { + return nullptr; + } + + visited_nodes.insert(start_node.get()); + + if (ov::is_type(start_node)) { + // check the input nodes of MatMul, if found Gather node, return the gather node, otherwise ,retrun the matmul node + for (size_t i = 0; i < start_node->get_input_size(); ++i) { + auto input_node = start_node->get_input_node_shared_ptr(i); + if (!input_node) continue; + if (ov::as_type_ptr(input_node)) { + return start_node; // return the Add node itself + } + } + } + + for (size_t i = 0; i < start_node->get_input_size(); ++i) { + auto input_node = start_node->get_input_node_shared_ptr(i); + if (!input_node) continue; + + auto result = find_last_residual_node(input_node, visited_nodes); + if (result) { + return result; + } + } + return nullptr; +} + +std::shared_ptr EagleBaseTransform::find_last_hidden_node(const std::shared_ptr& start_node) { + std::set visited_nodes; + return find_last_hidden_node(start_node, visited_nodes); +} + +std::shared_ptr EagleBaseTransform::find_last_residual_node(const std::shared_ptr& start_node) { + std::set visited_nodes; + return find_last_residual_node(start_node, visited_nodes); +} + +bool EagleBaseTransform::apply(NodePtr node, std::vector>& params, std::vector>& results) { + if (m_eagle_version == "EAGLE2") { + if (ov::is_type(node)) { + // we are applying transformation to the last hidden state, eagle2 mode + NodePtr input_node = node->get_input_node_shared_ptr(0); + if (!input_node) { + return false; + } + auto last_hidden_node = find_last_hidden_node(input_node); + if (!last_hidden_node) { + return false; + } + // + std::shared_ptr non_const_input = nullptr; + size_t non_const_input_idx = 0; + + for (size_t i = 0; i < last_hidden_node->get_input_size(); ++i) { + auto input_node = last_hidden_node->get_input_node_shared_ptr(i); + if (!input_node) continue; + + if (!(ov::is_type(input_node)) && !(ov::is_type(input_node))) { + non_const_input 
= input_node; + non_const_input_idx = i; + break; + } + } + + if (!non_const_input) { + return false; + } + + auto result = std::make_shared(last_hidden_node->input_value(non_const_input_idx)); + std::string output_name = "last_hidden_state"; + result->output(0).set_names({output_name}); + result->set_friendly_name(output_name); + results.push_back(result); + return true; + } + return false; + } else if (m_eagle_version == "EAGLE3") { + // 1. with normalization layer 2. add extra input + if (ov::is_type(node)) { + // we are applying transformation to the last hidden state, eagle2 mode + NodePtr input_node = node->get_input_node_shared_ptr(0); + if (!input_node) { + return false; + } + auto last_residual_node = find_last_residual_node(input_node); + if (!last_residual_node) { + return false; + } + // + auto result = std::make_shared(last_residual_node); + std::string output_name = "last_hidden_state"; + result->output(0).set_names({output_name}); + result->set_friendly_name(output_name); + results.push_back(result); + return true; + } + return false; + } +} + +Eagle3Transform::Eagle3Transform(const std::vector& layers, std::vector>& hidden_state_outputs) : m_layers(layers) { + auto is_target_pattern = [&](const Output& output) { + auto add_node = ov::as_type_ptr(output.get_node_shared_ptr()); + auto add_node_name = add_node->get_friendly_name(); + if (add_node_name.find("self_attn") != std::string::npos) + return false; // Skip self-attention layers + bool layer_matched = false; + for (auto layer_idx : m_layers) { + if (add_node_name.find("layers." + std::to_string(layer_idx) + "/") != std::string::npos) { + layer_matched = true; + break; + } + } + + if (!layer_matched) { + return false; // Skip layers that are not in the specified layers + } + auto input0 = add_node->get_input_node_shared_ptr(1); + if (!input0 || !ov::is_type(input0)) { + return false; + } + auto matmul_node = input0; + auto matmul_input = matmul_node->get_input_node_shared_ptr(0); + if (!matmul_input) { + return false; + } + + bool has_multiply = ov::is_type(matmul_input); // ACT(up) dot gate + return has_multiply; + }; + + auto hidden_layer = ov::pass::pattern::wrap_type(is_target_pattern); + register_matcher(std::make_shared(hidden_layer, "Eagle3Transform::hidden_extraction"), + [&hidden_state_outputs, this](ov::pass::pattern::Matcher& m) { + auto node = m.get_match_root(); + try { + if (apply(node, hidden_state_outputs)) { + ++applied; // FIXME: For debugging purposes only + return true; + } + } catch (...) 
{ + OPENVINO_ASSERT(false, "Eagle3Transform failed to apply"); + } + return false; + } + ); +} + +bool Eagle3Transform::apply(NodePtr node, std::vector>& hidden_state_outputs) { + if (ov::is_type(node)) { + auto add_node = std::dynamic_pointer_cast(node); + if (!add_node) { + return false; + } + hidden_state_outputs.push_back(add_node->output(0)); + return true; + } + return false; +} + + +ContinuousBatchingPipeline::EagleDecodingImpl::EagleDecodingImpl(const ov::genai::ModelDesc& main_model_desc, + const ov::genai::ModelDesc& draft_model_desc, + const std::string& eagle_version) : m_eagle_version(eagle_version) { + auto main_model = main_model_desc.model; + auto draft_model = draft_model_desc.model; + + auto main_scheduler_config = main_model_desc.scheduler_config; + auto main_device = main_model_desc.device; + + utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction); + utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction); + + utils::apply_gather_before_matmul_transformation(main_model); + utils::apply_gather_before_matmul_transformation(draft_model); + + std::string draft_device = draft_model_desc.device.empty() ? main_model_desc.device : draft_model_desc.device; + bool is_draft_scheduler_undefined = draft_model_desc.scheduler_config == SchedulerConfig(); + + ov::genai::SchedulerConfig main_scheduler_config_updated = main_scheduler_config, + draft_scheduler_config = is_draft_scheduler_undefined + ? main_scheduler_config + : draft_model_desc.scheduler_config; + + if (is_draft_scheduler_undefined) { + // split KV cache to 2 caches for main and draft models + auto compute_total_hidden_size = [](const std::shared_ptr& model) -> size_t { + size_t total_hidden_size = 0; + for (const auto& param_ptr : model->get_parameters()) { + const auto& name = param_ptr->get_friendly_name(); + if (name.find("key_cache.") == 0) { + auto pa_op = param_ptr->get_output_target_inputs(0).begin()->get_node(); + const auto& rt_info = pa_op->get_rt_info(); + total_hidden_size += rt_info.at("num_k_heads").as() * rt_info.at("k_head_size").as() + + rt_info.at("num_v_heads").as() * rt_info.at("v_head_size").as(); + } + } + return total_hidden_size; + }; + float main_model_hidden_size = compute_total_hidden_size(main_model), + draft_model_hidden_size = compute_total_hidden_size(draft_model); + auto k = draft_model_hidden_size / (main_model_hidden_size + draft_model_hidden_size); + + // TODO: work with KV blocks as it will be more precise instead of GBs + size_t main_cache_size = std::ceil(main_scheduler_config.cache_size * (1.f - k)), + draft_cache_size = main_scheduler_config.cache_size - main_cache_size; + if (draft_cache_size == 0 && main_cache_size > 0) { + main_cache_size -= (main_cache_size > 1 ? 1 : 0); + draft_cache_size = 1; + } + + main_scheduler_config_updated.cache_size = main_cache_size; + draft_scheduler_config.cache_size = draft_cache_size; + } + + ov::AnyMap draft_properties = + draft_model_desc.properties.empty() ? 
main_model_desc.properties : draft_model_desc.properties; + + // main and draft model can have different tokenizers + // to do: support retokenization: 154103 + Tokenizer main_model_tokenizer = main_model_desc.tokenizer; + Tokenizer draft_model_tokenizer = draft_model_desc.tokenizer; + + // todo: remove this condition after support of CVS-154103 + OPENVINO_ASSERT(are_tokenizers_equal(main_model_tokenizer, draft_model_tokenizer), + "Tokenizers for draft and main models are different!"); + + m_tokenizer = main_model_tokenizer; + // for eagle model, we need to obtain hidden layer state as extra output + // apply transformations needed to run eagle model + // target model: hidden state extraction, draft model: hidden state import , hidden state extraction + // eagle3 specific : dt importing + extract_hidden_state_generic(main_model, m_eagle_version, "main", ""); + extract_hidden_state_generic(draft_model, m_eagle_version, "draft", ""); + ov::serialize(draft_model, "bell_after.xml"); + + // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode + m_main_pipeline = std::make_shared(main_model, + main_model_tokenizer, + main_model_desc.generation_config, + main_scheduler_config_updated, + main_device, + main_model_desc.properties, + true); + m_draft_pipeline = std::make_shared(draft_model, + draft_model_tokenizer, + draft_model_desc.generation_config, + draft_scheduler_config, + draft_device, + draft_properties, + false); + m_perf_metrics = ov::genai::SDPerModelsPerfMetrics(); + m_perf_metrics.raw_metrics.m_inference_durations = {{MicroSeconds(0.0f)}}; +} + +GenerationHandle ContinuousBatchingPipeline::EagleDecodingImpl::add_request( + uint64_t request_id, + const ov::Tensor& input_ids, + ov::genai::GenerationConfig sampling_params, + std::optional token_type_ids) { + std::lock_guard lock(m_draft_generations_mutex); + auto draft_sampling_params = sampling_params; + draft_sampling_params.ignore_eos = true; + draft_sampling_params.stop_strings = {}; + m_draft_generations.insert( + {request_id, m_draft_pipeline->add_request(request_id, input_ids, draft_sampling_params)}); + return m_main_pipeline->add_request(request_id, input_ids, sampling_params); +}; + +GenerationHandle ContinuousBatchingPipeline::EagleDecodingImpl::add_request( + uint64_t request_id, + const std::string& prompt, + ov::genai::GenerationConfig sampling_params) { + std::lock_guard lock(m_draft_generations_mutex); + auto draft_sampling_params = sampling_params; + draft_sampling_params.ignore_eos = true; + draft_sampling_params.stop_strings = {}; + m_draft_generations.insert({request_id, m_draft_pipeline->add_request(request_id, prompt, draft_sampling_params)}); + return m_main_pipeline->add_request(request_id, prompt, sampling_params); +} + +bool ContinuousBatchingPipeline::EagleDecodingImpl::has_non_finished_requests() { + return m_main_pipeline->has_non_finished_requests(); +} + +void ContinuousBatchingPipeline::EagleDecodingImpl::step() { + // this blocks adding new requests during step as it may break coherence between main and draft models + std::lock_guard lock{m_draft_generations_mutex}; + auto& raw_perf_counters = m_perf_metrics.raw_metrics; + auto& main_raw_perf_counters = m_perf_metrics.main_model_metrics.raw_metrics; + + ManualTimer step_timer("speculative_decoding: step()"); + step_timer.start(); + + m_draft_pipeline->pull_awaiting_requests(true); + m_main_pipeline->pull_awaiting_requests(); + + // generate candidates by draft model + ManualTimer 
draft_timer("speculative_decoding: draft_model: multistep()"); + draft_timer.start(); + m_draft_pipeline->multistep(); + draft_timer.end(); + m_sd_metrics.draft_duration += draft_timer.get_duration(); + m_pipeline_metrics = m_main_pipeline->get_metrics(); + + // to generate num_matches statistic + std::map update_sequence_info; + // put candidates to model KV cache + auto draft_generated_requests = m_draft_pipeline->get_generated_requests(); + for (const auto& candidate : m_draft_pipeline->get_generated_requests()) { + auto update_result = m_main_pipeline->update_main_request(candidate.first, candidate.second); + update_sequence_info.insert({{candidate.first, update_result}}); + } + + ManualTimer main_timer("speculative_decoding: main_model: step()"); + main_timer.start(); + m_main_pipeline->step(); + main_timer.end(); + m_sd_metrics.main_duration += main_timer.get_duration(); + m_pipeline_metrics = m_main_pipeline->get_metrics(); + // get logits and last hidden layer + auto main_generated_requests = + m_main_pipeline->get_generated_requests(); // feature extraction is enabled in main pipeline + + for (const auto& checked_sequence : main_generated_requests) { + auto update_result = m_draft_pipeline->update_draft_request(checked_sequence.first, checked_sequence.second); + update_sequence_info[checked_sequence.first].removed_tokens_cnt = update_result.removed_tokens_cnt; + } + + // finish draft request if the generation was completed + for (const auto& draft_request : draft_generated_requests) { + auto request_id = draft_request.first; + if (!main_generated_requests.count(request_id)) { + m_draft_pipeline->finish_request(request_id); + // remove draft_generation_handle from queue + m_draft_generations.erase(request_id); + } + auto updated_seq_info = update_sequence_info[request_id]; + // several prompt phase + if (updated_seq_info.inserted_tokens_cnt == 0) { + continue; + } + float acceptance_rate = + 1 - static_cast(updated_seq_info.removed_tokens_cnt) / updated_seq_info.inserted_tokens_cnt; + m_sd_metrics.update_acceptance_rate(request_id, acceptance_rate * 100); + m_sd_metrics.update_draft_accepted_tokens( + request_id, + (updated_seq_info.inserted_tokens_cnt - updated_seq_info.removed_tokens_cnt)); + } + + step_timer.end(); + + // update perf metrics + const auto num_generated_tokens = m_main_pipeline->get_processed_tokens_per_iteration(); + if (num_generated_tokens > 0) { + auto infer_duration = step_timer.get_duration_microsec(); + + raw_perf_counters.m_token_infer_durations.emplace_back(infer_duration); + raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_duration); + raw_perf_counters.m_new_token_times.emplace_back(main_timer.get_end_time()); + raw_perf_counters.m_batch_sizes.emplace_back(num_generated_tokens); + + auto main_model_gen_duration = main_timer.get_duration_microsec(); + auto m_main_pipeline_metrics = m_main_pipeline->get_metrics(); + main_raw_perf_counters.m_durations.push_back(MicroSeconds(main_model_gen_duration)); + main_raw_perf_counters.m_inference_durations[0] = MicroSeconds(m_main_pipeline_metrics.inference_duration); + main_raw_perf_counters.m_batch_sizes.push_back(num_generated_tokens); // or should be processed + generated + m_sd_metrics.update_generated_len(num_generated_tokens); + } + + if (main_generated_requests.empty() && utils::env_setup_for_print_debug_info()) { + m_sd_metrics.print(true); + m_sd_metrics.clean_up(); + } +} + +ov::Tensor ContinuousBatchingPipeline::EagleDecodingImpl::update_main_input_ids(const ov::Tensor& 
original_input_ids) { + auto shape = original_input_ids.get_shape(); + if (shape.size() == 0 || shape.back() <= 1) { + return ov::Tensor(original_input_ids); + } + + size_t original_length = shape.back(); + size_t new_length = original_length + 1; + + ov::Tensor draft_input_ids(ov::element::i64, {1, new_length}); + + const int64_t* src_data = original_input_ids.data(); + int64_t* dst_data = draft_input_ids.data(); + dst_data[0] = m_tokenizer.get_bos_token_id(); // add BOS token at the beginning + std::copy(src_data, src_data + original_length, dst_data + 1); + + return draft_input_ids; +} + +ov::Tensor ContinuousBatchingPipeline::EagleDecodingImpl::create_draft_input_ids(const ov::Tensor& original_input_ids) { + auto shape = original_input_ids.get_shape(); + if (shape.size() == 0 || shape.back() <= 1) { + return ov::Tensor(original_input_ids); + } + + size_t original_length = shape.back(); + size_t new_length = original_length - 1; + + ov::Tensor draft_input_ids(ov::element::i64, {1, new_length}); + + const int64_t* src_data = original_input_ids.data(); + int64_t* dst_data = draft_input_ids.data(); + + std::copy(src_data + 1, src_data + original_length, dst_data); + + return draft_input_ids; +} + +std::vector ContinuousBatchingPipeline::EagleDecodingImpl::generate( + const std::vector& input_ids, + const std::vector& sampling_params, + const StreamerVariant& streamer, + std::optional> token_type_ids) { + m_perf_metrics = ov::genai::SDPerModelsPerfMetrics(); + m_draft_pipeline->raw_perf_metrics.m_inference_durations = {{ MicroSeconds(0.0f) }}; + + OPENVINO_ASSERT(!has_non_finished_requests(), + "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use " + "ContinuousBatchingPipeline::add_request"); + + OPENVINO_ASSERT(input_ids.size() == sampling_params.size()); + + ManualTimer generate_timer("speculative_decoding: generate()"); + generate_timer.start(); + + // checks that all requests has the same LoRA adapters property value + for (size_t i = 1; i < sampling_params.size(); ++i) { + OPENVINO_ASSERT(sampling_params[i - 1].adapters == sampling_params[i].adapters, + "LoRA adapters value must be the same for all requests"); + } + m_main_pipeline->set_adapters(sampling_params[0].adapters); + m_main_pipeline->set_hidden_state_export_needed(true); + m_draft_pipeline->set_adapters(sampling_params[0].adapters); + m_draft_pipeline->set_hidden_state_export_needed(true); + m_draft_pipeline->set_hidden_state_import_needed(true); + m_draft_pipeline->set_hidden_state_internal_needed(true); + + const auto streamer_ptr = std::make_shared(streamer, m_tokenizer); + + OPENVINO_ASSERT( + !streamer_ptr->has_callback() || input_ids.size() == 1 && (sampling_params[0].is_greedy_decoding() || + (sampling_params[0].is_multinomial() && + sampling_params[0].num_return_sequences == 1) || + sampling_params[0].is_eagle_tree()), + "Currently eagle streaming is possible only with batch size=1 and only for greedy or multinomial decoding"); + + std::vector main_generations; + ov::Tensor new_input_ids; + for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) { + auto new_input_ids = input_ids[request_id]; //update_main_input_ids(input_ids[request_id]); + OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch."); + auto main_sampling_params = sampling_params[request_id]; + // clear eagle tree parameters for main pipeline + if (main_sampling_params.is_eagle_tree()) { + main_sampling_params.eagle_tree_params = {}; + } + 
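+        // The same request id is registered with both pipelines: the main pipeline receives the
+        // original prompt, while the draft pipeline receives the prompt shifted left by one token
+        // (see create_draft_input_ids above) together with ignore_eos, so the draft keeps proposing
+        // candidates until the main model decides to stop. For a hypothetical prompt
+        // [t0, t1, t2, t3] the draft would therefore be fed [t1, t2, t3].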
main_generations.push_back( + m_main_pipeline->add_request(request_id, new_input_ids, main_sampling_params)); + + auto draft_sampling_params = sampling_params[request_id]; + // set the parameters do not stop draft generation without stopping of the same request for main pipeline + draft_sampling_params.ignore_eos = true; + draft_sampling_params.stop_strings = {}; + + // remove first token from input_ids to create draft_input_ids + ov::Tensor draft_input_ids = create_draft_input_ids(new_input_ids); + + std::lock_guard lock(m_draft_generations_mutex); + m_draft_generations.insert( + {request_id, m_draft_pipeline->add_request(request_id, draft_input_ids, draft_sampling_params)}); + } + auto all_requests = get_awaiting_requests(); + + GenerationHandle& generation = main_generations.at(0); + + streamer_ptr->start(); + + while (has_non_finished_requests()) { + try { + step(); + } catch (...) { + drop_requests(); // remove all requests from pipeline state in case of exception + streamer_ptr->end(); + std::rethrow_exception(std::current_exception()); + } + stream_tokens(streamer_ptr, generation); + } + + // waiting for competion of streaming + streamer_ptr->end(); + + OPENVINO_ASSERT(is_requests_empty(), + "Internal error: current request is supposed to be dropped within step() function as completed"); + + std::vector results; + results.reserve(all_requests.size()); + + m_perf_metrics.draft_model_metrics.raw_metrics = m_draft_pipeline->raw_perf_metrics; + generate_timer.end(); + + for (size_t request_id = 0; request_id < all_requests.size(); ++request_id) { + const auto& request = all_requests[request_id]; + auto sampling_params = request->get_sampling_parameters(); + const auto& sequences = request->get_finished_sequences(); + m_draft_pipeline->clear_sampler_top_k_selector(request->get_request_id()); + size_t num_outputs = std::min(sampling_params.num_return_sequences, sequences.size()); + + EncodedGenerationResult result; + result.m_request_id = request_id; + result.m_generation_ids.resize(num_outputs); + result.m_scores.resize(num_outputs); + result.m_status = request->get_generation_stream()->get_status(); + + for (size_t i = 0; i < num_outputs; ++i) { + const auto& sequence = sequences[i]; + const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) + : sequence->get_cumulative_log_prob(); + const auto& generated_ids = sequence->get_generated_ids(); + + if (sampling_params.echo) { + result.m_generation_ids[i] = request->get_prompt_ids(); + } + std::copy(generated_ids.begin(), generated_ids.end(), std::back_inserter(result.m_generation_ids[i])); + result.m_scores[i] = score; + } + + result.m_status = main_generations[request_id]->get_status(); + + // The same perf metrics for each sequence, only tokenization/detokenization will differ. 
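+        // generate_durations is re-populated for every request so that each EncodedGenerationResult
+        // reports the wall-clock time of this generate() call; evaluate_statistics() then derives
+        // the aggregate metrics from the raw counters collected during step().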
+ m_perf_metrics.raw_metrics.generate_durations.clear(); + m_perf_metrics.raw_metrics.generate_durations.emplace_back(generate_timer.get_duration_microsec()); + m_perf_metrics.num_input_tokens = request->get_prompt_len(); + m_perf_metrics.evaluate_statistics(generate_timer.get_start_time()); + + result.perf_metrics = m_perf_metrics; + result.extended_perf_metrics = std::make_shared(m_perf_metrics); + results.push_back(std::move(result)); + } + + OPENVINO_ASSERT(results.size() == input_ids.size()); + return results; +} + +SpeculativeDecodingMetrics ContinuousBatchingPipeline::EagleDecodingImpl::get_speculative_decoding_metrics() { + return m_sd_metrics; +}; + +void ContinuousBatchingPipeline::EagleDecodingImpl::drop_requests() { + m_draft_pipeline->finish_request(); + m_main_pipeline->finish_request(); +} + +bool ContinuousBatchingPipeline::EagleDecodingImpl::is_requests_empty() { + return m_main_pipeline->is_requests_empty() && m_draft_pipeline->is_requests_empty(); +} + +std::vector ContinuousBatchingPipeline::EagleDecodingImpl::get_awaiting_requests() { + auto main_awaiting_requests = m_main_pipeline->get_awaiting_requests(); + auto draft_awaiting_requests = m_draft_pipeline->get_awaiting_requests(); + OPENVINO_ASSERT(main_awaiting_requests.size() == draft_awaiting_requests.size()); + return main_awaiting_requests; } +} // namespace ov::genai diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp index 026d592569..06ff76187a 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp @@ -9,6 +9,19 @@ #include "speculative_decoding/speculative_decoding_metrics.hpp" #include "openvino/genai/speculative_decoding/perf_metrics.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/result.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/pass/pattern/matcher.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/manager.hpp" + namespace ov::genai { struct ModelDesc { @@ -74,4 +87,113 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat SpeculativeDecodingMetrics get_speculative_decoding_metrics(); }; +class ContinuousBatchingPipeline::EagleDecodingImpl : public ContinuousBatchingPipeline::IContinuousBatchingPipeline { +protected: + std::shared_ptr m_main_pipeline, m_draft_pipeline; // bell: see if we can reuse this class impl for eagle pipelines + // Metrics + SpeculativeDecodingMetrics m_sd_metrics; + ov::genai::SDPerModelsPerfMetrics m_perf_metrics; + ov::Tensor hiddenstates_tensor; // Tensor to store hidden states for draft model + // Mutex protecting access to m_draft_generations, so add_request and step methods can be called from different threads + std::mutex m_draft_generations_mutex; + std::map m_draft_generations; + + void drop_requests(); + void initialize_tree(); + bool is_requests_empty(); + std::vector get_awaiting_requests(); + ov::Tensor create_draft_input_ids(const ov::Tensor& original_input_ids); + ov::Tensor update_main_input_ids(const ov::Tensor& original_input_ids); + std::string m_eagle_version; + +public: + EagleDecodingImpl(const ov::genai::ModelDesc& main_model_desc, const ov::genai::ModelDesc& draft_model_desc, 
const std::string& eagle_version); + + GenerationHandle add_request(uint64_t request_id, + const ov::Tensor& input_ids, + ov::genai::GenerationConfig sampling_params, + std::optional token_type_ids = std::nullopt) override; + GenerationHandle add_request(uint64_t request_id, + const std::string& prompt, + ov::genai::GenerationConfig sampling_params) override; + + bool has_non_finished_requests() override; + + void fill_hidden_states(const ov::Tensor& hidden_states) { + hiddenstates_tensor = hidden_states; + } + void set_d2t_for_draft_decoding(std::shared_ptr& d2t_tensor) { + m_draft_pipeline->set_d2t_for_draft_decoding(d2t_tensor); + }; + void step() override; + + std::vector + generate(const std::vector& input_ids, + const std::vector& sampling_params, + const StreamerVariant& streamer, + std::optional> token_type_ids = std::nullopt) override; + + SpeculativeDecodingMetrics get_speculative_decoding_metrics(); +}; + +using NodePtr = std::shared_ptr; +using namespace ov::op; + +class EagleBaseTransform : public ov::pass::MatcherPass { +public: + using NodePtr = std::shared_ptr; + OPENVINO_MATCHER_PASS_RTTI("EagleBaseTransform"); + EagleBaseTransform(std::vector>& params, std::vector>& results, const std::string& eagle_version = "EAGLE3"); + + ~EagleBaseTransform() = default; + +private: + bool apply(NodePtr node, std::vector>& params, std::vector>& results); + size_t applied = 0; + std::string m_eagle_version; + std::shared_ptr find_last_hidden_node(const std::shared_ptr& start_node); + std::shared_ptr find_last_hidden_node(const std::shared_ptr& start_node, + std::set& visited_nodes); + std::shared_ptr find_last_residual_node(const std::shared_ptr& start_node); + std::shared_ptr find_last_residual_node(const std::shared_ptr& start_node, + std::set& visited_nodes); +}; +class EagleInputTransform : public ov::pass::MatcherPass { // eagle3 specific for draft model +public: + using NodePtr = std::shared_ptr; + OPENVINO_MATCHER_PASS_RTTI("EagleInputTransform"); + EagleInputTransform(std::vector>& params); + + ~EagleInputTransform() = default; + +private: + bool apply(NodePtr node, std::vector>& params); + size_t applied = 0; +}; +class Eagle3Transform : public ov::pass::MatcherPass { +public: + using NodePtr = std::shared_ptr; + OPENVINO_MATCHER_PASS_RTTI("Eagle3Transform"); + Eagle3Transform(const std::vector& layers, std::vector>& hidden_state_outputs); + + ~Eagle3Transform() = default; + +private: + bool apply(NodePtr node, std::vector>& hidden_state_outputs); + size_t applied = 0; + std::vector m_layers; // layers to be abstracted +}; + +class EagleModelTransform : public ov::pass::ModelPass { +public: + EagleModelTransform(const std::vector& layer_ids, const std::string& eagle_version = "EAGLE3"); + bool run_on_model(const std::shared_ptr& model) override; + +private: + const std::vector m_layer_ids; + std::string m_eagle_version; + std::vector> m_new_results; + std::vector> m_new_parameters; + std::vector> m_hidden_layer_outputs; +}; } diff --git a/src/cpp/src/speculative_decoding/update_request_structs.hpp b/src/cpp/src/speculative_decoding/update_request_structs.hpp index 68f79268f5..35d9c1bc0d 100644 --- a/src/cpp/src/speculative_decoding/update_request_structs.hpp +++ b/src/cpp/src/speculative_decoding/update_request_structs.hpp @@ -5,7 +5,13 @@ #include #include - +#include +#include +#include +#include +#include +#include +#include namespace ov::genai { struct GeneratedSequence { std::vector token_ids; @@ -30,4 +36,17 @@ using GeneratedSequences = std::map; // { request_id : 
generated_sequence } using GeneratedRequests = std::map; +class EagleGeneratedSequence : public GeneratedSequence { +public: + ov::Tensor feature_vector; + + EagleGeneratedSequence(const std::vector& generated_token_ids, + const std::vector& generated_log_probs, + const ov::Tensor& generated_hidden_states) : + GeneratedSequence(generated_token_ids, generated_log_probs), + feature_vector(generated_hidden_states) {}; +}; + +using EagleGeneratedSequences = std::map; +using EagleGeneratedRequests = std::map; } diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 35e4ad3552..7479c0b8ab 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -10,7 +10,6 @@ #include "openvino/genai/llm_pipeline.hpp" #include "openvino/genai/visual_language/pipeline.hpp" #include "openvino/runtime/core.hpp" - #include "openvino/genai/generation_handle.hpp" #include "visual_language/processor_config.hpp" @@ -78,6 +77,7 @@ void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& const std::string STREAMER_ARG_NAME = "streamer"; const std::string CONFIG_ARG_NAME = "generation_config"; const std::string DRAFT_MODEL_ARG_NAME = "draft_model"; +const std::string EAGLE_MODE = "eagle_mode"; template Config from_config_json_if_exists(const std::filesystem::path& models_path, const char config_name[] = "generation_config.json") { diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index 98e288835b..db58fd4a3e 100644 --- a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -247,6 +247,13 @@ void init_generation_config(py::module_& m) { } ); + py::class_(m, "Eagle_PARAMS", "eagle tree parameters for assisting generation") + .def(py::init<>()) + .def_readwrite("branching_factor", &GenerationConfig::eagle_params::branching_factor) + .def_readwrite("tree_depth", &GenerationConfig::eagle_params::tree_depth) + .def_readwrite("total_tokens", &GenerationConfig::eagle_params::total_tokens); + + // Binding for GenerationConfig py::class_(m, "GenerationConfig", generation_config_docstring) .def(py::init(), py::arg("json_path"), "path where generation_config.json is stored") @@ -276,6 +283,7 @@ void init_generation_config(py::module_& m) { .def_readwrite("logprobs", &GenerationConfig::logprobs) .def_readwrite("assistant_confidence_threshold", &GenerationConfig::assistant_confidence_threshold) .def_readwrite("num_assistant_tokens", &GenerationConfig::num_assistant_tokens) + .def_readwrite("eagle_config", &GenerationConfig::eagle_tree_params, "Eagle tree parameters for assisting generation") .def_readwrite("max_ngram_size", &GenerationConfig::max_ngram_size) .def_readwrite("include_stop_str_in_output", &GenerationConfig::include_stop_str_in_output) .def_readwrite("stop_token_ids", &GenerationConfig::stop_token_ids) diff --git a/tests/cpp/sampler_eagle_validate.cpp b/tests/cpp/sampler_eagle_validate.cpp new file mode 100644 index 0000000000..9f3737c03e --- /dev/null +++ b/tests/cpp/sampler_eagle_validate.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "sampling/sampler.hpp" +#include "openvino/genai/generation_config.hpp" + + +using namespace ov::genai; + +// main pipeline greedy sampling +// eagle pipeline beam seach for top k tree +TEST(SamplerValidationMode, eagle2_mode_initial) { + auto sampling_config = ov::genai::greedy(); + + // create sequence group with prompt [0, 1, 2, 3, 4] + std::vector input_vector{0, 1, 2, 3, 4}; + ov::Tensor 
input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data()); + std::vector sequence_groups{ + SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32)), + }; + + // to emulate processed prompt and add next token [ 0 ] + sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); + sequence_groups.front()->update_processed_tokens_num(5); + + // append candidates, 3 branchs from draft model, a) seq group id 0: [ 2, 3, 4 ] b) seq group id: 3 [2, 3] c) seq group id : 5 [2, 1, 4] + // simulate the generated results from the draft model, where each branch has its own sequence id + std::map> branches_to_validate = { + {0, {2, 3, 4}}, + {3, {2, 3}}, + {5, {2, 1, 4}} + }; + for (const auto& branch : branches_to_validate) { + if (branch.first == sequence_groups.front()->get_sequences().front()->get_grouped_id()) { + // reuse the first sequence + for (auto iter : branch.second) { + sequence_groups.front()->get_sequences().front()->append_token(iter, 1.f); + } + //sequence_groups.front()->get_sequences().front()->set_num_validated_tokens(branch.second.size()); + } else { + // other branches are created as new sequences + auto sequence = Sequence::create(branch.first, SequenceGroupType::TOKENS, 0); + //sequence->set_grouped_id(branch.first); + sequence->append_token(0, 1.f); // also append the first token + for (auto iter : branch.second) { + sequence->append_token(iter, 1.f); + } + //sequence->set_num_validated_tokens(branch.second.size()); + sequence_groups.front()->add_sequence(sequence); + } + } + sequence_groups.front()->set_num_validated_tokens(8); + const auto num_scheduled_tokens = sequence_groups.front()->get_num_available_tokens_for_batching(); + sequence_groups.front()->schedule_tokens(num_scheduled_tokens); + /* + std::vector num_validated_tokens = 3; + for (size_t i = 1; i <= num_validated_tokens; ++i) { + sequence_groups.front()->get_sequences().front()->append_token(i + 1, 1.f); + } + + // generated sequence [0, 1, 2, 3, 4] -> [0, 2, 3, 4] + sequence_groups.front()->set_num_validated_tokens(num_validated_tokens); + const auto num_scheduled_tokens = sequence_groups.front()->get_num_available_tokens_for_batching(); + ASSERT_EQ(num_scheduled_tokens, num_validated_tokens + 1); + sequence_groups.front()->schedule_tokens(num_scheduled_tokens); + */ + //create ref tensor : to generate candidates + next token + /*std::vector logits = {{0, 1.f, 0, 0, 0, 0, 0, 1.f, 0, 0, 0, 0, 0, 1.f, 0, 0, 0, 0, 0, 1.f}, + {0, 1.f, 0, 0, 0, 0, 0, 1.f, 0, 0, 0, 0, 0, 1.f, 0}, + {0, 1.f, 0, 0, 0, 0, 0, 1.f, 0, 0, 0, 0, 0, 1.f, 0, 0, 0, 0, 0, 1.f}}; + + // shape 4 tokens + 3 batch + 5 vocab + ov::Tensor gen_input_ids(ov::element::f32, ov::Shape{4, 1, 5}, logits.data()); + + Sampler sampler; + sampler.sample(sequence_groups, gen_input_ids, true); + + TokenIds actual = sequence_groups.front()->get_sequences().front()->get_generated_ids(), + expected{0, 1}; + ASSERT_EQ(sequence_groups.front()->get_sequences().front()->get_generated_ids(), expected);*/ +} \ No newline at end of file diff --git a/tools/continuous_batching/accuracy/continuous_batching_accuracy.cpp b/tools/continuous_batching/accuracy/continuous_batching_accuracy.cpp index 55acaf1269..531743045a 100644 --- a/tools/continuous_batching/accuracy/continuous_batching_accuracy.cpp +++ b/tools/continuous_batching/accuracy/continuous_batching_accuracy.cpp @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) try { const bool dynamic_split_fuse = result["dynamic_split_fuse"].as(); const std::string models_path = 
     const std::string models_path = result["model"].as();
     const std::string device = result["device"].as();
-    const bool use_prefix = result["use_prefix"].as();
+    bool use_prefix = result["use_prefix"].as();
 
     std::string prefix_str =
         "You are an advanced language model designed to assist users by providing accurate, "
@@ -53,7 +53,7 @@ int main(int argc, char* argv[]) try {
 
     // create dataset
     std::vector prompt_examples = {
-        "What is OpenVINO?",
+        "what is openvino",
         "How are you?",
         "What is your name?",
         "Tell me something about Canada",
@@ -61,18 +61,19 @@ int main(int argc, char* argv[]) try {
     };
 
     std::vector sampling_params_examples {
-        ov::genai::beam_search(),
-        ov::genai::greedy(),
+        //ov::genai::beam_search(),
+        //ov::genai::greedy(),
         ov::genai::multinomial(),
     };
 
     std::vector prompts(num_prompts);
     std::vector sampling_params(num_prompts);
-
+    use_prefix = false;
     for (size_t request_id = 0; request_id < num_prompts; ++request_id) {
         prompts[request_id] = use_prefix ? prefix_str + prompt_examples[request_id % prompt_examples.size()]
                                          : prompt_examples[request_id % prompt_examples.size()];
         sampling_params[request_id] = sampling_params_examples[request_id % sampling_params_examples.size()];
+        sampling_params[request_id].num_return_sequences = 1;  // only 1 return sequence is supported
     }
 
     ov::genai::SchedulerConfig scheduler_config;
@@ -83,7 +84,7 @@ int main(int argc, char* argv[]) try {
     // mode - vLLM or dynamic_split_fuse
     scheduler_config.dynamic_split_fuse = dynamic_split_fuse;
     // vLLM specific params
-    scheduler_config.max_num_seqs = 2;
+    scheduler_config.max_num_seqs = 3;
     scheduler_config.enable_prefix_caching = use_prefix;
 
     // It's possible to construct a Tokenizer from a different path.
diff --git a/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp b/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp
index 0b0797ddf9..c41f0a95f7 100644
--- a/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp
+++ b/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp
@@ -20,7 +20,7 @@ std::vector get_spec_decoding_generation_config_exa
         generation_config_greedy_constant.num_assistant_tokens = 5;
     }
 
-    ov::genai::GenerationConfig generation_config_multinomial_constant = ov::genai::multinomial();
+    ov::genai::GenerationConfig generation_config_multinomial_constant = ov::genai::greedy();
     {
         generation_config_multinomial_constant.num_assistant_tokens = 5;
         generation_config_multinomial_constant.num_return_sequences = 1;
@@ -31,7 +31,7 @@ std::vector get_spec_decoding_generation_config_exa
         generation_config_greedy_dynamic.assistant_confidence_threshold = 0.8f;
     }
 
-    ov::genai::GenerationConfig generation_config_multinomial_dynamic = ov::genai::multinomial();
+    ov::genai::GenerationConfig generation_config_multinomial_dynamic = ov::genai::greedy();
     {
         generation_config_multinomial_dynamic.assistant_confidence_threshold = 0.8f;
     }
@@ -40,7 +40,7 @@ std::vector get_spec_decoding_generation_config_exa
         generation_config_greedy_constant,
         generation_config_multinomial_constant,
         generation_config_greedy_dynamic,
-        generation_config_multinomial_dynamic,
+        //generation_config_multinomial_dynamic,
     };
 }
 
@@ -104,7 +104,7 @@ int main(int argc, char* argv[]) try {
     // mode - vLLM or dynamic_split_fuse
     scheduler_config.dynamic_split_fuse = dynamic_split_fuse;
     // vLLM specific params
-    scheduler_config.max_num_seqs = 2;
+    scheduler_config.max_num_seqs = 3;
 
     ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config, device, {ov::genai::draft_model(draft_models_path, device)});
     std::vector generation_results = pipe.generate(prompts, generation_config);
diff --git a/tools/llm_bench/README_EAGLE3.md b/tools/llm_bench/README_EAGLE3.md
new file mode 100755
index 0000000000..197d1ff71c
--- /dev/null
+++ b/tools/llm_bench/README_EAGLE3.md
@@ -0,0 +1,32 @@
+# SPECULATIVE DECODING for EAGLE3
+
+### 1. Prepare Python Virtual Environment for LLM Benchmarking
+
+``` bash
+python3 -m venv ov-llm-bench-env
+source ov-llm-bench-env/bin/activate
+pip install --upgrade pip
+
+git clone https://github.com/openvinotoolkit/openvino.genai.git
+cd openvino.genai/tools/llm_bench
+pip install -r requirements.txt
+```
+
+### 2. Get the Main and Draft Models in OpenVINO IR Format
+The main and draft models downloaded from Hugging Face need to be converted to OpenVINO IR format.
+For now, please get the Llama 3.1 8B EAGLE3 main and draft models from the server below (password: openvino):
+``` bash
+scp -r openvino-ci-97@10.67.108.171:~/bell/speculative_decoding/eagle3/llama-3.1-8b-instruct-ov-int4/ your_path_to_main/
+scp -r openvino-ci-97@10.67.108.171:~/bell/speculative_decoding/eagle3/EAGLE3-LLaMA3.1-instruct-8B-ov-int4/ your_path_to_draft/
+```
+
+### 3. Benchmark the LLM Model Using EAGLE3 Speculative Decoding
+
+To benchmark the performance of the LLM, use the following command:
+
+python benchmark.py -m /home/openvino-ci-97/bell/speculative_decoding/eagle3/llama-3.1-8b-instruct-ov-int4 -d GPU -pf /home/openvino-ci-97/bell/openvino.genai/tools/llm_bench/test.jsonl -ic 129 --draft_model /home/openvino-ci-97/bell/speculative_decoding/eagle3/EAGLE3-LLaMA3.1-instruct-8B-ov-int4 --draft_device GPU --eagle_config ./eagle.config --disable_prompt_permutation --apply_chat_template
+
+The content of eagle.config is as follows:
+{"eagle_mode":"EAGLE3", "branching_factor": 1, "tree_depth": 4, "total_tokens": 6}
+
+To tune for better performance, keep branching_factor fixed at 1 and adjust tree_depth; total_tokens should be set to tree_depth + 2 for now (for example, starting from the config above, adjust tree_depth to 5 and set total_tokens to 7).
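To make the tuning note in README_EAGLE3.md concrete, here is a small sketch (not part of the patch) that writes the suggested tuned configuration to eagle.config; the key names and values are taken directly from the README above, while the output path is an assumption:

```python
# Sketch: write the tuned eagle.config suggested in README_EAGLE3.md
# (branching_factor stays 1, tree_depth raised to 5, total_tokens = tree_depth + 2).
import json

tuned_config = {
    "eagle_mode": "EAGLE3",
    "branching_factor": 1,   # keep fixed at 1 for now
    "tree_depth": 5,         # tuned up from 4
    "total_tokens": 5 + 2,   # tree_depth + 2 = 7
}
with open("eagle.config", "w") as f:  # path relative to tools/llm_bench is assumed
    json.dump(tuned_config, f)
```

benchmark.py is then pointed at this file via `--eagle_config ./eagle.config`, as in the command above.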
diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py
index 36ab082558..7acf1a20df 100644
--- a/tools/llm_bench/benchmark.py
+++ b/tools/llm_bench/benchmark.py
@@ -164,6 +164,8 @@ def get_argprser():
     parser.add_argument("--draft_device", required=False, default=None, help="Inference device for Speculative decoding of draft model")
     parser.add_argument("--draft_cb_config", required=False, default=None,
                         help="Path to file with Continuous Batching Scheduler settings or dict for Speculative decoding of draft model")
+    parser.add_argument("--eagle_config", required=False, default=None,
+                        help="Path to file with eagle3 settings or dict for Speculative decoding of draft model")
     parser.add_argument("--num_assistant_tokens", required=False, default=None,
                         help="Config option num_assistant_tokens for Speculative decoding and Prompt Lookup decoding", type=int)
     parser.add_argument("--assistant_confidence_threshold", required=False, default=None,
diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py
index 6658b7d8fc..ac44d95b14 100644
--- a/tools/llm_bench/llm_bench_utils/model_utils.py
+++ b/tools/llm_bench/llm_bench_utils/model_utils.py
@@ -214,7 +214,10 @@ def analyze_args(args):
     model_args['num_assistant_tokens'] = args.num_assistant_tokens
     model_args['assistant_confidence_threshold'] = args.assistant_confidence_threshold
     model_args['max_ngram_size'] = args.max_ngram_size
-
+    eagle_config = None
+    if args.eagle_config:
+        eagle_config = get_config(args.eagle_config)
+    model_args['eagle_config'] = eagle_config
     model_args['speaker_embeddings'] = None
     if args.speaker_embeddings:
         model_args['speaker_embeddings'] = get_speaker_embeddings(args.speaker_embeddings)
diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
index 9fdb82567e..271081e2e7 100644
--- a/tools/llm_bench/llm_bench_utils/ov_utils.py
+++ b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -226,6 +226,7 @@ def create_genai_text_gen_model(model_path, device, ov_config, memory_monitor, *
     config = {}
     draft_model_path = kwargs.get("draft_model", '')
     cb_config = kwargs.get("cb_config")
+    eagle_config = kwargs.get("eagle_config", None)
     use_streamer_metrics = False
     if cb_config is not None:
         config["scheduler_config"] = get_scheduler_config_genai(cb_config)
@@ -240,6 +241,10 @@ def create_genai_text_gen_model(model_path, device, ov_config, memory_monitor, *
         draft_model_load_kwargs = {'scheduler_config': get_scheduler_config_genai(kwargs.get("draft_cb_config"), config_name="draft CB config")}\
             if kwargs.get("draft_cb_config") is not None else {}
         config['draft_model'] = openvino_genai.draft_model(draft_model_path, draft_device.upper(), **draft_model_load_kwargs)
+        if eagle_config:
+            log.info("Eagle decoding is activated; defaulting to EAGLE3 if eagle_mode is not specified")
+            eagle_mode = kwargs.get("eagle_mode", "EAGLE3")
+            config['eagle_mode'] = eagle_mode
 
     if kwargs.get('max_ngram_size') and kwargs.get('num_assistant_tokens'):
         log.info("Prompt Lookup decoding is activated")
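For orientation, the llm_bench plumbing above roughly amounts to the following standalone sketch. `openvino_genai.draft_model` and `LLMPipeline` are the existing public API used elsewhere in this patch, whereas passing `eagle_mode` as an extra pipeline property follows the `EAGLE_MODE` key introduced in src/cpp/src/utils.hpp and is an assumption of this patch rather than documented behavior; model paths are placeholders:

```python
# Sketch only: how the eagle_config/eagle_mode plumbing above is expected to
# reach the pipeline. Paths are placeholders; treating "eagle_mode" as a pipeline
# property is an assumption based on the EAGLE_MODE key added in utils.hpp.
import openvino_genai

pipeline_props = {
    "draft_model": openvino_genai.draft_model("your_path_to_draft", "GPU"),
    "eagle_mode": "EAGLE3",
}
pipe = openvino_genai.LLMPipeline("your_path_to_main", "GPU", **pipeline_props)
```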
diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index 611464b586..e4ad8d016d 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -284,7 +284,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     )
 
     tokenization_start = time.perf_counter()
-    input_data = tokenizer.encode(input_text_list)
+    input_data = tokenizer.encode(input_text_list, add_special_tokens=False)
     tokenization_end = time.perf_counter()
     tokenization_time = [(tokenization_end - tokenization_start) * 1000]
@@ -339,6 +339,13 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
         if args.get('assistant_confidence_threshold', None):
             gen_config.assistant_confidence_threshold = float(args['assistant_confidence_threshold'])
             config_info += f" assistant_confidence_threshold {gen_config.assistant_confidence_threshold}"
+        if args.get("eagle_config", None):
+            gen_config.eagle_config.branching_factor = args.get("eagle_config").get("branching_factor")
+            config_info += f" -- eagle3 branching_factor {gen_config.eagle_config.branching_factor}"
+            gen_config.eagle_config.tree_depth = args.get("eagle_config").get("tree_depth")
+            config_info += f" tree_depth {gen_config.eagle_config.tree_depth}"
+            gen_config.eagle_config.total_tokens = args.get("eagle_config").get("total_tokens")
+            config_info += f" total_tokens {gen_config.eagle_config.total_tokens}"
         log.info(config_info)
     if args.get('max_ngram_size') and args.get('num_assistant_tokens'):
         config_info = "Prompt Lookup decoding config: "
@@ -356,7 +363,29 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
         generated_text = tokenizer.decode(generated_tokens)
         detokenization_end = time.perf_counter()
         tokenization_time.append((detokenization_end - detokenization_start) * 1000)
+    extended_perf_metrics = generation_result.extended_perf_metrics
+    if extended_perf_metrics:
+        main_model_metrics = extended_perf_metrics.main_model_metrics
+        print("MAIN MODEL")
+        print(f"  Generate time: {main_model_metrics.get_generate_duration().mean:.2f} ms")
+        print(f"  TTFT: {main_model_metrics.get_ttft().mean:.2f} ± {main_model_metrics.get_ttft().std:.2f} ms")
+        print(f"  TTST: {main_model_metrics.get_ttst().mean:.2f} ± {main_model_metrics.get_ttst().std:.2f} ms/token")
+        print(f"  TPOT: {main_model_metrics.get_tpot().mean:.2f} ± {main_model_metrics.get_tpot().std:.2f} ms/iteration")
+        print(f"  AVG Latency: {main_model_metrics.get_latency().mean:.2f} ± {main_model_metrics.get_latency().std:.2f} ms/token")
+        print(f"  Num generated token: {main_model_metrics.get_num_generated_tokens()} tokens")
+        print(f"  Total iteration number: {len(main_model_metrics.raw_metrics.m_durations)}")
+        print(f"  Num accepted token: {extended_perf_metrics.get_num_accepted_tokens()} tokens")
+        draft_model_metrics = extended_perf_metrics.draft_model_metrics
+        print("DRAFT MODEL")
+        print(f"  Generate time: {draft_model_metrics.get_generate_duration().mean:.2f} ms")
+        print(f"  TTFT: {draft_model_metrics.get_ttft().mean:.2f} ms")
+        print(f"  TTST: {draft_model_metrics.get_ttst().mean:.2f} ms/token")
+        print(f"  TPOT: {draft_model_metrics.get_tpot().mean:.2f} ± {draft_model_metrics.get_tpot().std:.2f} ms/token")
+        print(f"  AVG Latency: {draft_model_metrics.get_latency().mean:.2f} ± {draft_model_metrics.get_latency().std:.2f} ms/iteration")
+        print(f"  Num generated token: {draft_model_metrics.get_num_generated_tokens()} tokens")
+        print(f"  Total iteration number: {len(draft_model_metrics.raw_metrics.m_durations)}")
+        print()
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
         mem_consumption.stop_and_collect_data(f"{'P' + str(num) if num > 0 else 'warm-up'}_{proc_id}")
         max_rss_mem_consumption, max_rss_mem_increase, max_sys_mem_consumption, max_sys_mem_increase = mem_consumption.get_data()
@@ -504,6 +533,13 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
         if args.get("assistant_confidence_threshold", None):
             gen_config.assistant_confidence_threshold = float(args["assistant_confidence_threshold"])
             config_info += f'assistant_confidence_threshold {args["assistant_confidence_threshold"]}'
+        if args.get("eagle_config", None):
+            gen_config.eagle_config.branching_factor = args.get("eagle_config").get("branching_factor")
+            config_info += f" -- eagle3 branching_factor {gen_config.eagle_config.branching_factor}"
+            gen_config.eagle_config.tree_depth = args.get("eagle_config").get("tree_depth")
+            config_info += f" tree_depth {gen_config.eagle_config.tree_depth}"
+            gen_config.eagle_config.total_tokens = args.get("eagle_config").get("total_tokens")
+            config_info += f" total_tokens {gen_config.eagle_config.total_tokens}"
         log.info(config_info)
     if args.get('max_ngram_size') and args.get('num_assistant_tokens'):
         config_info = "Prompt Lookup decoding config: "