diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index 8887b183c4396..ad5cae354dc6d 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -20,6 +20,7 @@
 #include "core/providers/openvino/ov_interface.h"
 #include "core/providers/openvino/ov_versions/capability.h"
 #include "core/providers/openvino/qdq_transformations/qdq_stripping.h"
+#include "core/providers/openvino/exceptions.h"
 
 namespace onnxruntime {
 namespace openvino_ep {
@@ -144,15 +145,13 @@ BackendManager::BackendManager(SessionContext& session_context,
                                       subgraph_context_,
                                       shared_context_,
                                       model_stream);
-  } catch (const OnnxRuntimeException& ex) {
-    std::string exception_str = ex.what();
+  } catch (const ovep_exception& ex) {
+#ifndef OPENVINO_DISABLE_NPU_FALLBACK
     bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos &&
                                      !session_context_.so_disable_cpu_ep_fallback &&
                                      !subgraph_context_.is_ep_ctx_graph;
-#if defined(OPENVINO_DISABLE_NPU_FALLBACK)
-    eligible_for_cpu_fallback = false;
-#else
     if (eligible_for_cpu_fallback) {
+      std::string exception_str = ex.what();
       LOGS_DEFAULT(VERBOSE) << exception_str;
       LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
                             << "Falling back to OV CPU for execution";
@@ -167,31 +166,10 @@ BackendManager::BackendManager(SessionContext& session_context,
       } catch (std::string const& msg) {
         ORT_THROW(msg);
       }
-    }
+    } else
 #endif
-    if (!eligible_for_cpu_fallback) {
-      if (device_type.find("NPU") != std::string::npos &&
-          exception_str.find("intel_npu") != std::string::npos) {
-        // Handle NPU device related errors
-#ifndef NDEBUG
-        ORT_THROW(exception_str + "\nModel needs to be recompiled\n");
-#else
-        std::string error_message = "UNKNOWN NPU ERROR";
-        std::string error_code = "code 0x0";
-        std::regex error_message_pattern(R"(\bZE_\w*\b)");
-        std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
-        std::smatch matches;
-        if (std::regex_search(exception_str, matches, error_message_pattern)) {
-          error_message = matches[0];
-        }
-        if (std::regex_search(exception_str, matches, error_code_pattern)) {
-          error_code = matches[0];
-        }
-        throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
-#endif
-      } else {
-        ORT_THROW(exception_str);
-      }
+    {
+      throw ex;
     }
   }
 }
diff --git a/onnxruntime/core/providers/openvino/exceptions.h b/onnxruntime/core/providers/openvino/exceptions.h
new file mode 100644
index 0000000000000..0f1737ff22cad
--- /dev/null
+++ b/onnxruntime/core/providers/openvino/exceptions.h
@@ -0,0 +1,82 @@
+// Copyright (C) Intel Corporation
+// Licensed under the MIT License
+
+#pragma once
+
+#include <charconv>
+#include <regex>
+#include <string>
+
+#include "core/common/status.h"
+
+namespace onnxruntime {
+namespace openvino_ep {
+
+struct ovep_exception : public std::exception {
+  enum class type {
+    compile_model,
+    import_model,
+    query_prop,
+    read_model,
+    unknown,
+  };
+
+  ovep_exception(const std::string& message,
+                 enum class type type) : message_{message},
+                                         type_{type},
+                                         error_code_{ze_result_code_from_string(message)},
+                                         error_name_{ze_result_name_from_string(message)} {}
+
+  const char* what() const noexcept override {
+    return message_.data();
+  }
+
+  uint32_t get_code() const { return error_code_; }
+
+  operator common::Status() const {
+    common::StatusCategory category_ort{common::ONNXRUNTIME};
+
+    if (type_ == type::unknown) {
+      return {category_ort, common::FAIL, message_};
+    }
+
+    // Newer drivers reject blobs compiled with older drivers and require the model to be recompiled
+    if ((type_ == type::import_model) &&
+        (error_code_ == 0x7800000f /* ZE_RESULT_ERROR_INVALID_NATIVE_BINARY */)) {
+      std::string message{error_name_ + ", code 0x" + std::to_string(error_code_) + "\nModel needs to be recompiled\n"};
+      return {category_ort, common::INVALID_GRAPH, message};
+    }
+
+    std::string error_message = "Unhandled exception type: " + std::to_string(static_cast<int>(type_));
+    return {category_ort, common::FAIL, error_message};
+  }
+
+ protected:
+  std::string message_;
+  type type_{type::unknown};
+  uint32_t error_code_{0};
+  std::string error_name_;
+
+ private:
+  uint32_t ze_result_code_from_string(const std::string& ov_exception_string) {
+    uint32_t error_code{0};
+    std::regex error_code_pattern("code 0x([0-9a-fA-F]+)");
+    std::smatch matches;
+    if (std::regex_search(ov_exception_string, matches, error_code_pattern)) {
+      std::from_chars(&(*matches[1].first), &(*matches[1].second), error_code, 16);
+    }
+    return error_code;
+  }
+  std::string ze_result_name_from_string(const std::string& ov_exception_string) {
+    std::string error_message = "UNKNOWN NPU ERROR";
+    std::regex error_message_pattern(R"(\bZE_\w*\b)");
+    std::smatch matches;
+    if (std::regex_search(ov_exception_string, matches, error_message_pattern)) {
+      error_message = matches[0];
+    }
+    return error_message;
+  }
+};
+
+}  // namespace openvino_ep
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index 5c8293a213f40..69d7e31a08be3 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -12,6 +12,7 @@
 #include "core/providers/openvino/onnx_ctx_model_helper.h"
 #include "core/providers/openvino/ov_versions/capability.h"
 #include "core/providers/openvino/qdq_transformations/qdq_stripping.h"
+#include "core/providers/openvino/exceptions.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "openvino/core/version.hpp"
 #ifdef USE_OVEP_NPU_MEMORY
@@ -94,101 +95,105 @@ common::Status OpenVINOExecutionProvider::Compile(
   auto& logger = *GetLogger();
   Status status = Status::OK();
 
-  if (!fused_nodes.empty()) {
-    // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext
-    const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get();
-    session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string();
-    session_context_.onnx_opset_version =
-        graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain);
-  }
-
-  // Temporary code to read metadata before it moves to the .bin
-  auto& metadata = shared_context_->shared_weights.metadata;
-  if (session_context_.so_share_ep_contexts && metadata.empty()) {
-    // Metadata is always read from model location, this could be a source or epctx model
-    fs::path metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin";
-    std::ifstream file(metadata_filename, std::ios::binary);
-    if (file) {
-      file >> metadata;
+  try {
+    if (!fused_nodes.empty()) {
+      // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext
+      const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get();
+      session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string();
+      session_context_.onnx_opset_version =
+          graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain);
     }
-  }
 
-  struct OpenVINOEPFunctionState {
AllocateFunc allocate_func = nullptr; - DestroyFunc destroy_func = nullptr; - AllocatorHandle allocator_handle = nullptr; - BackendManager& backend_manager; - }; - - for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { - const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; - const Node& fused_node = fused_node_graph.fused_node; - - NodeComputeInfo compute_info; - - // During backend creation, we check if user wants to use precompiled blob onnx model or the original model - // For precompiled blob, directly load the model instead of compiling the model - // For original model, check if the user wants to export a model with pre-compiled blob - - auto& backend_manager = backend_managers_.emplace_back(session_context_, - *shared_context_, - fused_node, - graph_body_viewer, - logger, - ep_ctx_handle_); - - compute_info.create_state_func = - [&backend_manager](ComputeContext* context, FunctionState* state) { - OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{ - .allocate_func = context->allocate_func, - .destroy_func = context->release_func, - .allocator_handle = context->allocator_handle, - .backend_manager = backend_manager}; - *state = static_cast(p); - return 0; - }; - - compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { - auto function_state = static_cast(state); - try { - function_state->backend_manager.Compute(context); - } catch (const std::exception& ex) { - return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); + // Temporary code to read metadata before it moves to the .bin + auto& metadata = shared_context_->shared_weights.metadata; + if (session_context_.so_share_ep_contexts && metadata.empty()) { + // Metadata is always read from model location, this could be a source or epctx model + fs::path metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + std::ifstream file(metadata_filename, std::ios::binary); + if (file) { + file >> metadata; } - return Status::OK(); + } + + struct OpenVINOEPFunctionState { + AllocateFunc allocate_func = nullptr; + DestroyFunc destroy_func = nullptr; + AllocatorHandle allocator_handle = nullptr; + BackendManager& backend_manager; }; - compute_info.release_state_func = - [](FunctionState state) { - if (state) { - OpenVINOEPFunctionState* function_state = static_cast(state); - delete function_state; - } - }; + for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { + const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; + const Node& fused_node = fused_node_graph.fused_node; + + NodeComputeInfo compute_info; + + // During backend creation, we check if user wants to use precompiled blob onnx model or the original model + // For precompiled blob, directly load the model instead of compiling the model + // For original model, check if the user wants to export a model with pre-compiled blob + + auto& backend_manager = backend_managers_.emplace_back(session_context_, + *shared_context_, + fused_node, + graph_body_viewer, + logger, + ep_ctx_handle_); + + compute_info.create_state_func = + [&backend_manager](ComputeContext* context, FunctionState* state) { + OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{ + .allocate_func = context->allocate_func, + .destroy_func = context->release_func, + .allocator_handle = context->allocator_handle, + .backend_manager = backend_manager}; + *state = static_cast(p); + return 0; + }; + + compute_info.compute_func = [](FunctionState state, const OrtApi* 
/* api */, OrtKernelContext* context) { + auto function_state = static_cast(state); + try { + function_state->backend_manager.Compute(context); + } catch (const std::exception& ex) { + return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); + } + return Status::OK(); + }; - node_compute_funcs.push_back(std::move(compute_info)); + compute_info.release_state_func = + [](FunctionState state) { + if (state) { + OpenVINOEPFunctionState* function_state = static_cast(state); + delete function_state; + } + }; - if (!status.IsOK()) { - break; - } - } + node_compute_funcs.push_back(std::move(compute_info)); - if (session_context_.so_share_ep_contexts) { - fs::path metadata_filename; - if (session_context_.so_context_file_path.empty()) { - metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; - } else { - metadata_filename = session_context_.so_context_file_path.parent_path() / "metadata.bin"; + if (!status.IsOK()) { + break; + } } - // Metadata is generated only for shared contexts - // If saving metadata then save it to the provided path or ose the original model path - // Multiple calls to Compile() will update the metadata and for the last call - // the resulting file will contain the aggregated content - std::ofstream file(metadata_filename, std::ios::binary); - if (file) { - file << metadata; + if (session_context_.so_share_ep_contexts) { + fs::path metadata_filename; + if (session_context_.so_context_file_path.empty()) { + metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + } else { + metadata_filename = session_context_.so_context_file_path.parent_path() / "metadata.bin"; + } + + // Metadata is generated only for shared contexts + // If saving metadata then save it to the provided path or ose the original model path + // Multiple calls to Compile() will update the metadata and for the last call + // the resulting file will contain the aggregated content + std::ofstream file(metadata_filename, std::ios::binary); + if (file) { + file << metadata; + } } + } catch (const ovep_exception& ex) { + status = ex; } return status; diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 306fa6113b347..a816de7f856a9 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -11,18 +11,25 @@ #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/backends/basic_backend.h" #include "core/providers/openvino/ov_stateful_patch_utils.h" +#include "core/providers/openvino/exceptions.h" namespace onnxruntime { namespace openvino_ep { -template -inline auto OvExceptionBoundary(Func &&func, std::format_string&& fmt, Args&&... args) { +template +inline auto OvExceptionBoundary(Func&& func, Args&&... args) { try { return func(); } catch (const ov::Exception& e) { - ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...)) + ": " + std::string(e.what())); + const auto message = log_tag + (args + ...) + ": " + std::string(e.what()); + if constexpr (typed) { + ORT_THROW_EX(ovep_exception, message, ovep_exception::type::import_model); + } else { + ORT_THROW(message); + } } catch (...) 
{ - ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...))); + const auto message = log_tag + (args + ...); + ORT_THROW(message); } } @@ -47,462 +54,462 @@ void printDebugInfo(const ov::CompiledModel& obj) { continue; OPENVINO_SUPPRESS_DEPRECATED_END std::cout << " " << item2.first << ": " << item2.second.as() << std::endl; + } + } + else { + std::cout << " " << cfg << ": " << prop.as() << std::endl; } - } else { - std::cout << " " << cfg << ": " << prop.as() << std::endl; } } } -} #endif -// Function to check if a given OV property is enabled -std::optional queryOVProperty(const std::string& property, const std::string& device_type) { - try { - // Get the property value - auto supported_properties = OVCore::Get()->core.get_property(device_type, ov::supported_properties); - return std::find(supported_properties.begin(), supported_properties.end(), property) != supported_properties.end(); - } catch (const std::exception&) { - return std::nullopt; // Property not found or invalid - } -} - -std::shared_ptr OVCore::ReadModel(std::string&& model, const std::string& model_path) { - return OvExceptionBoundary([&]() { - std::istringstream modelStringStream(std::move(model)); - std::istream& modelStream = modelStringStream; - // Try to load with FrontEndManager - ov::frontend::FrontEndManager manager; - ov::frontend::FrontEnd::Ptr FE; - ov::frontend::InputModel::Ptr inputModel; - - ov::AnyVector params{&modelStream, model_path}; - - FE = manager.load_by_model(params); - if (FE) { - inputModel = FE->load(params); - return FE->convert(inputModel); - } else { - ORT_THROW(log_tag + "Unknown exception while Reading network"); + // Function to check if a given OV property is enabled + std::optional queryOVProperty(const std::string& property, const std::string& device_type) { + try { + // Get the property value + auto supported_properties = OVCore::Get()->core.get_property(device_type, ov::supported_properties); + return std::find(supported_properties.begin(), supported_properties.end(), property) != supported_properties.end(); + } catch (const std::exception&) { + return std::nullopt; // Property not found or invalid } - }, - "Exception while Reading network"); -} - -OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr& model, - std::string& hw_target, - const ov::AnyMap& device_config) { - ov::CompiledModel compiled_model; - ov::AnyMap config = device_config; - - if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "Stateless OV Model Statistic:" << std::endl; - LogBasicModelInfo(model); - } - - bool model_status = IsStateful(model); - LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? 
"True" : "False"); - if (!model_status) { - LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; - PatchStatefulDecoder(model); } - if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "Stateful OV Model Statistic:" << std::endl; - LogBasicModelInfo(model); + std::shared_ptr OVCore::ReadModel(std::string && model, const std::string& model_path) { + return OvExceptionBoundary([&]() { + std::istringstream modelStringStream(std::move(model)); + std::istream& modelStream = modelStringStream; + // Try to load with FrontEndManager + ov::frontend::FrontEndManager manager; + ov::frontend::FrontEnd::Ptr FE; + ov::frontend::InputModel::Ptr inputModel; + + ov::AnyVector params{&modelStream, model_path}; + + FE = manager.load_by_model(params); + if (FE) { + inputModel = FE->load(params); + return FE->convert(inputModel); + } else { + ORT_THROW(log_tag + "Unknown exception while Reading network"); + } + }, + "Exception while Reading network"); } - auto kv_pos = GetKVAxesPos(model); + OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr & model, + std::string & hw_target, + const ov::AnyMap& device_config) { + ov::CompiledModel compiled_model; + ov::AnyMap config = device_config; - if (hw_target.find("NPU") != std::string::npos) { - KVDesc kv_desc; - auto parse_genai_config = [&](const std::string& key, unsigned int default_value) { - return (config.count(key) && !config.at(key).empty() && config.at(key).as() != "0") ? config.at(key).as() : default_value; - }; - - kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len); - kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len); + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "Stateless OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); + } - // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0 - if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) { - ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty"); + bool model_status = IsStateful(model); + LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False"); + if (!model_status) { + LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; + PatchStatefulDecoder(model); } if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl; - std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl; - std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl; - std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl; + std::cout << "Stateful OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); } - UpdateNPUConfig(config, kv_pos, kv_desc); - } else { - // This patches the OV IR model so that it only produces the logits required for sampling. - // Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device, - // while this is here mostly to align this behavior for other devices viz. (CPU, GPU). 
- ApplySliceBeforeMatmulTransformation(model); - } + auto kv_pos = GetKVAxesPos(model); - LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow"; - compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config); - OVExeNetwork exe(compiled_model, hw_target, true); - return exe; -} + if (hw_target.find("NPU") != std::string::npos) { + KVDesc kv_desc; + auto parse_genai_config = [&](const std::string& key, unsigned int default_value) { + return (config.count(key) && !config.at(key).empty() && config.at(key).as() != "0") ? config.at(key).as() : default_value; + }; + + kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len); + kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len); + + // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0 + if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) { + ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty"); + } -OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_network, - std::string& hw_target, - ov::AnyMap& device_config, - bool enable_causallm, - const std::string& name) { - return OvExceptionBoundary([&]() { - OVExeNetwork exe; - if (enable_causallm) { - auto mutable_model = ie_cnn_network->clone(); - exe = OVCore::Get()->StatefulCompileModel(mutable_model, hw_target, device_config); + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl; + std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl; + std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl; + std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl; + } + + UpdateNPUConfig(config, kv_pos, kv_desc); } else { - auto obj = core.compile_model(ie_cnn_network, hw_target, device_config); - exe = OVExeNetwork(obj, hw_target); + // This patches the OV IR model so that it only produces the logits required for sampling. + // Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device, + // while this is here mostly to align this behavior for other devices viz. (CPU, GPU). 
+ ApplySliceBeforeMatmulTransformation(model); } + LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow"; + compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config); + OVExeNetwork exe(compiled_model, hw_target, true); + return exe; + } + + OVExeNetwork OVCore::CompileModel(std::shared_ptr & ie_cnn_network, + std::string & hw_target, + ov::AnyMap & device_config, + bool enable_causallm, + const std::string& name) { + return OvExceptionBoundary([&]() { + OVExeNetwork exe; + if (enable_causallm) { + auto mutable_model = ie_cnn_network->clone(); + exe = OVCore::Get()->StatefulCompileModel(mutable_model, hw_target, device_config); + } else { + auto obj = core.compile_model(ie_cnn_network, hw_target, device_config); + exe = OVExeNetwork(obj, hw_target); + } + #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name) { - return OvExceptionBoundary([&]() { - ov::CompiledModel obj; + OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name) { + return OvExceptionBoundary([&]() { + ov::CompiledModel obj; - obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); + obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG - printDebugInfo(obj); + printDebugInfo(obj); #endif - OVExeNetwork exe(obj, hw_target); - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + OVExeNetwork exe(obj, hw_target); + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::ImportModel(std::istream& model_stream, - std::string hw_target, - const ov::AnyMap& device_config, - std::string name) { - return OvExceptionBoundary([&]() { - ov::CompiledModel obj; - obj = core.import_model(model_stream, hw_target, device_config); - OVExeNetwork exe(obj, hw_target); + OVExeNetwork OVCore::ImportModel(std::istream & model_stream, + std::string hw_target, + const ov::AnyMap& device_config, + std::string name) { + return OvExceptionBoundary([&]() { + ov::CompiledModel obj; + obj = core.import_model(model_stream, hw_target, device_config); + OVExeNetwork exe(obj, hw_target); #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream, - std::string& hw_target, - const ov::AnyMap& device_config, - bool enable_causallm, - std::filesystem::path model_file_path) { - return OvExceptionBoundary([&]() { - OVExeNetwork exe; - - bool isXML = backend_utils::IsModelStreamXML(model_stream); - - // Helper function to check if file exists and is readable - const auto check_file_access = [&model_file_path](const std::filesystem::path& path) { - try { - if (!std::filesystem::exists(path) || std::filesystem::is_empty(path)) { - ORT_THROW(log_tag + "Required file missing or empty: " + path.string()); - } - std::ifstream file(path); - if (!file) { - ORT_THROW(log_tag + "Required file not readable: " + 
path.string()); + OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream & model_stream, + std::string & hw_target, + const ov::AnyMap& device_config, + bool enable_causallm, + std::filesystem::path model_file_path) { + return OvExceptionBoundary([&]() { + OVExeNetwork exe; + + bool isXML = backend_utils::IsModelStreamXML(model_stream); + + // Helper function to check if file exists and is readable + const auto check_file_access = [&model_file_path](const std::filesystem::path& path) { + try { + if (!std::filesystem::exists(path) || std::filesystem::is_empty(path)) { + ORT_THROW(log_tag + "Required file missing or empty: " + path.string()); + } + std::ifstream file(path); + if (!file) { + ORT_THROW(log_tag + "Required file not readable: " + path.string()); + } + } catch (const std::exception& e) { + ORT_THROW(log_tag + "Exception while checking file access for: " + path.string() + " - " + e.what()); } - } catch (const std::exception& e) { - ORT_THROW(log_tag + "Exception while checking file access for: " + path.string() + " - " + e.what()); - } - }; + }; - if (isXML) { - // If the model is XML, we need to load it with the XML content in read_model() - // where weights from bin file is directly consumed - auto xml_file_path = model_file_path.parent_path() / (model_file_path.stem().string() + ".xml"); + if (isXML) { + // If the model is XML, we need to load it with the XML content in read_model() + // where weights from bin file is directly consumed + auto xml_file_path = model_file_path.parent_path() / (model_file_path.stem().string() + ".xml"); - check_file_access(xml_file_path); + check_file_access(xml_file_path); - LOGS_DEFAULT(INFO) << log_tag << "Reading OVIR from XML file path: " << xml_file_path.string(); + LOGS_DEFAULT(INFO) << log_tag << "Reading OVIR from XML file path: " << xml_file_path.string(); - // Load the model explicitly with XML contents - std::shared_ptr model = core.read_model(xml_file_path.string()); + // Load the model explicitly with XML contents + std::shared_ptr model = core.read_model(xml_file_path.string()); - if (enable_causallm) { - exe = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config); - } else { - auto obj = core.compile_model(model, hw_target, device_config); - exe = OVExeNetwork(obj, hw_target); + if (enable_causallm) { + exe = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config); + } else { + auto obj = core.compile_model(model, hw_target, device_config); + exe = OVExeNetwork(obj, hw_target); + } } - } #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network from OVIR model file: {}", model_file_path.string()); -} - - -void OVCore::SetCache(const std::string& cache_dir_path) { - core.set_property(ov::cache_dir(cache_dir_path)); -} - -std::vector OVCore::GetAvailableDevices() const { - std::vector available_devices = core.get_available_devices(); - return available_devices; -} - -std::vector OVCore::GetAvailableDevices(const std::string& device_type) const { - std::vector available_devices; - std::vector devicesIDs; - // Uses logic from OpenVINO to only return available devices of the specified type (e.g. CPU, NPU or GPU) - try { - devicesIDs = core.get_property(device_type, ov::available_devices); - } catch (const ov::Exception&) { - // plugin is not created by e.g. 
invalid env - // Empty device list will be returned - } catch (const std::exception& ex) { - ORT_THROW(log_tag + "An exception occurred while trying to create the ", - device_type, - " device: ", - ex.what()); - } catch (...) { - ORT_THROW(log_tag + "Unknown exception occurred while trying to create the ", - device_type, - " device"); + return exe; + }, + "Exception while Loading Network from OVIR model file: {}", model_file_path.string()); } - if (devicesIDs.size() > 1 || - (devicesIDs.size() == 1 && devicesIDs[0] == "0")) { - for (const auto& deviceID : devicesIDs) { - available_devices.push_back(device_type + '.' + deviceID); - } + void OVCore::SetCache(const std::string& cache_dir_path) { + core.set_property(ov::cache_dir(cache_dir_path)); } - if (!devicesIDs.empty()) { - available_devices.push_back(device_type); - } - - return available_devices; -} -void OVCore::SetStreams(const std::string& device_type, int num_streams) { - core.set_property(device_type, {ov::num_streams(num_streams)}); -} + std::vector OVCore::GetAvailableDevices() const { + std::vector available_devices = core.get_available_devices(); + return available_devices; + } -std::shared_ptr OVExeNetwork::CreateInferRequest() { - return OvExceptionBoundary([&]() { - auto infReq = compiled_model_obj.create_infer_request(); - std::shared_ptr ovInfReq; - if (is_stateful_causallm) { - ovInfReq = std::make_shared(std::move(infReq), target_device); - } else { - ovInfReq = std::make_shared(std::move(infReq)); + std::vector OVCore::GetAvailableDevices(const std::string& device_type) const { + std::vector available_devices; + std::vector devicesIDs; + // Uses logic from OpenVINO to only return available devices of the specified type (e.g. CPU, NPU or GPU) + try { + devicesIDs = core.get_property(device_type, ov::available_devices); + } catch (const ov::Exception&) { + // plugin is not created by e.g. invalid env + // Empty device list will be returned + } catch (const std::exception& ex) { + ORT_THROW(log_tag + "An exception occurred while trying to create the ", + device_type, + " device: ", + ex.what()); + } catch (...) { + ORT_THROW(log_tag + "Unknown exception occurred while trying to create the ", + device_type, + " device"); } - return ovInfReq; - }, - - "Exception while creating InferRequest object"); -} -OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { - return OvExceptionBoundary([&]() { - auto tobj = ovInfReq.get_tensor(input_name); - OVTensorPtr blob = std::make_shared(tobj); - return blob; - }, - " Cannot access IE Blob for input: {}", input_name); -} + if (devicesIDs.size() > 1 || + (devicesIDs.size() == 1 && devicesIDs[0] == "0")) { + for (const auto& deviceID : devicesIDs) { + available_devices.push_back(device_type + '.' 
+ deviceID); + } + } + if (!devicesIDs.empty()) { + available_devices.push_back(device_type); + } -std::string OVInferRequest::GetInputTensorName(uint32_t index) { - return OvExceptionBoundary([&]() { - const auto& model = ovInfReq.get_compiled_model(); - return *model.input(index).get_names().begin(); - }, - " Cannot access IE Blob for input number: {}", index); -} + return available_devices; + } -void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { - OvExceptionBoundary([&]() { - ovInfReq.set_tensor(name, *(blob.get())); - }, - " Cannot set Remote Blob for output: {}", name); -} + void OVCore::SetStreams(const std::string& device_type, int num_streams) { + core.set_property(device_type, {ov::num_streams(num_streams)}); + } -uint32_t OVInferRequest::GetNumInputs() { - return static_cast(ovInfReq.get_compiled_model().inputs().size()); -} + std::shared_ptr OVExeNetwork::CreateInferRequest() { + return OvExceptionBoundary([&]() { + auto infReq = compiled_model_obj.create_infer_request(); + std::shared_ptr ovInfReq; + if (is_stateful_causallm) { + ovInfReq = std::make_shared(std::move(infReq), target_device); + } else { + ovInfReq = std::make_shared(std::move(infReq)); + } + return ovInfReq; + }, -void OVInferRequest::Infer() { - OvExceptionBoundary([&]() { - ovInfReq.infer(); - }, - "In Error Couldn't start Inference"); -} + "Exception while creating InferRequest object"); + } -StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) - : OVInferRequest(std::move(infer_request)), target_device(device) { - bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); - if (gpu_or_npu) { - prefill_use_full_chat_history = true; + OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { + return OvExceptionBoundary([&]() { + auto tobj = ovInfReq.get_tensor(input_name); + OVTensorPtr blob = std::make_shared(tobj); + return blob; + }, + " Cannot access IE Blob for input: {}", input_name); } -} -void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, - const std::vector& shape, int32_t fill_value) { - ov::Tensor tensor = ov::Tensor(type, shape); - std::fill_n(tensor.data(), tensor.get_size(), fill_value); - ovInfReq.set_tensor(tensor_name, tensor); -} + std::string OVInferRequest::GetInputTensorName(uint32_t index) { + return OvExceptionBoundary([&]() { + const auto& model = ovInfReq.get_compiled_model(); + return *model.input(index).get_names().begin(); + }, + " Cannot access IE Blob for input number: {}", index); + } -void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, std::vector& cache) { - auto tensor = ovInfReq.get_tensor(tensor_name); - auto* pData = tensor.data(); - for (size_t i = 0; i < tensor.get_size(); i++) { - cache.emplace_back(pData[i]); + void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { + OvExceptionBoundary([&]() { + ovInfReq.set_tensor(name, *(blob.get())); + }, + " Cannot set Remote Blob for output: {}", name); } -} -void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name, - const std::vector& cache_data) { - auto tensor = ovInfReq.get_tensor(tensor_name); - auto new_shape = tensor.get_shape(); - new_shape[1] = cache_data.size(); + uint32_t OVInferRequest::GetNumInputs() { + return static_cast(ovInfReq.get_compiled_model().inputs().size()); + } - auto new_tensor = ov::Tensor(tensor.get_element_type(), new_shape); - auto* pNewData = 
new_tensor.data(); - std::memcpy(pNewData, cache_data.data(), cache_data.size() * sizeof(int64_t)); + void OVInferRequest::Infer() { + OvExceptionBoundary([&]() { + ovInfReq.infer(); + }, + "In Error Couldn't start Inference"); + } - ovInfReq.set_tensor(tensor_name, new_tensor); -} + StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) + : OVInferRequest(std::move(infer_request)), target_device(device) { + bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); + if (gpu_or_npu) { + prefill_use_full_chat_history = true; + } + } -std::optional StatefulOVInferRequest::FindTensor(const std::string& tensor_name) { - // Check if tensor exists by examining input names in the compiled model - const auto& model = ovInfReq.get_compiled_model(); - bool tensor_exists = false; + void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, + const std::vector& shape, int32_t fill_value) { + ov::Tensor tensor = ov::Tensor(type, shape); + std::fill_n(tensor.data(), tensor.get_size(), fill_value); + ovInfReq.set_tensor(tensor_name, tensor); + } - for (const auto& input : model.inputs()) { - const auto& names = input.get_names(); - if (names.find(tensor_name) != names.end()) { - tensor_exists = true; - break; + void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, std::vector& cache) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto* pData = tensor.data(); + for (size_t i = 0; i < tensor.get_size(); i++) { + cache.emplace_back(pData[i]); } } - if (tensor_exists) { - return ovInfReq.get_tensor(tensor_name); - } + void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name, + const std::vector& cache_data) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto new_shape = tensor.get_shape(); + new_shape[1] = cache_data.size(); - return std::nullopt; -} + auto new_tensor = ov::Tensor(tensor.get_element_type(), new_shape); + auto* pNewData = new_tensor.data(); + std::memcpy(pNewData, cache_data.data(), cache_data.size() * sizeof(int64_t)); -void StatefulOVInferRequest::PreProcessInferRequest() { - // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. - // TODO(ankit): Address this issue and implement the fix at the appropriate layer. - FillTensor("beam_idx", ov::element::i32, {1}, 0); + ovInfReq.set_tensor(tensor_name, new_tensor); + } - // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. 
- if (prefill_use_full_chat_history) { - auto input_ids_tensor = ovInfReq.get_tensor("input_ids"); - CacheTensor("input_ids", cached_input_ids); + std::optional StatefulOVInferRequest::FindTensor(const std::string& tensor_name) { + // Check if tensor exists by examining input names in the compiled model + const auto& model = ovInfReq.get_compiled_model(); + bool tensor_exists = false; - // "position_ids" (GQA with Rotary Embeddings doesnt have position_ids) - check if exists - auto position_ids_opt = FindTensor("position_ids"); - bool has_position_ids = position_ids_opt.has_value(); + for (const auto& input : model.inputs()) { + const auto& names = input.get_names(); + if (names.find(tensor_name) != names.end()) { + tensor_exists = true; + break; + } + } - if (has_position_ids) { - CacheTensor("position_ids", cached_position_ids); + if (tensor_exists) { + return ovInfReq.get_tensor(tensor_name); } - // If we're about to run the prefill model - if (input_ids_tensor.get_size() > 1) { - // Check if the size of the current "input_ids" tensor does not match the size of the cached "input_ids". - // This indicates that we are running a subsequent prompt (not the initial prefill). - if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) { - // Clear the internal KVCache state. For NPU device, this operation is a no-op. - ovInfReq.reset_state(); + return std::nullopt; + } + + void StatefulOVInferRequest::PreProcessInferRequest() { + // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. + // TODO(ankit): Address this issue and implement the fix at the appropriate layer. + FillTensor("beam_idx", ov::element::i32, {1}, 0); - // Set tensors using cached values - SetTensorFromCache("input_ids", cached_input_ids); + // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. + if (prefill_use_full_chat_history) { + auto input_ids_tensor = ovInfReq.get_tensor("input_ids"); + CacheTensor("input_ids", cached_input_ids); - // Only set position_ids if it exists and we have cached values - if (has_position_ids && !cached_position_ids.empty()) { - SetTensorFromCache("position_ids", cached_position_ids); + // "position_ids" (GQA with Rotary Embeddings doesnt have position_ids) - check if exists + auto position_ids_opt = FindTensor("position_ids"); + bool has_position_ids = position_ids_opt.has_value(); + + if (has_position_ids) { + CacheTensor("position_ids", cached_position_ids); + } + + // If we're about to run the prefill model + if (input_ids_tensor.get_size() > 1) { + // Check if the size of the current "input_ids" tensor does not match the size of the cached "input_ids". + // This indicates that we are running a subsequent prompt (not the initial prefill). + if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. 
+ ovInfReq.reset_state(); + + // Set tensors using cached values + SetTensorFromCache("input_ids", cached_input_ids); + + // Only set position_ids if it exists and we have cached values + if (has_position_ids && !cached_position_ids.empty()) { + SetTensorFromCache("position_ids", cached_position_ids); + } } } } } -} -void StatefulOVInferRequest::Infer() { - PreProcessInferRequest(); - OVInferRequest::Infer(); -} + void StatefulOVInferRequest::Infer() { + PreProcessInferRequest(); + OVInferRequest::Infer(); + } -void StatefulOVInferRequest::RewindKVCache(size_t index) { - LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; + void StatefulOVInferRequest::RewindKVCache(size_t index) { + LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; - if (prefill_use_full_chat_history) { - // Clear the internal KVCache state. For NPU device, this operation is a no-op. - ovInfReq.reset_state(); + if (prefill_use_full_chat_history) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. + ovInfReq.reset_state(); - // Resize the cached "input_ids" and "position_ids" to the specified index. - if (cached_input_ids.size() > index) { - cached_input_ids.resize(index); - } + // Resize the cached "input_ids" and "position_ids" to the specified index. + if (cached_input_ids.size() > index) { + cached_input_ids.resize(index); + } - if (cached_position_ids.size() > index) { - cached_position_ids.resize(index); - } - } else { - if (index == 0) { - // In this case, since we're resetting the entire KVCache, simply reset the state. - ovInfReq.reset_state(); + if (cached_position_ids.size() > index) { + cached_position_ids.resize(index); + } } else { - // Retrieve KVCache states and trim them to the specified index. - // The following logic is adapted from: - // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 - auto states = ovInfReq.query_state(); - for (auto& state : states) { - ov::Tensor old_tensor = state.get_state(); - // Tensor shape: [batch_size, num_kv_heads, seq_len, head_size] - auto shape = old_tensor.get_shape(); - - if (shape[2] > index) { - // Update the sequence length dimension to the specified index. - shape[2] = index; - - ov::Coordinate new_shape_begin{0, 0, 0, 0}; - ov::Coordinate new_shape_end{shape}; - - // Create a trimmed tensor with the updated shape. - auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); - - // Copy the trimmed tensor into a new tensor and update the state. - ov::Tensor new_tensor(old_tensor.get_element_type(), shape); - trimmed_tensor.copy_to(new_tensor); - - state.set_state(new_tensor); + if (index == 0) { + // In this case, since we're resetting the entire KVCache, simply reset the state. + ovInfReq.reset_state(); + } else { + // Retrieve KVCache states and trim them to the specified index. + // The following logic is adapted from: + // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 + auto states = ovInfReq.query_state(); + for (auto& state : states) { + ov::Tensor old_tensor = state.get_state(); + // Tensor shape: [batch_size, num_kv_heads, seq_len, head_size] + auto shape = old_tensor.get_shape(); + + if (shape[2] > index) { + // Update the sequence length dimension to the specified index. 
+ shape[2] = index; + + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{shape}; + + // Create a trimmed tensor with the updated shape. + auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); + + // Copy the trimmed tensor into a new tensor and update the state. + ov::Tensor new_tensor(old_tensor.get_element_type(), shape); + trimmed_tensor.copy_to(new_tensor); + + state.set_state(new_tensor); + } } } } } -} } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 45ea822685710..88ddde8610c6e 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -38,7 +38,7 @@ GetCapability::GetCapability(const EPCtxHandler& ep_ctx_handler, device_type_ = "CPU"; if (enable_qdq_optimizer) npu_qdq_optimizer_enabled = true; } else if (enable_qdq_optimizer && device_type_.find("GPU") != std::string::npos) { - npu_qdq_optimizer_enabled = true; // see data_ops.cc ~615 where we check for int16 types for gpu, this may change to a better approach later + npu_qdq_optimizer_enabled = true; // see data_ops.cc ~615 where we check for int16 types for gpu, this may change to a better approach later } #if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5 diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index b88f0d04d21f2..3dbf457c94242 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -614,7 +614,7 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { } // experimentally for GPU and qdq stripping mode allow int16 types if (npu_qdq_optimizer_enabled_ && (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)) - return true; + return true; } #ifndef NDEBUG if (openvino_ep::backend_utils::IsDebugEnabled()) {
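Reviewer note: the standalone sketch below models, outside of ONNX Runtime, the error path this patch introduces — an exception that parses the Level Zero "code 0x..." value out of the OpenVINO message and converts implicitly to a status object, mirroring ovep_exception::operator common::Status() and the `status = ex;` assignment in OpenVINOExecutionProvider::Compile. The Status struct, demo_ovep_exception, and demo_import_model names are illustrative stand-ins, not real ORT or OpenVINO APIs.

#include <exception>
#include <iostream>
#include <regex>
#include <string>

// Stand-in for onnxruntime::common::Status (illustrative codes only).
struct Status {
  int code = 0;  // 0 = OK, 1 = FAIL, 10 = INVALID_GRAPH
  std::string message = "OK";
};

// Simplified model of ovep_exception: keep the message, extract the "code 0x..." value.
struct demo_ovep_exception : public std::exception {
  explicit demo_ovep_exception(std::string msg) : message_(std::move(msg)) {
    std::smatch m;
    if (std::regex_search(message_, m, std::regex("code 0x([0-9a-fA-F]+)"))) {
      error_code_ = std::stoul(m[1].str(), nullptr, 16);
    }
  }
  const char* what() const noexcept override { return message_.c_str(); }

  // Implicit conversion mirrors ovep_exception::operator common::Status().
  operator Status() const {
    if (error_code_ == 0x7800000f) {  // ZE_RESULT_ERROR_INVALID_NATIVE_BINARY
      return {10, "Model needs to be recompiled"};
    }
    return {1, message_};
  }

 private:
  std::string message_;
  unsigned long error_code_ = 0;
};

// Hypothetical import step failing the way an NPU driver/blob mismatch would.
void demo_import_model() {
  throw demo_ovep_exception("Exception while importing model: ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, code 0x7800000f");
}

int main() {
  Status status;
  try {
    demo_import_model();
  } catch (const demo_ovep_exception& ex) {
    status = ex;  // same shape as `status = ex;` in Compile()
  }
  std::cout << status.code << ": " << status.message << "\n";
  return 0;
}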
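The rewritten OvExceptionBoundary in ov_interface.cc drops std::format in favour of a fold expression over its trailing string arguments and adds a `typed` template flag that selects between throwing the richer ovep_exception (via ORT_THROW_EX) and a plain ORT_THROW. A minimal standalone model of that shape, using illustrative names (Boundary, typed_error) rather than the EP's real helpers:

#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>

// Illustrative stand-in for the EP's typed exception.
struct typed_error : std::runtime_error {
  using std::runtime_error::runtime_error;
};

// Minimal model of OvExceptionBoundary: run `func`, and on failure build the
// error message by folding the string arguments together with `(args + ...)`.
template <bool typed, typename Func, typename... Args>
auto Boundary(Func&& func, Args&&... args) {
  try {
    return func();
  } catch (const std::exception& e) {
    const auto message = std::string("[OpenVINO-EP] ") + (args + ...) + ": " + e.what();
    if constexpr (typed) {
      throw typed_error(message);  // plays the role of ORT_THROW_EX(ovep_exception, ...)
    } else {
      throw std::runtime_error(message);  // plays the role of ORT_THROW(message)
    }
  }
}

int main() {
  try {
    Boundary<true>([]() -> int { throw std::runtime_error("driver says no"); },
                   std::string("Exception while importing model "), std::string("graph_0"));
  } catch (const typed_error& e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}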