diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index 8887b183c4396..ad5cae354dc6d 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -20,6 +20,7 @@
 #include "core/providers/openvino/ov_interface.h"
 #include "core/providers/openvino/ov_versions/capability.h"
 #include "core/providers/openvino/qdq_transformations/qdq_stripping.h"
+#include "core/providers/openvino/exceptions.h"
 
 namespace onnxruntime {
 namespace openvino_ep {
@@ -144,15 +145,13 @@ BackendManager::BackendManager(SessionContext& session_context,
                                       subgraph_context_,
                                       shared_context_,
                                       model_stream);
-  } catch (const OnnxRuntimeException& ex) {
-    std::string exception_str = ex.what();
+  } catch (const ovep_exception& ex) {
+#ifndef OPENVINO_DISABLE_NPU_FALLBACK
     bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos &&
                                      !session_context_.so_disable_cpu_ep_fallback &&
                                      !subgraph_context_.is_ep_ctx_graph;
-#if defined(OPENVINO_DISABLE_NPU_FALLBACK)
-    eligible_for_cpu_fallback = false;
-#else
     if (eligible_for_cpu_fallback) {
+      std::string exception_str = ex.what();
       LOGS_DEFAULT(VERBOSE) << exception_str;
       LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
                             << "Falling back to OV CPU for execution";
@@ -167,31 +166,10 @@ BackendManager::BackendManager(SessionContext& session_context,
       } catch (std::string const& msg) {
         ORT_THROW(msg);
       }
-    }
+    } else
 #endif
-    if (!eligible_for_cpu_fallback) {
-      if (device_type.find("NPU") != std::string::npos &&
-          exception_str.find("intel_npu") != std::string::npos) {
-        // Handle NPU device related errors
-#ifndef NDEBUG
-        ORT_THROW(exception_str + "\nModel needs to be recompiled\n");
-#else
-        std::string error_message = "UNKNOWN NPU ERROR";
-        std::string error_code = "code 0x0";
-        std::regex error_message_pattern(R"(\bZE_\w*\b)");
-        std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
-        std::smatch matches;
-        if (std::regex_search(exception_str, matches, error_message_pattern)) {
-          error_message = matches[0];
-        }
-        if (std::regex_search(exception_str, matches, error_code_pattern)) {
-          error_code = matches[0];
-        }
-        throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
-#endif
-      } else {
-        ORT_THROW(exception_str);
-      }
+    {
+      throw ex;
     }
   }
 }
diff --git a/onnxruntime/core/providers/openvino/exceptions.h b/onnxruntime/core/providers/openvino/exceptions.h
new file mode 100644
index 0000000000000..0f1737ff22cad
--- /dev/null
+++ b/onnxruntime/core/providers/openvino/exceptions.h
@@ -0,0 +1,82 @@
+// Copyright (C) Intel Corporation
+// Licensed under the MIT License
+
+#pragma once
+
+#include <charconv>
+#include <regex>
+#include <string>
+
+#include "core/common/status.h"
+
+namespace onnxruntime {
+namespace openvino_ep {
+
+struct ovep_exception : public std::exception {
+  enum class type {
+    compile_model,
+    import_model,
+    query_prop,
+    read_model,
+    unknown,
+  };
+
+  ovep_exception(const std::string& message,
+                 enum class type type) : message_{message},
+                                         type_{type},
+                                         error_code_{ze_result_code_from_string(message)},
+                                         error_name_{ze_result_name_from_string(message)} {}
+
+  const char* what() const noexcept override {
+    return message_.data();
+  }
+
+  uint32_t get_code() const { return error_code_; }
+
+  operator common::Status() const {
+    common::StatusCategory category_ort{common::ONNXRUNTIME};
+
+    if (type_ == type::unknown) {
+      return {category_ort, common::FAIL, message_};
+    }
+
+    // Newer drivers reject blobs compiled with older drivers and require the model to be recompiled
+    if ((type_ == type::import_model) &&
+        (error_code_ == 0x7800000f /* ZE_RESULT_ERROR_INVALID_NATIVE_BINARY */)) {
+      std::string message{error_name_ + ", code 0x" + std::to_string(error_code_) + "\nModel needs to be recompiled\n"};
+      return {category_ort, common::INVALID_GRAPH, message};
+    }
+
+    std::string error_message = "Unhandled exception type: " + std::to_string(static_cast<int>(type_));
+    return {category_ort, common::FAIL, error_message};
+  }
+
+ protected:
+  std::string message_;
+  type type_{type::unknown};
+  uint32_t error_code_{0};
+  std::string error_name_;
+
+ private:
+  uint32_t ze_result_code_from_string(const std::string& ov_exception_string) {
+    uint32_t error_code{0};
+    std::regex error_code_pattern("code 0x([0-9a-fA-F]+)");
+    std::smatch matches;
+    if (std::regex_search(ov_exception_string, matches, error_code_pattern)) {
+      std::from_chars(&(*matches[1].first), &(*matches[1].second), error_code, 16);
+    }
+    return error_code;
+  }
+  std::string ze_result_name_from_string(const std::string& ov_exception_string) {
+    std::string error_message = "UNKNOWN NPU ERROR";
+    std::regex error_message_pattern(R"(\bZE_\w*\b)");
+    std::smatch matches;
+    if (std::regex_search(ov_exception_string, matches, error_message_pattern)) {
+      error_message = matches[0];
+    }
+    return error_message;
+  }
+};
+
+}  // namespace openvino_ep
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index 5c8293a213f40..69d7e31a08be3 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -12,6 +12,7 @@
 #include "core/providers/openvino/onnx_ctx_model_helper.h"
 #include "core/providers/openvino/ov_versions/capability.h"
 #include "core/providers/openvino/qdq_transformations/qdq_stripping.h"
+#include "core/providers/openvino/exceptions.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "openvino/core/version.hpp"
 #ifdef USE_OVEP_NPU_MEMORY
@@ -94,101 +95,105 @@ common::Status OpenVINOExecutionProvider::Compile(
   auto& logger = *GetLogger();
   Status status = Status::OK();
 
-  if (!fused_nodes.empty()) {
-    // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext
-    const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get();
-    session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string();
-    session_context_.onnx_opset_version =
-        graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain);
-  }
-
-  // Temporary code to read metadata before it moves to the .bin
-  auto& metadata = shared_context_->shared_weights.metadata;
-  if (session_context_.so_share_ep_contexts && metadata.empty()) {
-    // Metadata is always read from model location, this could be a source or epctx model
-    fs::path metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin";
-    std::ifstream file(metadata_filename, std::ios::binary);
-    if (file) {
-      file >> metadata;
+  try {
+    if (!fused_nodes.empty()) {
+      // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext
+      const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get();
+      session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string();
+      session_context_.onnx_opset_version =
+          graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain);
     }
-  }
 
-  struct OpenVINOEPFunctionState {
AllocateFunc allocate_func = nullptr; - DestroyFunc destroy_func = nullptr; - AllocatorHandle allocator_handle = nullptr; - BackendManager& backend_manager; - }; - - for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { - const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; - const Node& fused_node = fused_node_graph.fused_node; - - NodeComputeInfo compute_info; - - // During backend creation, we check if user wants to use precompiled blob onnx model or the original model - // For precompiled blob, directly load the model instead of compiling the model - // For original model, check if the user wants to export a model with pre-compiled blob - - auto& backend_manager = backend_managers_.emplace_back(session_context_, - *shared_context_, - fused_node, - graph_body_viewer, - logger, - ep_ctx_handle_); - - compute_info.create_state_func = - [&backend_manager](ComputeContext* context, FunctionState* state) { - OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{ - .allocate_func = context->allocate_func, - .destroy_func = context->release_func, - .allocator_handle = context->allocator_handle, - .backend_manager = backend_manager}; - *state = static_cast(p); - return 0; - }; - - compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { - auto function_state = static_cast(state); - try { - function_state->backend_manager.Compute(context); - } catch (const std::exception& ex) { - return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); + // Temporary code to read metadata before it moves to the .bin + auto& metadata = shared_context_->shared_weights.metadata; + if (session_context_.so_share_ep_contexts && metadata.empty()) { + // Metadata is always read from model location, this could be a source or epctx model + fs::path metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + std::ifstream file(metadata_filename, std::ios::binary); + if (file) { + file >> metadata; } - return Status::OK(); + } + + struct OpenVINOEPFunctionState { + AllocateFunc allocate_func = nullptr; + DestroyFunc destroy_func = nullptr; + AllocatorHandle allocator_handle = nullptr; + BackendManager& backend_manager; }; - compute_info.release_state_func = - [](FunctionState state) { - if (state) { - OpenVINOEPFunctionState* function_state = static_cast(state); - delete function_state; - } - }; + for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { + const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; + const Node& fused_node = fused_node_graph.fused_node; + + NodeComputeInfo compute_info; + + // During backend creation, we check if user wants to use precompiled blob onnx model or the original model + // For precompiled blob, directly load the model instead of compiling the model + // For original model, check if the user wants to export a model with pre-compiled blob + + auto& backend_manager = backend_managers_.emplace_back(session_context_, + *shared_context_, + fused_node, + graph_body_viewer, + logger, + ep_ctx_handle_); + + compute_info.create_state_func = + [&backend_manager](ComputeContext* context, FunctionState* state) { + OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{ + .allocate_func = context->allocate_func, + .destroy_func = context->release_func, + .allocator_handle = context->allocator_handle, + .backend_manager = backend_manager}; + *state = static_cast(p); + return 0; + }; + + compute_info.compute_func = [](FunctionState state, const OrtApi* 
/* api */, OrtKernelContext* context) { + auto function_state = static_cast(state); + try { + function_state->backend_manager.Compute(context); + } catch (const std::exception& ex) { + return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); + } + return Status::OK(); + }; - node_compute_funcs.push_back(std::move(compute_info)); + compute_info.release_state_func = + [](FunctionState state) { + if (state) { + OpenVINOEPFunctionState* function_state = static_cast(state); + delete function_state; + } + }; - if (!status.IsOK()) { - break; - } - } + node_compute_funcs.push_back(std::move(compute_info)); - if (session_context_.so_share_ep_contexts) { - fs::path metadata_filename; - if (session_context_.so_context_file_path.empty()) { - metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; - } else { - metadata_filename = session_context_.so_context_file_path.parent_path() / "metadata.bin"; + if (!status.IsOK()) { + break; + } } - // Metadata is generated only for shared contexts - // If saving metadata then save it to the provided path or ose the original model path - // Multiple calls to Compile() will update the metadata and for the last call - // the resulting file will contain the aggregated content - std::ofstream file(metadata_filename, std::ios::binary); - if (file) { - file << metadata; + if (session_context_.so_share_ep_contexts) { + fs::path metadata_filename; + if (session_context_.so_context_file_path.empty()) { + metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + } else { + metadata_filename = session_context_.so_context_file_path.parent_path() / "metadata.bin"; + } + + // Metadata is generated only for shared contexts + // If saving metadata then save it to the provided path or ose the original model path + // Multiple calls to Compile() will update the metadata and for the last call + // the resulting file will contain the aggregated content + std::ofstream file(metadata_filename, std::ios::binary); + if (file) { + file << metadata; + } } + } catch (const ovep_exception& ex) { + status = ex; } return status; diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 306fa6113b347..a816de7f856a9 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -11,18 +11,25 @@ #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/backends/basic_backend.h" #include "core/providers/openvino/ov_stateful_patch_utils.h" +#include "core/providers/openvino/exceptions.h" namespace onnxruntime { namespace openvino_ep { -template -inline auto OvExceptionBoundary(Func &&func, std::format_string&& fmt, Args&&... args) { +template +inline auto OvExceptionBoundary(Func&& func, Args&&... args) { try { return func(); } catch (const ov::Exception& e) { - ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...)) + ": " + std::string(e.what())); + const auto message = log_tag + (args + ...) + ": " + std::string(e.what()); + if constexpr (typed) { + ORT_THROW_EX(ovep_exception, message, ovep_exception::type::import_model); + } else { + ORT_THROW(message); + } } catch (...) 
{ - ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...))); + const auto message = log_tag + (args + ...); + ORT_THROW(message); } } @@ -47,462 +54,462 @@ void printDebugInfo(const ov::CompiledModel& obj) { continue; OPENVINO_SUPPRESS_DEPRECATED_END std::cout << " " << item2.first << ": " << item2.second.as() << std::endl; + } + } + else { + std::cout << " " << cfg << ": " << prop.as() << std::endl; } - } else { - std::cout << " " << cfg << ": " << prop.as() << std::endl; } } } -} #endif -// Function to check if a given OV property is enabled -std::optional queryOVProperty(const std::string& property, const std::string& device_type) { - try { - // Get the property value - auto supported_properties = OVCore::Get()->core.get_property(device_type, ov::supported_properties); - return std::find(supported_properties.begin(), supported_properties.end(), property) != supported_properties.end(); - } catch (const std::exception&) { - return std::nullopt; // Property not found or invalid - } -} - -std::shared_ptr OVCore::ReadModel(std::string&& model, const std::string& model_path) { - return OvExceptionBoundary([&]() { - std::istringstream modelStringStream(std::move(model)); - std::istream& modelStream = modelStringStream; - // Try to load with FrontEndManager - ov::frontend::FrontEndManager manager; - ov::frontend::FrontEnd::Ptr FE; - ov::frontend::InputModel::Ptr inputModel; - - ov::AnyVector params{&modelStream, model_path}; - - FE = manager.load_by_model(params); - if (FE) { - inputModel = FE->load(params); - return FE->convert(inputModel); - } else { - ORT_THROW(log_tag + "Unknown exception while Reading network"); + // Function to check if a given OV property is enabled + std::optional queryOVProperty(const std::string& property, const std::string& device_type) { + try { + // Get the property value + auto supported_properties = OVCore::Get()->core.get_property(device_type, ov::supported_properties); + return std::find(supported_properties.begin(), supported_properties.end(), property) != supported_properties.end(); + } catch (const std::exception&) { + return std::nullopt; // Property not found or invalid } - }, - "Exception while Reading network"); -} - -OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr& model, - std::string& hw_target, - const ov::AnyMap& device_config) { - ov::CompiledModel compiled_model; - ov::AnyMap config = device_config; - - if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "Stateless OV Model Statistic:" << std::endl; - LogBasicModelInfo(model); - } - - bool model_status = IsStateful(model); - LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? 
"True" : "False"); - if (!model_status) { - LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; - PatchStatefulDecoder(model); } - if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "Stateful OV Model Statistic:" << std::endl; - LogBasicModelInfo(model); + std::shared_ptr OVCore::ReadModel(std::string && model, const std::string& model_path) { + return OvExceptionBoundary([&]() { + std::istringstream modelStringStream(std::move(model)); + std::istream& modelStream = modelStringStream; + // Try to load with FrontEndManager + ov::frontend::FrontEndManager manager; + ov::frontend::FrontEnd::Ptr FE; + ov::frontend::InputModel::Ptr inputModel; + + ov::AnyVector params{&modelStream, model_path}; + + FE = manager.load_by_model(params); + if (FE) { + inputModel = FE->load(params); + return FE->convert(inputModel); + } else { + ORT_THROW(log_tag + "Unknown exception while Reading network"); + } + }, + "Exception while Reading network"); } - auto kv_pos = GetKVAxesPos(model); + OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr & model, + std::string & hw_target, + const ov::AnyMap& device_config) { + ov::CompiledModel compiled_model; + ov::AnyMap config = device_config; - if (hw_target.find("NPU") != std::string::npos) { - KVDesc kv_desc; - auto parse_genai_config = [&](const std::string& key, unsigned int default_value) { - return (config.count(key) && !config.at(key).empty() && config.at(key).as() != "0") ? config.at(key).as() : default_value; - }; - - kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len); - kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len); + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "Stateless OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); + } - // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0 - if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) { - ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty"); + bool model_status = IsStateful(model); + LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False"); + if (!model_status) { + LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; + PatchStatefulDecoder(model); } if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl; - std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl; - std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl; - std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl; + std::cout << "Stateful OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); } - UpdateNPUConfig(config, kv_pos, kv_desc); - } else { - // This patches the OV IR model so that it only produces the logits required for sampling. - // Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device, - // while this is here mostly to align this behavior for other devices viz. (CPU, GPU). 
- ApplySliceBeforeMatmulTransformation(model); - } + auto kv_pos = GetKVAxesPos(model); - LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow"; - compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config); - OVExeNetwork exe(compiled_model, hw_target, true); - return exe; -} + if (hw_target.find("NPU") != std::string::npos) { + KVDesc kv_desc; + auto parse_genai_config = [&](const std::string& key, unsigned int default_value) { + return (config.count(key) && !config.at(key).empty() && config.at(key).as() != "0") ? config.at(key).as() : default_value; + }; + + kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len); + kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len); + + // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0 + if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) { + ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty"); + } -OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_network, - std::string& hw_target, - ov::AnyMap& device_config, - bool enable_causallm, - const std::string& name) { - return OvExceptionBoundary([&]() { - OVExeNetwork exe; - if (enable_causallm) { - auto mutable_model = ie_cnn_network->clone(); - exe = OVCore::Get()->StatefulCompileModel(mutable_model, hw_target, device_config); + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl; + std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl; + std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl; + std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl; + } + + UpdateNPUConfig(config, kv_pos, kv_desc); } else { - auto obj = core.compile_model(ie_cnn_network, hw_target, device_config); - exe = OVExeNetwork(obj, hw_target); + // This patches the OV IR model so that it only produces the logits required for sampling. + // Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device, + // while this is here mostly to align this behavior for other devices viz. (CPU, GPU). 
+ ApplySliceBeforeMatmulTransformation(model); } + LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow"; + compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config); + OVExeNetwork exe(compiled_model, hw_target, true); + return exe; + } + + OVExeNetwork OVCore::CompileModel(std::shared_ptr & ie_cnn_network, + std::string & hw_target, + ov::AnyMap & device_config, + bool enable_causallm, + const std::string& name) { + return OvExceptionBoundary([&]() { + OVExeNetwork exe; + if (enable_causallm) { + auto mutable_model = ie_cnn_network->clone(); + exe = OVCore::Get()->StatefulCompileModel(mutable_model, hw_target, device_config); + } else { + auto obj = core.compile_model(ie_cnn_network, hw_target, device_config); + exe = OVExeNetwork(obj, hw_target); + } + #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name) { - return OvExceptionBoundary([&]() { - ov::CompiledModel obj; + OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name) { + return OvExceptionBoundary([&]() { + ov::CompiledModel obj; - obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); + obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG - printDebugInfo(obj); + printDebugInfo(obj); #endif - OVExeNetwork exe(obj, hw_target); - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + OVExeNetwork exe(obj, hw_target); + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::ImportModel(std::istream& model_stream, - std::string hw_target, - const ov::AnyMap& device_config, - std::string name) { - return OvExceptionBoundary([&]() { - ov::CompiledModel obj; - obj = core.import_model(model_stream, hw_target, device_config); - OVExeNetwork exe(obj, hw_target); + OVExeNetwork OVCore::ImportModel(std::istream & model_stream, + std::string hw_target, + const ov::AnyMap& device_config, + std::string name) { + return OvExceptionBoundary([&]() { + ov::CompiledModel obj; + obj = core.import_model(model_stream, hw_target, device_config); + OVExeNetwork exe(obj, hw_target); #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream, - std::string& hw_target, - const ov::AnyMap& device_config, - bool enable_causallm, - std::filesystem::path model_file_path) { - return OvExceptionBoundary([&]() { - OVExeNetwork exe; - - bool isXML = backend_utils::IsModelStreamXML(model_stream); - - // Helper function to check if file exists and is readable - const auto check_file_access = [&model_file_path](const std::filesystem::path& path) { - try { - if (!std::filesystem::exists(path) || std::filesystem::is_empty(path)) { - ORT_THROW(log_tag + "Required file missing or empty: " + path.string()); - } - std::ifstream file(path); - if (!file) { - ORT_THROW(log_tag + "Required file not readable: " + 
path.string()); + OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream & model_stream, + std::string & hw_target, + const ov::AnyMap& device_config, + bool enable_causallm, + std::filesystem::path model_file_path) { + return OvExceptionBoundary([&]() { + OVExeNetwork exe; + + bool isXML = backend_utils::IsModelStreamXML(model_stream); + + // Helper function to check if file exists and is readable + const auto check_file_access = [&model_file_path](const std::filesystem::path& path) { + try { + if (!std::filesystem::exists(path) || std::filesystem::is_empty(path)) { + ORT_THROW(log_tag + "Required file missing or empty: " + path.string()); + } + std::ifstream file(path); + if (!file) { + ORT_THROW(log_tag + "Required file not readable: " + path.string()); + } + } catch (const std::exception& e) { + ORT_THROW(log_tag + "Exception while checking file access for: " + path.string() + " - " + e.what()); } - } catch (const std::exception& e) { - ORT_THROW(log_tag + "Exception while checking file access for: " + path.string() + " - " + e.what()); - } - }; + }; - if (isXML) { - // If the model is XML, we need to load it with the XML content in read_model() - // where weights from bin file is directly consumed - auto xml_file_path = model_file_path.parent_path() / (model_file_path.stem().string() + ".xml"); + if (isXML) { + // If the model is XML, we need to load it with the XML content in read_model() + // where weights from bin file is directly consumed + auto xml_file_path = model_file_path.parent_path() / (model_file_path.stem().string() + ".xml"); - check_file_access(xml_file_path); + check_file_access(xml_file_path); - LOGS_DEFAULT(INFO) << log_tag << "Reading OVIR from XML file path: " << xml_file_path.string(); + LOGS_DEFAULT(INFO) << log_tag << "Reading OVIR from XML file path: " << xml_file_path.string(); - // Load the model explicitly with XML contents - std::shared_ptr model = core.read_model(xml_file_path.string()); + // Load the model explicitly with XML contents + std::shared_ptr model = core.read_model(xml_file_path.string()); - if (enable_causallm) { - exe = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config); - } else { - auto obj = core.compile_model(model, hw_target, device_config); - exe = OVExeNetwork(obj, hw_target); + if (enable_causallm) { + exe = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config); + } else { + auto obj = core.compile_model(model, hw_target, device_config); + exe = OVExeNetwork(obj, hw_target); + } } - } #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network from OVIR model file: {}", model_file_path.string()); -} - - -void OVCore::SetCache(const std::string& cache_dir_path) { - core.set_property(ov::cache_dir(cache_dir_path)); -} - -std::vector OVCore::GetAvailableDevices() const { - std::vector available_devices = core.get_available_devices(); - return available_devices; -} - -std::vector OVCore::GetAvailableDevices(const std::string& device_type) const { - std::vector available_devices; - std::vector devicesIDs; - // Uses logic from OpenVINO to only return available devices of the specified type (e.g. CPU, NPU or GPU) - try { - devicesIDs = core.get_property(device_type, ov::available_devices); - } catch (const ov::Exception&) { - // plugin is not created by e.g. 
invalid env - // Empty device list will be returned - } catch (const std::exception& ex) { - ORT_THROW(log_tag + "An exception occurred while trying to create the ", - device_type, - " device: ", - ex.what()); - } catch (...) { - ORT_THROW(log_tag + "Unknown exception occurred while trying to create the ", - device_type, - " device"); + return exe; + }, + "Exception while Loading Network from OVIR model file: {}", model_file_path.string()); } - if (devicesIDs.size() > 1 || - (devicesIDs.size() == 1 && devicesIDs[0] == "0")) { - for (const auto& deviceID : devicesIDs) { - available_devices.push_back(device_type + '.' + deviceID); - } + void OVCore::SetCache(const std::string& cache_dir_path) { + core.set_property(ov::cache_dir(cache_dir_path)); } - if (!devicesIDs.empty()) { - available_devices.push_back(device_type); - } - - return available_devices; -} -void OVCore::SetStreams(const std::string& device_type, int num_streams) { - core.set_property(device_type, {ov::num_streams(num_streams)}); -} + std::vector OVCore::GetAvailableDevices() const { + std::vector available_devices = core.get_available_devices(); + return available_devices; + } -std::shared_ptr OVExeNetwork::CreateInferRequest() { - return OvExceptionBoundary([&]() { - auto infReq = compiled_model_obj.create_infer_request(); - std::shared_ptr ovInfReq; - if (is_stateful_causallm) { - ovInfReq = std::make_shared(std::move(infReq), target_device); - } else { - ovInfReq = std::make_shared(std::move(infReq)); + std::vector OVCore::GetAvailableDevices(const std::string& device_type) const { + std::vector available_devices; + std::vector devicesIDs; + // Uses logic from OpenVINO to only return available devices of the specified type (e.g. CPU, NPU or GPU) + try { + devicesIDs = core.get_property(device_type, ov::available_devices); + } catch (const ov::Exception&) { + // plugin is not created by e.g. invalid env + // Empty device list will be returned + } catch (const std::exception& ex) { + ORT_THROW(log_tag + "An exception occurred while trying to create the ", + device_type, + " device: ", + ex.what()); + } catch (...) { + ORT_THROW(log_tag + "Unknown exception occurred while trying to create the ", + device_type, + " device"); } - return ovInfReq; - }, - - "Exception while creating InferRequest object"); -} -OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { - return OvExceptionBoundary([&]() { - auto tobj = ovInfReq.get_tensor(input_name); - OVTensorPtr blob = std::make_shared(tobj); - return blob; - }, - " Cannot access IE Blob for input: {}", input_name); -} + if (devicesIDs.size() > 1 || + (devicesIDs.size() == 1 && devicesIDs[0] == "0")) { + for (const auto& deviceID : devicesIDs) { + available_devices.push_back(device_type + '.' 
+ deviceID); + } + } + if (!devicesIDs.empty()) { + available_devices.push_back(device_type); + } -std::string OVInferRequest::GetInputTensorName(uint32_t index) { - return OvExceptionBoundary([&]() { - const auto& model = ovInfReq.get_compiled_model(); - return *model.input(index).get_names().begin(); - }, - " Cannot access IE Blob for input number: {}", index); -} + return available_devices; + } -void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { - OvExceptionBoundary([&]() { - ovInfReq.set_tensor(name, *(blob.get())); - }, - " Cannot set Remote Blob for output: {}", name); -} + void OVCore::SetStreams(const std::string& device_type, int num_streams) { + core.set_property(device_type, {ov::num_streams(num_streams)}); + } -uint32_t OVInferRequest::GetNumInputs() { - return static_cast(ovInfReq.get_compiled_model().inputs().size()); -} + std::shared_ptr OVExeNetwork::CreateInferRequest() { + return OvExceptionBoundary([&]() { + auto infReq = compiled_model_obj.create_infer_request(); + std::shared_ptr ovInfReq; + if (is_stateful_causallm) { + ovInfReq = std::make_shared(std::move(infReq), target_device); + } else { + ovInfReq = std::make_shared(std::move(infReq)); + } + return ovInfReq; + }, -void OVInferRequest::Infer() { - OvExceptionBoundary([&]() { - ovInfReq.infer(); - }, - "In Error Couldn't start Inference"); -} + "Exception while creating InferRequest object"); + } -StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) - : OVInferRequest(std::move(infer_request)), target_device(device) { - bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); - if (gpu_or_npu) { - prefill_use_full_chat_history = true; + OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { + return OvExceptionBoundary([&]() { + auto tobj = ovInfReq.get_tensor(input_name); + OVTensorPtr blob = std::make_shared(tobj); + return blob; + }, + " Cannot access IE Blob for input: {}", input_name); } -} -void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, - const std::vector& shape, int32_t fill_value) { - ov::Tensor tensor = ov::Tensor(type, shape); - std::fill_n(tensor.data(), tensor.get_size(), fill_value); - ovInfReq.set_tensor(tensor_name, tensor); -} + std::string OVInferRequest::GetInputTensorName(uint32_t index) { + return OvExceptionBoundary([&]() { + const auto& model = ovInfReq.get_compiled_model(); + return *model.input(index).get_names().begin(); + }, + " Cannot access IE Blob for input number: {}", index); + } -void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, std::vector& cache) { - auto tensor = ovInfReq.get_tensor(tensor_name); - auto* pData = tensor.data(); - for (size_t i = 0; i < tensor.get_size(); i++) { - cache.emplace_back(pData[i]); + void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { + OvExceptionBoundary([&]() { + ovInfReq.set_tensor(name, *(blob.get())); + }, + " Cannot set Remote Blob for output: {}", name); } -} -void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name, - const std::vector& cache_data) { - auto tensor = ovInfReq.get_tensor(tensor_name); - auto new_shape = tensor.get_shape(); - new_shape[1] = cache_data.size(); + uint32_t OVInferRequest::GetNumInputs() { + return static_cast(ovInfReq.get_compiled_model().inputs().size()); + } - auto new_tensor = ov::Tensor(tensor.get_element_type(), new_shape); - auto* pNewData = 
new_tensor.data(); - std::memcpy(pNewData, cache_data.data(), cache_data.size() * sizeof(int64_t)); + void OVInferRequest::Infer() { + OvExceptionBoundary([&]() { + ovInfReq.infer(); + }, + "In Error Couldn't start Inference"); + } - ovInfReq.set_tensor(tensor_name, new_tensor); -} + StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) + : OVInferRequest(std::move(infer_request)), target_device(device) { + bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); + if (gpu_or_npu) { + prefill_use_full_chat_history = true; + } + } -std::optional StatefulOVInferRequest::FindTensor(const std::string& tensor_name) { - // Check if tensor exists by examining input names in the compiled model - const auto& model = ovInfReq.get_compiled_model(); - bool tensor_exists = false; + void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, + const std::vector& shape, int32_t fill_value) { + ov::Tensor tensor = ov::Tensor(type, shape); + std::fill_n(tensor.data(), tensor.get_size(), fill_value); + ovInfReq.set_tensor(tensor_name, tensor); + } - for (const auto& input : model.inputs()) { - const auto& names = input.get_names(); - if (names.find(tensor_name) != names.end()) { - tensor_exists = true; - break; + void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, std::vector& cache) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto* pData = tensor.data(); + for (size_t i = 0; i < tensor.get_size(); i++) { + cache.emplace_back(pData[i]); } } - if (tensor_exists) { - return ovInfReq.get_tensor(tensor_name); - } + void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name, + const std::vector& cache_data) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto new_shape = tensor.get_shape(); + new_shape[1] = cache_data.size(); - return std::nullopt; -} + auto new_tensor = ov::Tensor(tensor.get_element_type(), new_shape); + auto* pNewData = new_tensor.data(); + std::memcpy(pNewData, cache_data.data(), cache_data.size() * sizeof(int64_t)); -void StatefulOVInferRequest::PreProcessInferRequest() { - // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. - // TODO(ankit): Address this issue and implement the fix at the appropriate layer. - FillTensor("beam_idx", ov::element::i32, {1}, 0); + ovInfReq.set_tensor(tensor_name, new_tensor); + } - // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. 
- if (prefill_use_full_chat_history) { - auto input_ids_tensor = ovInfReq.get_tensor("input_ids"); - CacheTensor("input_ids", cached_input_ids); + std::optional StatefulOVInferRequest::FindTensor(const std::string& tensor_name) { + // Check if tensor exists by examining input names in the compiled model + const auto& model = ovInfReq.get_compiled_model(); + bool tensor_exists = false; - // "position_ids" (GQA with Rotary Embeddings doesnt have position_ids) - check if exists - auto position_ids_opt = FindTensor("position_ids"); - bool has_position_ids = position_ids_opt.has_value(); + for (const auto& input : model.inputs()) { + const auto& names = input.get_names(); + if (names.find(tensor_name) != names.end()) { + tensor_exists = true; + break; + } + } - if (has_position_ids) { - CacheTensor("position_ids", cached_position_ids); + if (tensor_exists) { + return ovInfReq.get_tensor(tensor_name); } - // If we're about to run the prefill model - if (input_ids_tensor.get_size() > 1) { - // Check if the size of the current "input_ids" tensor does not match the size of the cached "input_ids". - // This indicates that we are running a subsequent prompt (not the initial prefill). - if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) { - // Clear the internal KVCache state. For NPU device, this operation is a no-op. - ovInfReq.reset_state(); + return std::nullopt; + } + + void StatefulOVInferRequest::PreProcessInferRequest() { + // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. + // TODO(ankit): Address this issue and implement the fix at the appropriate layer. + FillTensor("beam_idx", ov::element::i32, {1}, 0); - // Set tensors using cached values - SetTensorFromCache("input_ids", cached_input_ids); + // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. + if (prefill_use_full_chat_history) { + auto input_ids_tensor = ovInfReq.get_tensor("input_ids"); + CacheTensor("input_ids", cached_input_ids); - // Only set position_ids if it exists and we have cached values - if (has_position_ids && !cached_position_ids.empty()) { - SetTensorFromCache("position_ids", cached_position_ids); + // "position_ids" (GQA with Rotary Embeddings doesnt have position_ids) - check if exists + auto position_ids_opt = FindTensor("position_ids"); + bool has_position_ids = position_ids_opt.has_value(); + + if (has_position_ids) { + CacheTensor("position_ids", cached_position_ids); + } + + // If we're about to run the prefill model + if (input_ids_tensor.get_size() > 1) { + // Check if the size of the current "input_ids" tensor does not match the size of the cached "input_ids". + // This indicates that we are running a subsequent prompt (not the initial prefill). + if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. 
+ ovInfReq.reset_state(); + + // Set tensors using cached values + SetTensorFromCache("input_ids", cached_input_ids); + + // Only set position_ids if it exists and we have cached values + if (has_position_ids && !cached_position_ids.empty()) { + SetTensorFromCache("position_ids", cached_position_ids); + } } } } } -} -void StatefulOVInferRequest::Infer() { - PreProcessInferRequest(); - OVInferRequest::Infer(); -} + void StatefulOVInferRequest::Infer() { + PreProcessInferRequest(); + OVInferRequest::Infer(); + } -void StatefulOVInferRequest::RewindKVCache(size_t index) { - LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; + void StatefulOVInferRequest::RewindKVCache(size_t index) { + LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; - if (prefill_use_full_chat_history) { - // Clear the internal KVCache state. For NPU device, this operation is a no-op. - ovInfReq.reset_state(); + if (prefill_use_full_chat_history) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. + ovInfReq.reset_state(); - // Resize the cached "input_ids" and "position_ids" to the specified index. - if (cached_input_ids.size() > index) { - cached_input_ids.resize(index); - } + // Resize the cached "input_ids" and "position_ids" to the specified index. + if (cached_input_ids.size() > index) { + cached_input_ids.resize(index); + } - if (cached_position_ids.size() > index) { - cached_position_ids.resize(index); - } - } else { - if (index == 0) { - // In this case, since we're resetting the entire KVCache, simply reset the state. - ovInfReq.reset_state(); + if (cached_position_ids.size() > index) { + cached_position_ids.resize(index); + } } else { - // Retrieve KVCache states and trim them to the specified index. - // The following logic is adapted from: - // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 - auto states = ovInfReq.query_state(); - for (auto& state : states) { - ov::Tensor old_tensor = state.get_state(); - // Tensor shape: [batch_size, num_kv_heads, seq_len, head_size] - auto shape = old_tensor.get_shape(); - - if (shape[2] > index) { - // Update the sequence length dimension to the specified index. - shape[2] = index; - - ov::Coordinate new_shape_begin{0, 0, 0, 0}; - ov::Coordinate new_shape_end{shape}; - - // Create a trimmed tensor with the updated shape. - auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); - - // Copy the trimmed tensor into a new tensor and update the state. - ov::Tensor new_tensor(old_tensor.get_element_type(), shape); - trimmed_tensor.copy_to(new_tensor); - - state.set_state(new_tensor); + if (index == 0) { + // In this case, since we're resetting the entire KVCache, simply reset the state. + ovInfReq.reset_state(); + } else { + // Retrieve KVCache states and trim them to the specified index. + // The following logic is adapted from: + // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 + auto states = ovInfReq.query_state(); + for (auto& state : states) { + ov::Tensor old_tensor = state.get_state(); + // Tensor shape: [batch_size, num_kv_heads, seq_len, head_size] + auto shape = old_tensor.get_shape(); + + if (shape[2] > index) { + // Update the sequence length dimension to the specified index. 
+ shape[2] = index; + + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{shape}; + + // Create a trimmed tensor with the updated shape. + auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); + + // Copy the trimmed tensor into a new tensor and update the state. + ov::Tensor new_tensor(old_tensor.get_element_type(), shape); + trimmed_tensor.copy_to(new_tensor); + + state.set_state(new_tensor); + } } } } } -} } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 45ea822685710..88ddde8610c6e 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -38,7 +38,7 @@ GetCapability::GetCapability(const EPCtxHandler& ep_ctx_handler, device_type_ = "CPU"; if (enable_qdq_optimizer) npu_qdq_optimizer_enabled = true; } else if (enable_qdq_optimizer && device_type_.find("GPU") != std::string::npos) { - npu_qdq_optimizer_enabled = true; // see data_ops.cc ~615 where we check for int16 types for gpu, this may change to a better approach later + npu_qdq_optimizer_enabled = true; // see data_ops.cc ~615 where we check for int16 types for gpu, this may change to a better approach later } #if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5 diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index b88f0d04d21f2..3dbf457c94242 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -614,7 +614,7 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { } // experimentally for GPU and qdq stripping mode allow int16 types if (npu_qdq_optimizer_enabled_ && (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)) - return true; + return true; } #ifndef NDEBUG if (openvino_ep::backend_utils::IsDebugEnabled()) {
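Reviewer note: the standalone sketch below models, outside of ONNX Runtime, the error path this patch introduces — an exception that parses the Level Zero "code 0x..." value out of the OpenVINO message and converts implicitly to a status object, mirroring ovep_exception::operator common::Status() and the `status = ex;` assignment in OpenVINOExecutionProvider::Compile. The Status struct, demo_ovep_exception, and demo_import_model names are illustrative stand-ins, not real ORT or OpenVINO APIs.

#include <exception>
#include <iostream>
#include <regex>
#include <string>

// Stand-in for onnxruntime::common::Status (illustrative codes only).
struct Status {
  int code = 0;  // 0 = OK, 1 = FAIL, 10 = INVALID_GRAPH
  std::string message = "OK";
};

// Simplified model of ovep_exception: keep the message, extract the "code 0x..." value.
struct demo_ovep_exception : public std::exception {
  explicit demo_ovep_exception(std::string msg) : message_(std::move(msg)) {
    std::smatch m;
    if (std::regex_search(message_, m, std::regex("code 0x([0-9a-fA-F]+)"))) {
      error_code_ = std::stoul(m[1].str(), nullptr, 16);
    }
  }
  const char* what() const noexcept override { return message_.c_str(); }

  // Implicit conversion mirrors ovep_exception::operator common::Status().
  operator Status() const {
    if (error_code_ == 0x7800000f) {  // ZE_RESULT_ERROR_INVALID_NATIVE_BINARY
      return {10, "Model needs to be recompiled"};
    }
    return {1, message_};
  }

 private:
  std::string message_;
  unsigned long error_code_ = 0;
};

// Hypothetical import step failing the way an NPU driver/blob mismatch would.
void demo_import_model() {
  throw demo_ovep_exception("Exception while importing model: ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, code 0x7800000f");
}

int main() {
  Status status;
  try {
    demo_import_model();
  } catch (const demo_ovep_exception& ex) {
    status = ex;  // same shape as `status = ex;` in Compile()
  }
  std::cout << status.code << ": " << status.message << "\n";
  return 0;
}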
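The rewritten OvExceptionBoundary in ov_interface.cc drops std::format in favour of a fold expression over its trailing string arguments and adds a `typed` template flag that selects between throwing the richer ovep_exception (via ORT_THROW_EX) and a plain ORT_THROW. A minimal standalone model of that shape, using illustrative names (Boundary, typed_error) rather than the EP's real helpers:

#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>

// Illustrative stand-in for the EP's typed exception.
struct typed_error : std::runtime_error {
  using std::runtime_error::runtime_error;
};

// Minimal model of OvExceptionBoundary: run `func`, and on failure build the
// error message by folding the string arguments together with `(args + ...)`.
template <bool typed, typename Func, typename... Args>
auto Boundary(Func&& func, Args&&... args) {
  try {
    return func();
  } catch (const std::exception& e) {
    const auto message = std::string("[OpenVINO-EP] ") + (args + ...) + ": " + e.what();
    if constexpr (typed) {
      throw typed_error(message);  // plays the role of ORT_THROW_EX(ovep_exception, ...)
    } else {
      throw std::runtime_error(message);  // plays the role of ORT_THROW(message)
    }
  }
}

int main() {
  try {
    Boundary<true>([]() -> int { throw std::runtime_error("driver says no"); },
                   std::string("Exception while importing model "), std::string("graph_0"));
  } catch (const typed_error& e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}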