openvinotoolkit · wangleis · Nov 24, 2025 · Nov 26, 2025 · Nov 26, 2025 · Nov 26, 2025
@@ -19,6 +19,7 @@
 #include "openvino/core/any.hpp"
 #include "openvino/core/except.hpp"
 #include "openvino/core/model.hpp"
+#include "openvino/core/type.hpp"
 #include "openvino/runtime/intel_cpu/properties.hpp"
 #include "openvino/runtime/properties.hpp"
 #include "openvino/runtime/system_conf.hpp"
@@ -33,8 +34,10 @@
 #endif
 #include "cpu_map_scheduling.hpp"
 #include "openvino/op/fake_quantize.hpp"
+#include "openvino/op/paged_attention.hpp"
 #include "openvino/runtime/threading/cpu_streams_info.hpp"
 #include "openvino/runtime/threading/istreams_executor.hpp"
+#include "transformations/cpu_opset/common/op/sdpa.hpp"
 #include "transformations/utils/utils.hpp"
 #include "utils/general_utils.h"
 
@@ -609,20 +612,28 @@ int get_model_prefer_threads(const int num_streams,
                              const std::shared_ptr<ov::Model>& model,
                              Config& config) {
     bool int8_intensive = ov::op::util::has_op_with_type<ov::op::v0::FakeQuantize>(model);
+    auto is_paged_attention_model = false;
+    const auto is_LLM =
+        ov::op::util::is_large_language_model(*model, [&is_paged_attention_model](std::shared_ptr<ov::Node> node) {
+            if (ov::is_type<ov::op::PagedAttentionExtension>(node)) {
+                is_paged_attention_model = true;
+                return true;
+            } else if (ov::is_type<ov::intel_cpu::ScaledDotProductAttentionWithKVCache>(node)) {
+                return true;
+            }
+
+            return false;
+        });
 
     auto default_prefer_threads_latency = [&]() {
-        bool llm_related = ov::op::util::is_large_language_model(*model);
         const int int8_threshold = 4;  // ~relative efficiency of the VNNI-intensive code for Big vs Little cores;
         const int fp32_threshold = 2;  // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores;
-        // By default the latency case uses (faster) Big cores only, depending on the compute ratio
-        // But on MTL detected by ov::get_number_of_blocked_cores(), use Big and Little cores together in Big
-        // cores only cases except LLM.
+
         bool use_all_cores =
             proc_type_table[0][MAIN_CORE_PROC] <= (proc_type_table[0][EFFICIENT_CORE_PROC] /
-                                                   (int8_intensive || llm_related ? int8_threshold : fp32_threshold));
-        bool use_big_and_little = !llm_related && (ov::get_number_of_blocked_cores() != 0);
+                                                   (int8_intensive ? int8_threshold : fp32_threshold));
 
-        if (use_all_cores || use_big_and_little) {
+        if (use_all_cores && !is_LLM) {
             config.modelPreferThreadsLatency =
                 proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC];
         } else {
@@ -720,7 +731,7 @@ int get_model_prefer_threads(const int num_streams,
             if ((proc_type_table[0][MAIN_CORE_PROC] < config.threads || config.threads == 0) &&
                 (ov::get_number_of_blocked_cores() || proc_type_table[0][LP_EFFICIENT_CORE_PROC] > 0) &&
                 proc_type_table[0][EFFICIENT_CORE_PROC] <= 2 * proc_type_table[0][MAIN_CORE_PROC]) {
-                if (ov::op::util::is_large_language_model(*model)) {
+                if (is_LLM) {
                     config.modelPreferThreadsLatency = proc_type_table[0][MAIN_CORE_PROC];
                 } else {
                     config.modelPreferThreadsLatency =