Skip to content

Commit bdee912

Browse files
committed
Refine setup_pyramid_infer_requests.
Signed-off-by: intelgaoxiong <[email protected]>
1 parent aa85aca commit bdee912

File tree

3 files changed

+45
-55
lines changed

3 files changed

+45
-55
lines changed

src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp

Lines changed: 43 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -990,96 +990,86 @@ void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) {
990990
}
991991

992992
void ov::npuw::JustInferRequest::setup_pyramid_infer_requests(std::size_t real_idx, bool is_piped, bool is_recreate) {
993-
auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
994-
995-
if (!proto_comp_model_desc.pyramid_attention.has_value()) {
993+
auto& submodel_desc = m_npuw_model->m_compiled_submodels[real_idx];
994+
if (!submodel_desc.pyramid_attention.has_value()) {
996995
return;
997996
}
998997

999998
LOG_INFO((is_recreate ? "Recreating" : "Creating") << " pyramid infer requests...");
1000999
LOG_BLOCK();
10011000

1002-
const auto& pyramid_attention = proto_comp_model_desc.pyramid_attention.value();
1003-
const auto& compiled_models = pyramid_attention._compiled_models;
1001+
const auto& pyramid_models = submodel_desc.pyramid_attention.value()._compiled_models;
1002+
const size_t num_pyramid_models = pyramid_models.size();
10041003

10051004
// Clear existing requests if recreating
10061005
if (is_recreate) {
1007-
proto_comp_model_desc.pyramid_infer_requests.clear();
1008-
proto_comp_model_desc.pyramid_pipeline_requests.clear();
1006+
submodel_desc.pyramid_infer_requests.clear();
1007+
submodel_desc.pyramid_pipeline_requests.clear();
10091008
}
10101009

1011-
// Initialize pyramid infer requests storage
1012-
proto_comp_model_desc.pyramid_infer_requests.resize(compiled_models.size());
1010+
// Allocate storage for infer requests
1011+
submodel_desc.pyramid_infer_requests.resize(num_pyramid_models);
10131012
if (is_piped) {
1014-
proto_comp_model_desc.pyramid_pipeline_requests.resize(compiled_models.size());
1013+
submodel_desc.pyramid_pipeline_requests.resize(num_pyramid_models);
10151014
}
10161015

1017-
// Create infer requests for all pyramid models except the last one
1018-
// The last pyramid model uses the original model's infer requests
1019-
for (size_t model_id = 0; model_id < compiled_models.size() - 1; ++model_id) {
1016+
// Create infer requests for all but the last pyramid model
1017+
for (size_t model_idx = 0; model_idx + 1 < num_pyramid_models; ++model_idx) {
10201018
try {
1021-
// Create primary infer request
1022-
auto pyramid_request = compiled_models[model_id]->create_infer_request();
1023-
proto_comp_model_desc.pyramid_infer_requests[model_id] = pyramid_request;
1024-
1025-
// Create pipeline infer request if needed
1019+
// Create main infer request
1020+
submodel_desc.pyramid_infer_requests[model_idx] = pyramid_models[model_idx]->create_infer_request();
1021+
// Create pipeline infer request if pipelined
10261022
if (is_piped) {
1027-
auto pyramid_pipeline_request = compiled_models[model_id]->create_infer_request();
1028-
proto_comp_model_desc.pyramid_pipeline_requests[model_id] = pyramid_pipeline_request;
1023+
submodel_desc.pyramid_pipeline_requests[model_idx] = pyramid_models[model_idx]->create_infer_request();
10291024
}
1030-
10311025
} catch (const std::exception& ex) {
10321026
LOG_ERROR("Failed to " << (is_recreate ? "recreate" : "create") << " infer request for pyramid model["
1033-
<< model_id << "]: " << ex.what());
1027+
<< model_idx << "]: " << ex.what());
10341028
NPUW_ASSERT(false && "Pyramid model infer request creation/recreation failed");
10351029
} catch (...) {
10361030
LOG_ERROR("Failed to " << (is_recreate ? "recreate" : "create") << " infer request for pyramid model["
1037-
<< model_id << "]: Unknown error");
1031+
<< model_idx << "]: Unknown error");
10381032
NPUW_ASSERT(false && "Pyramid model infer request creation/recreation failed with unknown error");
10391033
}
10401034

1041-
// Setup tensor sharing for past_key_values inputs
1042-
for (auto input : compiled_models[model_id]->inputs()) {
1043-
if (input.get_names().empty()) {
1044-
continue;
1045-
}
1046-
1047-
auto input_name = input.get_any_name();
1048-
if (!ov::npuw::util::isPastKeyValuesKey(input_name) && !ov::npuw::util::isPastKeyValuesValue(input_name)) {
1049-
continue;
1050-
}
1051-
1052-
// Setup primary infer request tensor
1053-
auto tensor = proto_comp_model_desc.pyramid_infer_requests[model_id]->get_tensor(input);
1054-
auto primary_ptr = m_subrequests[real_idx]->get_tensor(input)->data();
1055-
auto new_tensor =
1056-
ov::get_tensor_impl(ov::Tensor(tensor->get_element_type(), tensor->get_shape(), primary_ptr));
1057-
proto_comp_model_desc.pyramid_infer_requests[model_id]->set_tensor(input, new_tensor);
1058-
1059-
// Setup pipeline infer request tensor if needed
1035+
// Share input tensors between pyramid and main infer requests
1036+
const size_t num_inputs = pyramid_models[model_idx]->inputs().size();
1037+
NPUW_ASSERT(num_inputs == submodel_desc.compiled_model->inputs().size());
1038+
for (size_t input_idx = 0; input_idx < num_inputs; ++input_idx) {
1039+
auto pyramid_input = pyramid_models[model_idx]->inputs()[input_idx];
1040+
auto main_input = submodel_desc.compiled_model->inputs()[input_idx];
1041+
1042+
// Get tensor from main infer request and share its memory with the pyramid infer request
1043+
auto main_tensor_ptr = m_subrequests[real_idx]->get_tensor(main_input)->data();
1044+
auto pyramid_tensor = submodel_desc.pyramid_infer_requests[model_idx]->get_tensor(pyramid_input);
1045+
auto shared_tensor = ov::get_tensor_impl(
1046+
ov::Tensor(pyramid_tensor->get_element_type(), pyramid_tensor->get_shape(), main_tensor_ptr));
1047+
submodel_desc.pyramid_infer_requests[model_idx]->set_tensor(pyramid_input, shared_tensor);
1048+
1049+
// Repeat for pipeline infer request if pipelined
10601050
if (is_piped) {
1061-
auto pipeline_tensor = proto_comp_model_desc.pyramid_pipeline_requests[model_id]->get_tensor(input);
1062-
auto pipeline_tensor_ptr = m_funcall_pipeline[real_idx].subrequest->get_tensor(input)->data();
1063-
auto pipeline_new_tensor = ov::get_tensor_impl(
1051+
auto pipeline_tensor = submodel_desc.pyramid_pipeline_requests[model_idx]->get_tensor(pyramid_input);
1052+
auto pipeline_tensor_ptr = m_funcall_pipeline[real_idx].subrequest->get_tensor(main_input)->data();
1053+
auto shared_pipeline_tensor = ov::get_tensor_impl(
10641054
ov::Tensor(pipeline_tensor->get_element_type(), pipeline_tensor->get_shape(), pipeline_tensor_ptr));
1065-
proto_comp_model_desc.pyramid_pipeline_requests[model_id]->set_tensor(input, pipeline_new_tensor);
1055+
submodel_desc.pyramid_pipeline_requests[model_idx]->set_tensor(pyramid_input, shared_pipeline_tensor);
10661056
}
10671057
}
10681058
}
10691059

10701060
// For the last pyramid model, reuse the original model's infer requests
1071-
if (compiled_models.size() > 0) {
1072-
const size_t last_model_id = compiled_models.size() - 1;
1061+
if (num_pyramid_models > 0) {
1062+
const size_t last_model_idx = num_pyramid_models - 1;
10731063
LOG_INFO("Reusing " << (is_recreate ? "recreated " : "") << "original infer requests for last pyramid model["
1074-
<< last_model_id << "]");
1075-
proto_comp_model_desc.pyramid_infer_requests[last_model_id] = m_subrequests[real_idx];
1064+
<< last_model_idx << "]");
1065+
submodel_desc.pyramid_infer_requests[last_model_idx] = m_subrequests[real_idx];
10761066
if (is_piped) {
1077-
proto_comp_model_desc.pyramid_pipeline_requests[last_model_id] = m_funcall_pipeline[real_idx].subrequest;
1067+
submodel_desc.pyramid_pipeline_requests[last_model_idx] = m_funcall_pipeline[real_idx].subrequest;
10781068
}
10791069
}
10801070

1081-
if (!is_recreate && compiled_models.size() > 0) {
1082-
LOG_INFO("Successfully created " << (compiled_models.size() - 1)
1071+
if (!is_recreate && num_pyramid_models > 0) {
1072+
LOG_INFO("Successfully created " << (num_pyramid_models - 1)
10831073
<< " new pyramid infer requests and reused 1 original request");
10841074
}
10851075
}

src/plugins/intel_npu/src/plugin/npuw/pyramid_attention.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -637,4 +637,4 @@ int64_t PositionIDs::past_length() const {
637637
} // namespace pyramid_attention
638638
} // namespace runtime
639639
} // namespace npuw
640-
} // namespace ov
640+
} // namespace ov

src/plugins/intel_npu/src/plugin/npuw/pyramid_attention.hpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -273,4 +273,4 @@ class PositionIDs final : public Selector {
273273
} // namespace runtime
274274

275275
} // namespace npuw
276-
} // namespace ov
276+
} // namespace ov

0 commit comments

Comments
 (0)