@@ -990,96 +990,86 @@ void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) {
990990}
991991
992992void ov::npuw::JustInferRequest::setup_pyramid_infer_requests (std::size_t real_idx, bool is_piped, bool is_recreate) {
993- auto & proto_comp_model_desc = m_npuw_model->m_compiled_submodels [real_idx];
994-
995- if (!proto_comp_model_desc.pyramid_attention .has_value ()) {
993+ auto & submodel_desc = m_npuw_model->m_compiled_submodels [real_idx];
994+ if (!submodel_desc.pyramid_attention .has_value ()) {
996995 return ;
997996 }
998997
999998 LOG_INFO ((is_recreate ? " Recreating" : " Creating" ) << " pyramid infer requests..." );
1000999 LOG_BLOCK ();
10011000
1002- const auto & pyramid_attention = proto_comp_model_desc .pyramid_attention .value ();
1003- const auto & compiled_models = pyramid_attention. _compiled_models ;
1001+ const auto & pyramid_models = submodel_desc .pyramid_attention .value (). _compiled_models ;
1002+ const size_t num_pyramid_models = pyramid_models. size () ;
10041003
10051004 // Clear existing requests if recreating
10061005 if (is_recreate) {
1007- proto_comp_model_desc .pyramid_infer_requests .clear ();
1008- proto_comp_model_desc .pyramid_pipeline_requests .clear ();
1006+ submodel_desc .pyramid_infer_requests .clear ();
1007+ submodel_desc .pyramid_pipeline_requests .clear ();
10091008 }
10101009
1011- // Initialize pyramid infer requests storage
1012- proto_comp_model_desc .pyramid_infer_requests .resize (compiled_models. size () );
1010+ // Allocate storage for infer requests
1011+ submodel_desc .pyramid_infer_requests .resize (num_pyramid_models );
10131012 if (is_piped) {
1014- proto_comp_model_desc .pyramid_pipeline_requests .resize (compiled_models. size () );
1013+ submodel_desc .pyramid_pipeline_requests .resize (num_pyramid_models );
10151014 }
10161015
1017- // Create infer requests for all pyramid models except the last one
1018- // The last pyramid model uses the original model's infer requests
1019- for (size_t model_id = 0 ; model_id < compiled_models.size () - 1 ; ++model_id) {
1016+ // Create infer requests for all but the last pyramid model
1017+ for (size_t model_idx = 0 ; model_idx + 1 < num_pyramid_models; ++model_idx) {
10201018 try {
1021- // Create primary infer request
1022- auto pyramid_request = compiled_models[model_id]->create_infer_request ();
1023- proto_comp_model_desc.pyramid_infer_requests [model_id] = pyramid_request;
1024-
1025- // Create pipeline infer request if needed
1019+ // Create main infer request
1020+ submodel_desc.pyramid_infer_requests [model_idx] = pyramid_models[model_idx]->create_infer_request ();
1021+ // Create pipeline infer request if pipelined
10261022 if (is_piped) {
1027- auto pyramid_pipeline_request = compiled_models[model_id]->create_infer_request ();
1028- proto_comp_model_desc.pyramid_pipeline_requests [model_id] = pyramid_pipeline_request;
1023+ submodel_desc.pyramid_pipeline_requests [model_idx] = pyramid_models[model_idx]->create_infer_request ();
10291024 }
1030-
10311025 } catch (const std::exception& ex) {
10321026 LOG_ERROR (" Failed to " << (is_recreate ? " recreate" : " create" ) << " infer request for pyramid model["
1033- << model_id << " ]: " << ex.what ());
1027+ << model_idx << " ]: " << ex.what ());
10341028 NPUW_ASSERT (false && " Pyramid model infer request creation/recreation failed" );
10351029 } catch (...) {
10361030 LOG_ERROR (" Failed to " << (is_recreate ? " recreate" : " create" ) << " infer request for pyramid model["
1037- << model_id << " ]: Unknown error" );
1031+ << model_idx << " ]: Unknown error" );
10381032 NPUW_ASSERT (false && " Pyramid model infer request creation/recreation failed with unknown error" );
10391033 }
10401034
1041- // Setup tensor sharing for past_key_values inputs
1042- for (auto input : compiled_models[model_id]->inputs ()) {
1043- if (input.get_names ().empty ()) {
1044- continue ;
1045- }
1046-
1047- auto input_name = input.get_any_name ();
1048- if (!ov::npuw::util::isPastKeyValuesKey (input_name) && !ov::npuw::util::isPastKeyValuesValue (input_name)) {
1049- continue ;
1050- }
1051-
1052- // Setup primary infer request tensor
1053- auto tensor = proto_comp_model_desc.pyramid_infer_requests [model_id]->get_tensor (input);
1054- auto primary_ptr = m_subrequests[real_idx]->get_tensor (input)->data ();
1055- auto new_tensor =
1056- ov::get_tensor_impl (ov::Tensor (tensor->get_element_type (), tensor->get_shape (), primary_ptr));
1057- proto_comp_model_desc.pyramid_infer_requests [model_id]->set_tensor (input, new_tensor);
1058-
1059- // Setup pipeline infer request tensor if needed
1035+ // Share input tensors between pyramid and main infer requests
1036+ const size_t num_inputs = pyramid_models[model_idx]->inputs ().size ();
1037+ NPUW_ASSERT (num_inputs == submodel_desc.compiled_model ->inputs ().size ());
1038+ for (size_t input_idx = 0 ; input_idx < num_inputs; ++input_idx) {
1039+ auto pyramid_input = pyramid_models[model_idx]->inputs ()[input_idx];
1040+ auto main_input = submodel_desc.compiled_model ->inputs ()[input_idx];
1041+
1042+ // Get tensor from main infer request and share its memory with the pyramid infer request
1043+ auto main_tensor_ptr = m_subrequests[real_idx]->get_tensor (main_input)->data ();
1044+ auto pyramid_tensor = submodel_desc.pyramid_infer_requests [model_idx]->get_tensor (pyramid_input);
1045+ auto shared_tensor = ov::get_tensor_impl (
1046+ ov::Tensor (pyramid_tensor->get_element_type (), pyramid_tensor->get_shape (), main_tensor_ptr));
1047+ submodel_desc.pyramid_infer_requests [model_idx]->set_tensor (pyramid_input, shared_tensor);
1048+
1049+ // Repeat for pipeline infer request if pipelined
10601050 if (is_piped) {
1061- auto pipeline_tensor = proto_comp_model_desc .pyramid_pipeline_requests [model_id ]->get_tensor (input );
1062- auto pipeline_tensor_ptr = m_funcall_pipeline[real_idx].subrequest ->get_tensor (input )->data ();
1063- auto pipeline_new_tensor = ov::get_tensor_impl (
1051+ auto pipeline_tensor = submodel_desc .pyramid_pipeline_requests [model_idx ]->get_tensor (pyramid_input );
1052+ auto pipeline_tensor_ptr = m_funcall_pipeline[real_idx].subrequest ->get_tensor (main_input )->data ();
1053+ auto shared_pipeline_tensor = ov::get_tensor_impl (
10641054 ov::Tensor (pipeline_tensor->get_element_type (), pipeline_tensor->get_shape (), pipeline_tensor_ptr));
1065- proto_comp_model_desc .pyramid_pipeline_requests [model_id ]->set_tensor (input, pipeline_new_tensor );
1055+ submodel_desc .pyramid_pipeline_requests [model_idx ]->set_tensor (pyramid_input, shared_pipeline_tensor );
10661056 }
10671057 }
10681058 }
10691059
10701060 // For the last pyramid model, reuse the original model's infer requests
1071- if (compiled_models. size () > 0 ) {
1072- const size_t last_model_id = compiled_models. size () - 1 ;
1061+ if (num_pyramid_models > 0 ) {
1062+ const size_t last_model_idx = num_pyramid_models - 1 ;
10731063 LOG_INFO (" Reusing " << (is_recreate ? " recreated " : " " ) << " original infer requests for last pyramid model["
1074- << last_model_id << " ]" );
1075- proto_comp_model_desc .pyramid_infer_requests [last_model_id ] = m_subrequests[real_idx];
1064+ << last_model_idx << " ]" );
1065+ submodel_desc .pyramid_infer_requests [last_model_idx ] = m_subrequests[real_idx];
10761066 if (is_piped) {
1077- proto_comp_model_desc .pyramid_pipeline_requests [last_model_id ] = m_funcall_pipeline[real_idx].subrequest ;
1067+ submodel_desc .pyramid_pipeline_requests [last_model_idx ] = m_funcall_pipeline[real_idx].subrequest ;
10781068 }
10791069 }
10801070
1081- if (!is_recreate && compiled_models. size () > 0 ) {
1082- LOG_INFO (" Successfully created " << (compiled_models. size () - 1 )
1071+ if (!is_recreate && num_pyramid_models > 0 ) {
1072+ LOG_INFO (" Successfully created " << (num_pyramid_models - 1 )
10831073 << " new pyramid infer requests and reused 1 original request" );
10841074 }
10851075}
0 commit comments