@@ -185,23 +185,10 @@ ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) {
185185ov::npuw::JustInferRequest::JustInferRequest (const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
186186 : IBaseInferRequest(compiled_model),
187187 m_func_mem_mgr(compiled_model) {
188- // Memory tracking: Record initial RSS
189- size_t initial_memory_kb = ov::npuw::get_process_memory_kb ();
190- std::cout << " \n === JustInferRequest Constructor Memory Tracking Start: " << initial_memory_kb
191- << " KB RSS ===" << std::endl;
192-
193188 using namespace std ::placeholders;
194-
195- // Step 1: Function memory manager setup
196- size_t before_func_mgr_kb = ov::npuw::get_process_memory_kb ();
197189 m_func_mem_mgr.set_alloc (std::bind (&JustInferRequest::allocMem, this , _1, _2, _3));
198190 m_func_mem_mgr.assign_memory ();
199- size_t after_func_mgr_kb = ov::npuw::get_process_memory_kb ();
200- std::cout << " [Step 1] Function memory manager setup: +" << (after_func_mgr_kb - before_func_mgr_kb) << " KB"
201- << std::endl;
202191
203- // Step 2: Configuration setup
204- size_t before_config_kb = ov::npuw::get_process_memory_kb ();
205192 m_closure_update_required = m_npuw_model->m_cfg .get <::intel_npu::NPUW_FOLD>();
206193 m_use_function_pipelining = m_npuw_model->m_cfg .get <::intel_npu::NPUW_FUNCALL_ASYNC>();
207194 if (m_use_function_pipelining) {
@@ -212,11 +199,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
212199
213200 m_spatial_io.resize (m_num_submodels);
214201 m_attention_io.resize (m_num_submodels);
215- size_t after_config_kb = ov::npuw::get_process_memory_kb ();
216- std::cout << " [Step 2] Configuration and resize: +" << (after_config_kb - before_config_kb) << " KB" << std::endl;
217202
218- // Step 3: Create infer requests loop
219- size_t before_infer_requests_kb = ov::npuw::get_process_memory_kb ();
220203 // Create infer requests
221204 // Preallocate funcall tensors & substitute function call requests
222205 bool failover_happened = false ;
@@ -225,10 +208,6 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
225208 bool has_pyramid = false ;
226209 std::size_t dynamic_sub_idx = -1 ;
227210 std::size_t pyramid_sub_idx = -1 ;
228-
229- size_t total_create_infer_request_kb = 0 ;
230- size_t total_pyramid_setup_kb = 0 ;
231-
232211 for (size_t i = 0 ; i < m_num_submodels; i++) {
233212 LOG_INFO (" Creating infer request for Subgraph[" << i << " ]..." );
234213 LOG_BLOCK ();
@@ -311,15 +290,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
311290 // Special cases are handled -- so nothing to do here
312291 const bool is_piped = is_pipelined (i);
313292 bool recompiled = false ;
314-
315- // Measure create_infer_requests impact
316- size_t before_create_rq_kb = ov::npuw::get_process_memory_kb ();
317293 auto rqs = create_infer_requests (i, is_piped ? 2 : 1 , &recompiled);
318- size_t after_create_rq_kb = ov::npuw::get_process_memory_kb ();
319- size_t create_rq_increase_kb =
320- (after_create_rq_kb > before_create_rq_kb) ? (after_create_rq_kb - before_create_rq_kb) : 0 ;
321- total_create_infer_request_kb += create_rq_increase_kb;
322-
323294 failover_happened |= recompiled;
324295 m_subrequests[i] = rqs.at (0 );
325296 m_subrequest_devices[i] = *comp_model_desc.device_it ;
@@ -330,29 +301,12 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
330301 // Create infer requests for pyramid attention models if present (only for function calls)
331302 if (comp_model_desc.replaced_by ) {
332303 const auto real_idx = comp_model_desc.replaced_by .value ();
333-
334- // Measure pyramid setup impact
335- size_t before_pyramid_kb = ov::npuw::get_process_memory_kb ();
336304 setup_pyramid_infer_requests (real_idx, is_piped, false );
337- size_t after_pyramid_kb = ov::npuw::get_process_memory_kb ();
338- size_t pyramid_increase_kb =
339- (after_pyramid_kb > before_pyramid_kb) ? (after_pyramid_kb - before_pyramid_kb) : 0 ;
340- total_pyramid_setup_kb += pyramid_increase_kb;
341-
342- if (pyramid_increase_kb > 0 ) {
343- std::cout << " [Subgraph " << i << " ] Pyramid setup: +" << pyramid_increase_kb << " KB" << std::endl;
344- }
345305 }
346306
347307 LOG_INFO (" DONE" );
348308 } // for(submodels)
349309
350- size_t after_infer_requests_kb = ov::npuw::get_process_memory_kb ();
351- std::cout << " [Step 3] Create infer requests loop TOTAL: +" << (after_infer_requests_kb - before_infer_requests_kb)
352- << " KB" << std::endl;
353- std::cout << " - create_infer_requests cumulative: +" << total_create_infer_request_kb << " KB" << std::endl;
354- std::cout << " - pyramid setup cumulative: +" << total_pyramid_setup_kb << " KB" << std::endl;
355-
356310 if (failover_happened) {
357311 LOG_INFO (" Refined device distribution:" );
358312 LOG_BLOCK ();
@@ -390,17 +344,10 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
390344 }
391345 } // if(function_pipelining)
392346
393- // Step 4: alloc_io, connect_subrequests, init_gio
394- size_t before_connections_kb = ov::npuw::get_process_memory_kb ();
395347 alloc_io ();
396348 connect_subrequests ();
397349 init_gio ();
398- size_t after_connections_kb = ov::npuw::get_process_memory_kb ();
399- std::cout << " [Step 4] alloc_io + connect_subrequests + init_gio: +"
400- << (after_connections_kb - before_connections_kb) << " KB" << std::endl;
401350
402- // Step 5: Preemptive tensor setting
403- size_t before_tensors_kb = ov::npuw::get_process_memory_kb ();
404351 for (size_t i = 0 ; i < m_num_submodels; i++) {
405352 LOG_VERB (" Trying to preemptively set tensors for Subgraph[" << i << " ]..." );
406353 LOG_BLOCK ();
@@ -420,12 +367,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
420367 }
421368 LOG_VERB (" Done" );
422369 }
423- size_t after_tensors_kb = ov::npuw::get_process_memory_kb ();
424- std::cout << " [Step 5] Preemptive tensor setting: +" << (after_tensors_kb - before_tensors_kb) << " KB"
425- << std::endl;
426370
427371 // Handle spatial dynamic submission
428- size_t before_spatial_kb = ov::npuw::get_process_memory_kb ();
429372 if (has_spatial) {
430373 if (m_npuw_model->m_cfg .get <::intel_npu::NPUW_SPATIAL_DYN>()) {
431374 LOG_VERB (" Finding spatial features..." );
@@ -442,11 +385,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
442385 }
443386 LOG_VERB (" Done" );
444387 }
445- size_t after_spatial_kb = ov::npuw::get_process_memory_kb ();
446- std::cout << " [Step 6] Spatial selector setup: +" << (after_spatial_kb - before_spatial_kb) << " KB" << std::endl;
447388
448389 // Handle dynamic submission
449- size_t before_dynamic_kb = ov::npuw::get_process_memory_kb ();
450390 if (has_dynamic) {
451391 if (!m_npuw_model->m_cfg .get <::intel_npu::NPUW_ATTN_DYN>()) {
452392 // Even if the attention is detected and ready to go dynamic,
@@ -463,12 +403,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
463403 }
464404 LOG_VERB (" Done" );
465405 }
466- size_t after_dynamic_kb = ov::npuw::get_process_memory_kb ();
467- std::cout << " [Step 7] Dynamic attention selector setup: +" << (after_dynamic_kb - before_dynamic_kb) << " KB"
468- << std::endl;
469406
470407 // Handle pyramid attention
471- size_t before_pyramid_selector_kb = ov::npuw::get_process_memory_kb ();
472408 if (has_pyramid) {
473409 const auto & pyramid_dyn = m_npuw_model->m_compiled_submodels .at (pyramid_sub_idx).pyramid_attention .value ();
474410 const auto pyramid_count = pyramid_dyn._compiled_models .size ();
@@ -485,17 +421,6 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
485421 }
486422 }
487423 }
488- size_t after_pyramid_selector_kb = ov::npuw::get_process_memory_kb ();
489- std::cout << " [Step 8] Pyramid selector setup: +" << (after_pyramid_selector_kb - before_pyramid_selector_kb)
490- << " KB" << std::endl;
491-
492- // Final summary
493- size_t final_memory_kb = ov::npuw::get_process_memory_kb ();
494- size_t total_increase_kb = (final_memory_kb > initial_memory_kb) ? (final_memory_kb - initial_memory_kb) : 0 ;
495- std::cout << " === JustInferRequest Constructor Memory Tracking End ===" << std::endl;
496- std::cout << " Total RSS increase: " << total_increase_kb << " KB (" << (total_increase_kb / 1024 ) << " MB)"
497- << std::endl;
498- std::cout << " Final RSS: " << final_memory_kb << " KB (" << (final_memory_kb / 1024 ) << " MB)\n " << std::endl;
499424}
500425
501426void ov::npuw::JustInferRequest::set_tensor (const ov::Output<const ov::Node>& port,
@@ -610,7 +535,6 @@ void ov::npuw::JustInferRequest::prepare_for_infer() {
610535
611536 // Get the pyramid model ID based on current sequence length (updated in prepare())
612537 auto pyramid_id = m_pyramid_selector->pyramid_id ();
613- std::cout << " Switch to pyramid id: " << pyramid_id << std::endl;
614538
615539 for (auto && id : m_funcall_heads) {
616540 auto & comp_model_desc = m_npuw_model->m_compiled_submodels [id];
@@ -711,19 +635,12 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) {
711635 const bool is_dynamic = func_desc.attention .has_value ();
712636 const bool is_pyramid = func_desc.pyramid_attention .has_value ();
713637
714- const auto non_dynamic_act_in = [](const ov::npuw::compiled::Attention& d, std::size_t in_idx) {
715- const bool not_param = std::none_of (d.params .begin (), d.params .end (), [&](auto && p) {
716- return p.idx == in_idx;
717- });
718- const bool not_mask = in_idx != d.mask_idx ;
719- return not_param && not_mask;
720- };
721-
722- const auto non_pyramid_act_in = [](const ov::npuw::compiled::PyramidAttentionInfo& d, std::size_t in_idx) {
723- const bool not_param = std::none_of (d.params .begin (), d.params .end (), [&](auto && p) {
638+ // Generalized: check if input is neither param nor mask
639+ auto is_non_param_mask = [](const auto & info, std::size_t in_idx) {
640+ const bool not_param = std::none_of (info.params .begin (), info.params .end (), [&](auto && p) {
724641 return p.idx == in_idx;
725642 });
726- const bool not_mask = in_idx != d .mask_idx ;
643+ const bool not_mask = in_idx != info .mask_idx ;
727644 return not_param && not_mask;
728645 };
729646
@@ -759,14 +676,16 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) {
759676 m_spatial_io[real_idx].inputs .at (i) = i_tensor;
760677 } else if (is_dynamic) {
761678 // Set tensor only if it is non-dynamic (dynamic are managed by the infer_dynamic)
762- if (non_dynamic_act_in (*func_desc.attention , i)) {
679+ if (is_non_param_mask (*func_desc.attention , i)) {
763680 m_subrequests[real_idx]->set_tensor (iport, i_tensor);
764681 } else {
765682 m_attention_io[idx].inputs .at (i) = i_tensor;
766683 }
767684 } else if (is_pyramid) {
685+ // Pyramid attention
768686 auto pyramid_id = m_pyramid_selector->pyramid_id ();
769- if (non_pyramid_act_in (func_desc.pyramid_attention .value ()._attention_infos [pyramid_id], i)) {
687+ const auto & info = func_desc.pyramid_attention .value ()._attention_infos [pyramid_id];
688+ if (is_non_param_mask (info, i)) {
770689 m_subrequests[real_idx]->set_tensor (iport, i_tensor);
771690 } else {
772691 m_attention_io[idx].inputs .at (i) = i_tensor;
0 commit comments