Skip to content

Commit 6f6508d

Browse files
committed
Refine JustInferRequest.
Signed-off-by: intelgaoxiong <[email protected]>
1 parent bdee912 commit 6f6508d

File tree

1 file changed

+8
-89
lines changed

1 file changed

+8
-89
lines changed

src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp

Lines changed: 8 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -185,23 +185,10 @@ ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) {
185185
ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
186186
: IBaseInferRequest(compiled_model),
187187
m_func_mem_mgr(compiled_model) {
188-
// Memory tracking: Record initial RSS
189-
size_t initial_memory_kb = ov::npuw::get_process_memory_kb();
190-
std::cout << "\n=== JustInferRequest Constructor Memory Tracking Start: " << initial_memory_kb
191-
<< " KB RSS ===" << std::endl;
192-
193188
using namespace std::placeholders;
194-
195-
// Step 1: Function memory manager setup
196-
size_t before_func_mgr_kb = ov::npuw::get_process_memory_kb();
197189
m_func_mem_mgr.set_alloc(std::bind(&JustInferRequest::allocMem, this, _1, _2, _3));
198190
m_func_mem_mgr.assign_memory();
199-
size_t after_func_mgr_kb = ov::npuw::get_process_memory_kb();
200-
std::cout << "[Step 1] Function memory manager setup: +" << (after_func_mgr_kb - before_func_mgr_kb) << " KB"
201-
<< std::endl;
202191

203-
// Step 2: Configuration setup
204-
size_t before_config_kb = ov::npuw::get_process_memory_kb();
205192
m_closure_update_required = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FOLD>();
206193
m_use_function_pipelining = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();
207194
if (m_use_function_pipelining) {
@@ -212,11 +199,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
212199

213200
m_spatial_io.resize(m_num_submodels);
214201
m_attention_io.resize(m_num_submodels);
215-
size_t after_config_kb = ov::npuw::get_process_memory_kb();
216-
std::cout << "[Step 2] Configuration and resize: +" << (after_config_kb - before_config_kb) << " KB" << std::endl;
217202

218-
// Step 3: Create infer requests loop
219-
size_t before_infer_requests_kb = ov::npuw::get_process_memory_kb();
220203
// Create infer requests
221204
// Preallocate funcall tensors & substitute function call requests
222205
bool failover_happened = false;
@@ -225,10 +208,6 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
225208
bool has_pyramid = false;
226209
std::size_t dynamic_sub_idx = -1;
227210
std::size_t pyramid_sub_idx = -1;
228-
229-
size_t total_create_infer_request_kb = 0;
230-
size_t total_pyramid_setup_kb = 0;
231-
232211
for (size_t i = 0; i < m_num_submodels; i++) {
233212
LOG_INFO("Creating infer request for Subgraph[" << i << "]...");
234213
LOG_BLOCK();
@@ -311,15 +290,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
311290
// Special cases are handled -- so nothing to do here
312291
const bool is_piped = is_pipelined(i);
313292
bool recompiled = false;
314-
315-
// Measure create_infer_requests impact
316-
size_t before_create_rq_kb = ov::npuw::get_process_memory_kb();
317293
auto rqs = create_infer_requests(i, is_piped ? 2 : 1, &recompiled);
318-
size_t after_create_rq_kb = ov::npuw::get_process_memory_kb();
319-
size_t create_rq_increase_kb =
320-
(after_create_rq_kb > before_create_rq_kb) ? (after_create_rq_kb - before_create_rq_kb) : 0;
321-
total_create_infer_request_kb += create_rq_increase_kb;
322-
323294
failover_happened |= recompiled;
324295
m_subrequests[i] = rqs.at(0);
325296
m_subrequest_devices[i] = *comp_model_desc.device_it;
@@ -330,29 +301,12 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
330301
// Create infer requests for pyramid attention models if present (only for function calls)
331302
if (comp_model_desc.replaced_by) {
332303
const auto real_idx = comp_model_desc.replaced_by.value();
333-
334-
// Measure pyramid setup impact
335-
size_t before_pyramid_kb = ov::npuw::get_process_memory_kb();
336304
setup_pyramid_infer_requests(real_idx, is_piped, false);
337-
size_t after_pyramid_kb = ov::npuw::get_process_memory_kb();
338-
size_t pyramid_increase_kb =
339-
(after_pyramid_kb > before_pyramid_kb) ? (after_pyramid_kb - before_pyramid_kb) : 0;
340-
total_pyramid_setup_kb += pyramid_increase_kb;
341-
342-
if (pyramid_increase_kb > 0) {
343-
std::cout << " [Subgraph " << i << "] Pyramid setup: +" << pyramid_increase_kb << " KB" << std::endl;
344-
}
345305
}
346306

347307
LOG_INFO("DONE");
348308
} // for(submodels)
349309

350-
size_t after_infer_requests_kb = ov::npuw::get_process_memory_kb();
351-
std::cout << "[Step 3] Create infer requests loop TOTAL: +" << (after_infer_requests_kb - before_infer_requests_kb)
352-
<< " KB" << std::endl;
353-
std::cout << " - create_infer_requests cumulative: +" << total_create_infer_request_kb << " KB" << std::endl;
354-
std::cout << " - pyramid setup cumulative: +" << total_pyramid_setup_kb << " KB" << std::endl;
355-
356310
if (failover_happened) {
357311
LOG_INFO("Refined device distribution:");
358312
LOG_BLOCK();
@@ -390,17 +344,10 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
390344
}
391345
} // if(function_pipelining)
392346

393-
// Step 4: alloc_io, connect_subrequests, init_gio
394-
size_t before_connections_kb = ov::npuw::get_process_memory_kb();
395347
alloc_io();
396348
connect_subrequests();
397349
init_gio();
398-
size_t after_connections_kb = ov::npuw::get_process_memory_kb();
399-
std::cout << "[Step 4] alloc_io + connect_subrequests + init_gio: +"
400-
<< (after_connections_kb - before_connections_kb) << " KB" << std::endl;
401350

402-
// Step 5: Preemptive tensor setting
403-
size_t before_tensors_kb = ov::npuw::get_process_memory_kb();
404351
for (size_t i = 0; i < m_num_submodels; i++) {
405352
LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]...");
406353
LOG_BLOCK();
@@ -420,12 +367,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
420367
}
421368
LOG_VERB("Done");
422369
}
423-
size_t after_tensors_kb = ov::npuw::get_process_memory_kb();
424-
std::cout << "[Step 5] Preemptive tensor setting: +" << (after_tensors_kb - before_tensors_kb) << " KB"
425-
<< std::endl;
426370

427371
// Handle spatial dynamic submission
428-
size_t before_spatial_kb = ov::npuw::get_process_memory_kb();
429372
if (has_spatial) {
430373
if (m_npuw_model->m_cfg.get<::intel_npu::NPUW_SPATIAL_DYN>()) {
431374
LOG_VERB("Finding spatial features...");
@@ -442,11 +385,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
442385
}
443386
LOG_VERB("Done");
444387
}
445-
size_t after_spatial_kb = ov::npuw::get_process_memory_kb();
446-
std::cout << "[Step 6] Spatial selector setup: +" << (after_spatial_kb - before_spatial_kb) << " KB" << std::endl;
447388

448389
// Handle dynamic submission
449-
size_t before_dynamic_kb = ov::npuw::get_process_memory_kb();
450390
if (has_dynamic) {
451391
if (!m_npuw_model->m_cfg.get<::intel_npu::NPUW_ATTN_DYN>()) {
452392
// Even if the attention is detected and ready to go dynamic,
@@ -463,12 +403,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
463403
}
464404
LOG_VERB("Done");
465405
}
466-
size_t after_dynamic_kb = ov::npuw::get_process_memory_kb();
467-
std::cout << "[Step 7] Dynamic attention selector setup: +" << (after_dynamic_kb - before_dynamic_kb) << " KB"
468-
<< std::endl;
469406

470407
// Handle pyramid attention
471-
size_t before_pyramid_selector_kb = ov::npuw::get_process_memory_kb();
472408
if (has_pyramid) {
473409
const auto& pyramid_dyn = m_npuw_model->m_compiled_submodels.at(pyramid_sub_idx).pyramid_attention.value();
474410
const auto pyramid_count = pyramid_dyn._compiled_models.size();
@@ -485,17 +421,6 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
485421
}
486422
}
487423
}
488-
size_t after_pyramid_selector_kb = ov::npuw::get_process_memory_kb();
489-
std::cout << "[Step 8] Pyramid selector setup: +" << (after_pyramid_selector_kb - before_pyramid_selector_kb)
490-
<< " KB" << std::endl;
491-
492-
// Final summary
493-
size_t final_memory_kb = ov::npuw::get_process_memory_kb();
494-
size_t total_increase_kb = (final_memory_kb > initial_memory_kb) ? (final_memory_kb - initial_memory_kb) : 0;
495-
std::cout << "=== JustInferRequest Constructor Memory Tracking End ===" << std::endl;
496-
std::cout << "Total RSS increase: " << total_increase_kb << " KB (" << (total_increase_kb / 1024) << " MB)"
497-
<< std::endl;
498-
std::cout << "Final RSS: " << final_memory_kb << " KB (" << (final_memory_kb / 1024) << " MB)\n" << std::endl;
499424
}
500425

501426
void ov::npuw::JustInferRequest::set_tensor(const ov::Output<const ov::Node>& port,
@@ -610,7 +535,6 @@ void ov::npuw::JustInferRequest::prepare_for_infer() {
610535

611536
// Get the pyramid model ID based on current sequence length (updated in prepare())
612537
auto pyramid_id = m_pyramid_selector->pyramid_id();
613-
std::cout << "Switch to pyramid id: " << pyramid_id << std::endl;
614538

615539
for (auto&& id : m_funcall_heads) {
616540
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[id];
@@ -711,19 +635,12 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) {
711635
const bool is_dynamic = func_desc.attention.has_value();
712636
const bool is_pyramid = func_desc.pyramid_attention.has_value();
713637

714-
const auto non_dynamic_act_in = [](const ov::npuw::compiled::Attention& d, std::size_t in_idx) {
715-
const bool not_param = std::none_of(d.params.begin(), d.params.end(), [&](auto&& p) {
716-
return p.idx == in_idx;
717-
});
718-
const bool not_mask = in_idx != d.mask_idx;
719-
return not_param && not_mask;
720-
};
721-
722-
const auto non_pyramid_act_in = [](const ov::npuw::compiled::PyramidAttentionInfo& d, std::size_t in_idx) {
723-
const bool not_param = std::none_of(d.params.begin(), d.params.end(), [&](auto&& p) {
638+
// Generalized: check if input is neither param nor mask
639+
auto is_non_param_mask = [](const auto& info, std::size_t in_idx) {
640+
const bool not_param = std::none_of(info.params.begin(), info.params.end(), [&](auto&& p) {
724641
return p.idx == in_idx;
725642
});
726-
const bool not_mask = in_idx != d.mask_idx;
643+
const bool not_mask = in_idx != info.mask_idx;
727644
return not_param && not_mask;
728645
};
729646

@@ -759,14 +676,16 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) {
759676
m_spatial_io[real_idx].inputs.at(i) = i_tensor;
760677
} else if (is_dynamic) {
761678
// Set tensor only if it is non-dynamic (dynamic are managed by the infer_dynamic)
762-
if (non_dynamic_act_in(*func_desc.attention, i)) {
679+
if (is_non_param_mask(*func_desc.attention, i)) {
763680
m_subrequests[real_idx]->set_tensor(iport, i_tensor);
764681
} else {
765682
m_attention_io[idx].inputs.at(i) = i_tensor;
766683
}
767684
} else if (is_pyramid) {
685+
// Pyramid attention
768686
auto pyramid_id = m_pyramid_selector->pyramid_id();
769-
if (non_pyramid_act_in(func_desc.pyramid_attention.value()._attention_infos[pyramid_id], i)) {
687+
const auto& info = func_desc.pyramid_attention.value()._attention_infos[pyramid_id];
688+
if (is_non_param_mask(info, i)) {
770689
m_subrequests[real_idx]->set_tensor(iport, i_tensor);
771690
} else {
772691
m_attention_io[idx].inputs.at(i) = i_tensor;

0 commit comments

Comments (0)