From 858a852d63146d62139c79cd8292c4e090aa672e Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 8 Nov 2024 14:26:12 -0500 Subject: [PATCH 1/8] Provide mechanism to discard data Add a function `parsec_data_discard` that releases the data such that the host copy remains intact but does not prevent destruction of the data once all device copies have been released. This keeps the host copy available for device copies to inspect and avoids potential race conditions in the release process. During an eviction, copies of data with a discarded host copy are not transfered but put directly into the lru. Signed-off-by: Joseph Schuchart --- parsec/data.c | 31 ++++++++++- parsec/data.h | 10 ++++ parsec/mca/device/device_gpu.c | 90 ++++++++++++++++---------------- parsec/mca/device/transfer_gpu.c | 13 ++++- 4 files changed, 97 insertions(+), 47 deletions(-) diff --git a/parsec/data.c b/parsec/data.c index 8dffaa027..f49d74965 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -194,7 +194,11 @@ int parsec_data_copy_detach(parsec_data_t* data, copy->original = NULL; copy->older = NULL; - PARSEC_OBJ_RELEASE(data); + /* if the host copy is discarded it has already released its reference so + * we do not release the data again */ + if (!(copy->flags & PARSEC_DATA_FLAG_DISCARDED)) { + PARSEC_OBJ_RELEASE(data); + } return PARSEC_SUCCESS; } @@ -559,3 +563,28 @@ parsec_data_destroy( parsec_data_t *data ) #endif PARSEC_OBJ_RELEASE(data); } + +void +parsec_data_discard( parsec_data_t *data ) +{ + + /* first release the reference the application held */ + PARSEC_OBJ_RELEASE(data); + + /* second, mark the host copy as discarded */ + parsec_data_copy_t *cpu_copy = data->device_copies[0]; + if (NULL != cpu_copy) { + cpu_copy->flags = PARSEC_DATA_FLAG_DISCARDED; + } + + /* third: release the reference that the host copy had on the data_t to break + * the circular reference. 
*/ + PARSEC_OBJ_RELEASE(data); + + /* From here, any device copy that is still attached to the data_t + * can continue to use the host copy and once all device copies are + * detached the data_t and the host copy are destroyed. + * If there were no device copies then the release above will + * have destroyed the data_t already. */ + +} diff --git a/parsec/data.h b/parsec/data.h index e94d56df5..8cf5e9905 100644 --- a/parsec/data.h +++ b/parsec/data.h @@ -55,6 +55,7 @@ typedef uint8_t parsec_data_status_t; typedef uint8_t parsec_data_flag_t; #define PARSEC_DATA_FLAG_ARENA ((parsec_data_flag_t)1<<0) #define PARSEC_DATA_FLAG_TRANSIT ((parsec_data_flag_t)1<<1) +#define PARSEC_DATA_FLAG_DISCARDED ((parsec_data_flag_t)1<<2) #define PARSEC_DATA_FLAG_PARSEC_MANAGED ((parsec_data_flag_t)1<<6) #define PARSEC_DATA_FLAG_PARSEC_OWNED ((parsec_data_flag_t)1<<7) @@ -144,6 +145,15 @@ parsec_data_create_with_type( parsec_data_collection_t *desc, PARSEC_DECLSPEC void parsec_data_destroy( parsec_data_t *holder ); +/** + * Discard the parsec_data_t. Any host-side copies will remain + * allocated as long as there are potential device copies referencing it. + * Once it is safe, all copies are dereferenced and eventually destroyed. + * The parsec_data_t must not be used after this call. + */ +PARSEC_DECLSPEC void +parsec_data_discard( parsec_data_t *data ); + END_C_DECLS /** @} */ diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index d932e975e..0d9289584 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -400,6 +400,35 @@ static parsec_task_class_t parsec_device_data_prefetch_tc = { .fini = NULL }; +/** + * Release a gpu copy to the zone allocator. 
+ */ +static void +parsec_device_release_gpu_copy(parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t *gpu_elem) +{ +#if !defined(PARSEC_GPU_ALLOC_PER_TILE) +#if defined(PARSEC_PROF_TRACE) + if((gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) && + (gpu_device->exec_stream[0]->prof_event_track_enable || + gpu_device->exec_stream[1]->prof_event_track_enable)) { + parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling, + parsec_gpu_free_memory_key, (int64_t)gpu_elem->device_private, + gpu_device->super.device_index, + NULL, PARSEC_PROFILING_EVENT_COUNTER); + parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling, + parsec_gpu_use_memory_key_end, + (uint64_t)gpu_elem->device_private, + gpu_device->super.device_index, NULL, 0); + } +#endif // PARSEC_PROF_TRACE + assert( 0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); + zone_free( gpu_device->memory, (void*)(gpu_elem->device_private) ); + gpu_elem->device_private = NULL; + PARSEC_OBJ_RELEASE(gpu_elem); + assert( NULL == gpu_elem ); +#endif // PARSEC_GPU_ALLOC_PER_TILE +} + static int parsec_device_release_resources_prefetch_task(parsec_device_gpu_module_t* gpu_device, parsec_gpu_task_t** out_task) @@ -742,24 +771,6 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de #if defined(PARSEC_GPU_ALLOC_PER_TILE) gpu_device->memory_free( gpu_copy->device_private ); -#else - -#if defined(PARSEC_PROF_TRACE) - if((gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) && - (gpu_device->exec_stream[0]->prof_event_track_enable || - gpu_device->exec_stream[1]->prof_event_track_enable)) { - parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling, - parsec_gpu_free_memory_key, (int64_t)gpu_copy->device_private, - gpu_device->super.device_index, - NULL, PARSEC_PROFILING_EVENT_COUNTER); - parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling, - parsec_gpu_use_memory_key_end, - (uint64_t)gpu_copy->device_private, - 
gpu_device->super.device_index, NULL, 0); - } -#endif - zone_free( gpu_device->memory, (void*)gpu_copy->device_private ); -#endif gpu_copy->device_private = NULL; /* At this point the data copies should have no attachment to a data_t. Thus, @@ -767,6 +778,9 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de * collection must have been called, releasing all the copies. */ PARSEC_OBJ_RELEASE(gpu_copy); assert(NULL == gpu_copy); +#else + parsec_device_release_gpu_copy(gpu_device, gpu_copy); +#endif } } @@ -1069,34 +1083,12 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, #if !defined(PARSEC_GPU_ALLOC_PER_TILE) /* Let's free this space, and try again to malloc some space */ PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, - "GPU[%d:%s] Release GPU copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p", - gpu_device->super.device_index, gpu_device->super.name, + "GPU[%d:%s]:%s Release GPU copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p", + gpu_device->super.device_index, gpu_device->super.name, task_name, lru_gpu_elem, lru_gpu_elem->device_private, lru_gpu_elem->super.super.obj_reference_count, oldmaster); -#if defined(PARSEC_PROF_TRACE) - if((gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) && - (gpu_device->exec_stream[0]->prof_event_track_enable || - gpu_device->exec_stream[1]->prof_event_track_enable)) { - parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling, - parsec_gpu_free_memory_key, (int64_t)lru_gpu_elem->device_private, - gpu_device->super.device_index, - NULL, PARSEC_PROFILING_EVENT_COUNTER); - parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling, - parsec_gpu_use_memory_key_end, - (uint64_t)lru_gpu_elem->device_private, - gpu_device->super.device_index, NULL, 0); - } -#endif - assert( 0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); - zone_free( gpu_device->memory, (void*)(lru_gpu_elem->device_private) ); - 
lru_gpu_elem->device_private = NULL; + parsec_device_release_gpu_copy(gpu_device, lru_gpu_elem); data_avail_epoch++; - PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream, - "GPU[%d:%s]:%s: Release LRU-retrieved GPU copy %p [ref_count %d: must be 1]", - gpu_device->super.device_index, gpu_device->super.name, task_name, - lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count); - PARSEC_OBJ_RELEASE(lru_gpu_elem); - assert( NULL == lru_gpu_elem ); goto malloc_data; } PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream, @@ -2342,7 +2334,17 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, assert( 0 <= gpu_copy->readers ); - if( gpu_task->pushout & (1 << i) ) { + if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) { + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%d:%s] Release GPU copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p", + gpu_device->super.device_index, gpu_device->super.name, + gpu_copy, gpu_copy->device_private, gpu_copy->super.super.obj_reference_count, + original); + parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); + PARSEC_LIST_ITEM_SINGLETON(gpu_copy); + /* release the original and */ + parsec_device_release_gpu_copy(gpu_device, gpu_copy); + } else if( gpu_task->pushout & (1 << i)) { PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU copy %p [ref_count %d] moved to the read LRU in %s", gpu_copy, gpu_copy->super.super.obj_reference_count, __func__); diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c index 5d0afb61b..e98590fc0 100644 --- a/parsec/mca/device/transfer_gpu.c +++ b/parsec/mca/device/transfer_gpu.c @@ -227,7 +227,7 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, { parsec_gpu_task_t *w2r_task = NULL; parsec_gpu_d2h_task_t *d2h_task = NULL; - parsec_gpu_data_copy_t *gpu_copy; + parsec_gpu_data_copy_t *gpu_copy, *cpu_copy; parsec_list_item_t* item = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_next; int 
nb_cleaned = 0; @@ -239,10 +239,19 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, break; } gpu_copy = (parsec_gpu_data_copy_t*)item; + cpu_copy = gpu_copy->original->device_copies[0]; parsec_atomic_lock( &gpu_copy->original->lock ); /* get the next item before altering the next pointer */ item = (parsec_list_item_t*)item->list_next; /* conversion needed for volatile */ - if( 0 == gpu_copy->readers ) { + if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) { + parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); + PARSEC_LIST_ITEM_SINGLETON(gpu_copy); + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, + "D2H[%d:%s] GPU data copy %p of discarded data %p now available", + gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->original); + parsec_atomic_unlock( &gpu_copy->original->lock ); + parsec_list_push_back(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_copy); + } else if( 0 == gpu_copy->readers ) { if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* allocate on-demand */ d2h_task = (parsec_gpu_d2h_task_t*)parsec_thread_mempool_allocate(es->context_mempool); if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* we're running out of memory. Bail out. */ From e09cd5ca11dbad31d56b926e09eaa5bfb6ba578b Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 13 Nov 2024 18:20:55 -0500 Subject: [PATCH 2/8] parsec_data_t may be destroyed without references left Otherwise we cannot destroy empty or discarded data. 
Signed-off-by: Joseph Schuchart --- parsec/data.c | 1 - 1 file changed, 1 deletion(-) diff --git a/parsec/data.c b/parsec/data.c index f49d74965..052c886dc 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -90,7 +90,6 @@ static void parsec_data_destruct(parsec_data_t* obj ) copy, copy->original, i); } #endif /* defined(PARSEC_DEBUG_PARANOID) */ - assert(obj->super.obj_reference_count > 1); parsec_data_copy_detach( obj, copy, i ); if ( !(device->type & PARSEC_DEV_CUDA) && !(device->type & PARSEC_DEV_HIP) ) { From 0644c6ce5824be1a59a6d9f2e4d17969d43a2f98 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 15 Nov 2024 11:41:37 -0500 Subject: [PATCH 3/8] Release device copy if the data has been discarded Signed-off-by: Joseph Schuchart --- parsec/mca/device/device_gpu.c | 3 +-- parsec/mca/device/device_gpu.h | 6 ++++++ parsec/mca/device/transfer_gpu.c | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 0d9289584..da979d014 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -403,8 +403,7 @@ static parsec_task_class_t parsec_device_data_prefetch_tc = { /** * Release a gpu copy to the zone allocator. 
*/ -static void -parsec_device_release_gpu_copy(parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t *gpu_elem) +void parsec_device_release_gpu_copy(parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t *gpu_elem) { #if !defined(PARSEC_GPU_ALLOC_PER_TILE) #if defined(PARSEC_PROF_TRACE) diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index fa25b87a3..fa8c3517f 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -352,6 +352,12 @@ int parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int parsec_device_flush_lru( parsec_device_module_t *device ); int parsec_device_memory_release( parsec_device_gpu_module_t* gpu_device ); + +/** + * Release a gpu copy and return its memory to the zone allocator of the device. + */ +void parsec_device_release_gpu_copy(parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t *gpu_elem); + /** * This version is based on 4 streams: one for transfers from the memory to * the GPU, 2 for kernel executions and one for transfers from the GPU into diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c index e98590fc0..8f9c3596b 100644 --- a/parsec/mca/device/transfer_gpu.c +++ b/parsec/mca/device/transfer_gpu.c @@ -247,10 +247,10 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); PARSEC_LIST_ITEM_SINGLETON(gpu_copy); PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, - "D2H[%d:%s] GPU data copy %p of discarded data %p now available", + "D2H[%d:%s] GPU data copy %p of discarded data %p will be released", gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->original); parsec_atomic_unlock( &gpu_copy->original->lock ); - parsec_list_push_back(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_copy); + parsec_device_release_gpu_copy(gpu_device, gpu_copy); } else if( 0 == gpu_copy->readers ) { if( PARSEC_UNLIKELY(NULL == 
d2h_task) ) { /* allocate on-demand */ d2h_task = (parsec_gpu_d2h_task_t*)parsec_thread_mempool_allocate(es->context_mempool); From 60818f071d494d58641fdec06d60886252f5bd06 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 15 Nov 2024 11:55:26 -0500 Subject: [PATCH 4/8] Improve docs for parsec_data_discard() Signed-off-by: Joseph Schuchart --- parsec/data.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/parsec/data.h b/parsec/data.h index 8cf5e9905..ceaf19483 100644 --- a/parsec/data.h +++ b/parsec/data.h @@ -146,9 +146,11 @@ PARSEC_DECLSPEC void parsec_data_destroy( parsec_data_t *holder ); /** - * Discard the parsec_data_t. Any host-side copies will remain - * allocated as long as there are potential device copies referencing it. - * Once it is safe, all copies are dereferenced and eventually destroyed. + * Mark the parsec_data_t and its host copy as discarded. + * Any host-side copies will remain allocated as long as + * there are potential device copies referencing it. + * Once all device copies have been released the host + * copy and the data_t are destroyed. * The parsec_data_t must not be used after this call. */ PARSEC_DECLSPEC void From 5665d15416cd5b684718e1befe5c15dfebcafbef Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 16 Nov 2024 13:34:23 -0500 Subject: [PATCH 5/8] Lock data_t before marking the host copy discarded Also OR the flag instead of assigning it. 
Signed-off-by: Joseph Schuchart --- parsec/data.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/parsec/data.c b/parsec/data.c index 052c886dc..f81b32256 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -566,18 +566,36 @@ parsec_data_destroy( parsec_data_t *data ) void parsec_data_discard( parsec_data_t *data ) { + /* defensive */ + if (NULL == data) return; - /* first release the reference the application held */ - PARSEC_OBJ_RELEASE(data); + /* lock the data so it's safe to touch the flags */ + parsec_atomic_lock( &data->lock ); - /* second, mark the host copy as discarded */ + /** + * Mark the host copy as discarded + * + * We mark the host copy as having given up its reference to the data_t + * so when the data_t is destroyed (parsec_data_destruct) and + * the host copy is being detached we don't release the copy's reference + * on the data_t again. We have to releae the copy's reference here + * to break the cyclic dependency between the copy and the data_t. + * We cannot release the copy immediately as there may device management + * threads working with it, e.g., evicting data into it. + * */ parsec_data_copy_t *cpu_copy = data->device_copies[0]; if (NULL != cpu_copy) { - cpu_copy->flags = PARSEC_DATA_FLAG_DISCARDED; + cpu_copy->flags |= PARSEC_DATA_FLAG_DISCARDED; + + /* release the reference that the host copy had on the data_t to break + * the circular reference. */ + PARSEC_OBJ_RELEASE(data); } - /* third: release the reference that the host copy had on the data_t to break - * the circular reference. 
*/ + /* unlock before releasing our references */ + parsec_atomic_unlock( &data->lock ); + + /* release the reference the application held */ PARSEC_OBJ_RELEASE(data); /* From here, any device copy that is still attached to the data_t From ae94def8b40246f88cce1f237a04bfb5afcbe1da Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 20 Nov 2024 17:56:47 -0500 Subject: [PATCH 6/8] Don't warn about owned discarded data Discarded data may never be pushed back so don't warn about it still being owned by the device. Signed-off-by: Joseph Schuchart --- parsec/mca/device/device_gpu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index da979d014..e2be96cc4 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -755,14 +755,16 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de while(NULL != (item = parsec_list_pop_front(list)) ) { parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item; parsec_data_t* original = gpu_copy->original; + parsec_data_copy_t *cpu_copy = original->device_copies[0]; PARSEC_DEBUG_VERBOSE(35, parsec_gpu_output_stream, "GPU[%d:%s] Release GPU copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p, in map %p", gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->device_private, gpu_copy->super.super.obj_reference_count, original, (NULL != original ? 
original->dc : NULL)); assert( gpu_copy->device_index == gpu_device->super.device_index ); - - if( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ) { + /* warn about device data that has not been pushed back to the host or was discarded */ + if( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state + && (NULL == cpu_copy || 0 == (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED)) ) { parsec_warning("GPU[%d:%s] still OWNS the master memory copy for data %d and it is discarding it!", gpu_device->super.device_index, gpu_device->super.name, original->key); } From 53b7721a87628c849c661f43ab4955e0e6d5e7a4 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 22 Nov 2024 09:20:07 -0500 Subject: [PATCH 7/8] Walk forward and backward when creating a w2r task Discarded data sit toward the end of the lru while the data to be evicted is at the front. We walk both forward and backward to collect the discarded data from the back, until we either meet the pivot or we found enough data to evict. If we discarded data we don't evict. 
Signed-off-by: Joseph Schuchart --- .../mca/device/cuda/device_cuda_component.c | 3 + parsec/mca/device/device_gpu.h | 1 + .../level_zero/device_level_zero_component.c | 3 + parsec/mca/device/transfer_gpu.c | 133 ++++++++++++------ 4 files changed, 99 insertions(+), 41 deletions(-) diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c index 3529917b0..fd9788f94 100644 --- a/parsec/mca/device/cuda/device_cuda_component.c +++ b/parsec/mca/device/cuda/device_cuda_component.c @@ -161,6 +161,9 @@ static int device_cuda_component_register(void) (void)parsec_mca_param_reg_int_name("device_cuda", "max_number_of_ejected_data", "Sets up the maximum number of blocks that can be ejected from GPU memory", false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_flows); + (void)parsec_mca_param_reg_int_name("device_cuda", "max_number_of_discarded_data", + "Sets up the maximum number of discarded blocks to be collected at once", + false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_discarded); (void)parsec_mca_param_reg_int_name("device_cuda", "max_streams", "Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3", false, false, PARSEC_GPU_MAX_STREAMS, &parsec_cuda_max_streams); diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index fa8c3517f..74cab4e65 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -279,6 +279,7 @@ typedef struct parsec_gpu_workspace_s { PARSEC_DECLSPEC extern int parsec_gpu_output_stream; PARSEC_DECLSPEC extern int parsec_gpu_verbosity; PARSEC_DECLSPEC extern int32_t parsec_gpu_d2h_max_flows; +PARSEC_DECLSPEC extern int32_t parsec_gpu_d2h_max_discarded; /** * Debugging functions. 
diff --git a/parsec/mca/device/level_zero/device_level_zero_component.c b/parsec/mca/device/level_zero/device_level_zero_component.c index f50f2a817..4407cfa31 100644 --- a/parsec/mca/device/level_zero/device_level_zero_component.c +++ b/parsec/mca/device/level_zero/device_level_zero_component.c @@ -271,6 +271,9 @@ static int device_level_zero_component_register(void) (void)parsec_mca_param_reg_int_name("device_level_zero", "max_number_of_ejected_data", "Sets up the maximum number of blocks that can be ejected from GPU memory", false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_flows); + (void)parsec_mca_param_reg_int_name("device_level_zero", "max_number_of_discarded_data", + "Sets up the maximum number of discarded blocks to be collected at once", + false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_discarded); (void)parsec_mca_param_reg_int_name("device_level_zero", "max_streams", "Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3", false, false, PARSEC_GPU_MAX_STREAMS, &parsec_level_zero_max_streams); diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c index 8f9c3596b..f6bb45082 100644 --- a/parsec/mca/device/transfer_gpu.c +++ b/parsec/mca/device/transfer_gpu.c @@ -179,6 +179,7 @@ static const parsec_symbol_t symb_gpu_d2h_task_param = { }; int32_t parsec_gpu_d2h_max_flows = 0; +int32_t parsec_gpu_d2h_max_discarded = 0; static const parsec_task_class_t parsec_gpu_d2h_task_class = { .name = "GPU D2H data transfer", @@ -215,6 +216,16 @@ static const parsec_task_class_t parsec_gpu_d2h_task_class = { #endif }; +static inline void release_discarded_data(parsec_device_gpu_module_t *gpu_device, parsec_gpu_data_copy_t* gpu_copy) +{ + parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); + PARSEC_LIST_ITEM_SINGLETON(gpu_copy); + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, + "D2H[%d:%s] GPU data copy %p of discarded data %p will be released", + 
gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->original); + parsec_device_release_gpu_copy(gpu_device, gpu_copy); + +} /** * Transfer at most the MAX_PARAM_COUNT oldest data from the GPU back @@ -227,58 +238,98 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, { parsec_gpu_task_t *w2r_task = NULL; parsec_gpu_d2h_task_t *d2h_task = NULL; - parsec_gpu_data_copy_t *gpu_copy, *cpu_copy; - parsec_list_item_t* item = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_next; + parsec_gpu_data_copy_t *fwd_gpu_copy = NULL, *fwd_cpu_copy = NULL, *rev_gpu_copy = NULL, *rev_cpu_copy = NULL; + parsec_list_item_t* fwd = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_next; + parsec_list_item_t* rev = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_prev; int nb_cleaned = 0; + int nb_discarded = 0; + int nb_candidates = 0; + const int max_flows = (parsec_gpu_d2h_max_flows < MAX_PARAM_COUNT) ? parsec_gpu_d2h_max_flows : MAX_PARAM_COUNT; + /* store candidates in an array without unlinking them so we can easily abandon them */ + parsec_gpu_data_copy_t *candidates[MAX_PARAM_COUNT]; /* Find a data copy that has no pending users on the GPU, and can be - * safely moved back on the main memory */ - while(nb_cleaned < parsec_gpu_d2h_max_flows) { + * safely moved back on the main memory. + * Also look for data that was discarded and can be released immediatly. + * + * Observation: data to be evicted is more likely at the front of the list + * while data that is discarded is more likely at the end + * (since it was likely discarded shortly after being used) + * so we search from the front and the back. 
*/ + while(nb_candidates < max_flows && + /* allow discarding to be disabled */ + (parsec_gpu_d2h_max_discarded == 0 || nb_discarded < parsec_gpu_d2h_max_discarded)) { /* Break at the end of the list */ - if( item == &(gpu_device->gpu_mem_owned_lru.ghost_element) ) { + if( fwd == &gpu_device->gpu_mem_owned_lru.ghost_element ) { break; } - gpu_copy = (parsec_gpu_data_copy_t*)item; - cpu_copy = gpu_copy->original->device_copies[0]; - parsec_atomic_lock( &gpu_copy->original->lock ); - /* get the next item before altering the next pointer */ - item = (parsec_list_item_t*)item->list_next; /* conversion needed for volatile */ - if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) { - parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); - PARSEC_LIST_ITEM_SINGLETON(gpu_copy); - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, - "D2H[%d:%s] GPU data copy %p of discarded data %p will be released", - gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->original); - parsec_atomic_unlock( &gpu_copy->original->lock ); - parsec_device_release_gpu_copy(gpu_device, gpu_copy); - } else if( 0 == gpu_copy->readers ) { - if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* allocate on-demand */ - d2h_task = (parsec_gpu_d2h_task_t*)parsec_thread_mempool_allocate(es->context_mempool); - if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* we're running out of memory. Bail out. 
*/ - parsec_atomic_unlock( &gpu_copy->original->lock ); - return NULL; - } - PARSEC_OBJ_CONSTRUCT(d2h_task, parsec_task_t); - } - parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); - PARSEC_LIST_ITEM_SINGLETON(gpu_copy); - gpu_copy->readers++; - d2h_task->data[nb_cleaned].data_out = gpu_copy; - gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; /* mark the copy as in transfer */ - parsec_atomic_unlock( &gpu_copy->original->lock ); - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "D2H[%d:%s] task %p:\tdata %d -> %p [%p] readers %d", - gpu_device->super.device_index, gpu_device->super.name, (void*)d2h_task, - nb_cleaned, gpu_copy, gpu_copy->original, gpu_copy->readers); - nb_cleaned++; - if (MAX_PARAM_COUNT == nb_cleaned) + if (fwd == rev || fwd->list_next == rev) { + /* break at median if we discarded data */ + if (nb_discarded > 0) { break; - } else { - parsec_atomic_unlock( &gpu_copy->original->lock ); + } + /* otherwise stop walking backwards because we already + * looked for discarded data on the way */ + rev = NULL; + rev_gpu_copy = NULL; + rev_cpu_copy = NULL; } + + fwd_gpu_copy = (parsec_gpu_data_copy_t*)fwd; + fwd_cpu_copy = fwd_gpu_copy->original->device_copies[0]; + /* get the next item before altering the next pointer */ + fwd = (parsec_list_item_t*)fwd->list_next; /* conversion needed for volatile */ + if (NULL != rev) { + rev_gpu_copy = (parsec_gpu_data_copy_t*)rev; + rev_cpu_copy = rev_gpu_copy->original->device_copies[0]; + rev = (parsec_list_item_t*)rev->list_prev; // cast for volatile + } + if (parsec_gpu_d2h_max_discarded && fwd_cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) { + release_discarded_data(gpu_device, fwd_gpu_copy); + ++nb_discarded; + } else if( max_flows > nb_candidates && 0 == fwd_gpu_copy->readers ) { + /* store the candidates but leave them in the LRU */ + candidates[nb_candidates] = fwd_gpu_copy; + nb_candidates++; + } + if (parsec_gpu_d2h_max_discarded && + NULL != rev_cpu_copy && + rev_cpu_copy->flags 
& PARSEC_DATA_FLAG_DISCARDED) { + release_discarded_data(gpu_device, rev_gpu_copy); + ++nb_discarded; + } + } + + if( nb_discarded > 0 || nb_candidates == 0 ) { + /* we discarded some data, don't bother pushing out */ + return NULL; } - if( 0 == nb_cleaned ) + d2h_task = (parsec_gpu_d2h_task_t*)parsec_thread_mempool_allocate(es->context_mempool); + if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* we're running out of memory. Bail out. */ return NULL; + } + PARSEC_OBJ_CONSTRUCT(d2h_task, parsec_task_t); + + for (int i = 0; i < nb_candidates; ++i) { + parsec_gpu_data_copy_t *gpu_copy = candidates[i]; + parsec_atomic_lock( &gpu_copy->original->lock ); + if (PARSEC_UNLIKELY(gpu_copy->readers != 0)) { + /* gained a reader, ignore */ + parsec_atomic_unlock( &gpu_copy->original->lock ); + continue; + } + parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); + PARSEC_LIST_ITEM_SINGLETON(gpu_copy); + gpu_copy->readers++; + d2h_task->data[nb_cleaned].data_out = gpu_copy; + gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; /* mark the copy as in transfer */ + parsec_atomic_unlock( &gpu_copy->original->lock ); + nb_cleaned++; + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "D2H[%d:%s] task %p:\tdata %d -> %p [%p] readers %d", + gpu_device->super.device_index, gpu_device->super.name, (void*)d2h_task, + nb_cleaned, gpu_copy, gpu_copy->original, gpu_copy->readers); + } d2h_task->priority = INT32_MAX; d2h_task->task_class = &parsec_gpu_d2h_task_class; From 9c7b42b7e502f9dc0fd3eff1b1c4e82b09ae52de Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 13 Feb 2025 17:47:19 -0500 Subject: [PATCH 8/8] Track the number of discarded copies per device We only try to find discarded data if we know that there is discarded data. If no one discarded data (e.g., DPLASMA) we don't go look for it. This is also needed to properly clean up discarded data before releasing the zone allocator. 
Signed-off-by: Joseph Schuchart --- parsec/data.c | 15 ++++ parsec/mca/device/device.h | 1 + parsec/mca/device/device_gpu.c | 65 +++++++++++++++- parsec/mca/device/transfer_gpu.c | 124 +++++++++++-------------------- 4 files changed, 119 insertions(+), 86 deletions(-) diff --git a/parsec/data.c b/parsec/data.c index f81b32256..d1286979d 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -592,6 +592,21 @@ parsec_data_discard( parsec_data_t *data ) PARSEC_OBJ_RELEASE(data); } + /** + * Tell the devices that they have discarded data. + */ + for (uint32_t i = 1; i < parsec_nb_devices; i++) { + if (parsec_mca_device_is_gpu(i)) { + parsec_data_copy_t *device_copy = data->device_copies[i]; + if (NULL != device_copy) { + parsec_device_module_t* device = parsec_mca_device_get(i); + if (NULL != device) { + parsec_atomic_fetch_inc_int64(&device->nb_discarded); + } + } + } + } + /* unlock before releasing our references */ parsec_atomic_unlock( &data->lock ); diff --git a/parsec/mca/device/device.h b/parsec/mca/device/device.h index 48ee9eab0..3c4201fea 100644 --- a/parsec/mca/device/device.h +++ b/parsec/mca/device/device.h @@ -166,6 +166,7 @@ struct parsec_device_module_s { uint64_t executed_tasks; uint64_t nb_data_faults; uint64_t nb_evictions; + volatile int64_t nb_discarded; /**< Track number of discarded data copies on this device */ /* We provide the compute capacity of the device in GFlop/s so that conversion to #nanosec in load estimates is straightforward */ /* These compute capacities can be useful for users when providing their own * time_estimate functions: the user can divide the number of flops for the diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index e2be96cc4..ae45c7e49 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -747,6 +747,49 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device, return PARSEC_SUCCESS; } +/** + * Release discarded data copies from the LRU list. 
+ * Returns the number of discarded items released. + */ +static int parsec_device_memory_release_discarded(parsec_device_gpu_module_t* gpu_device, + parsec_list_t* list) +{ + parsec_list_item_t* item; + parsec_list_item_t* ring = NULL; + int count = 0; + + if (gpu_device->super.nb_discarded == 0) { + return 0; + } + + while (NULL != (item = parsec_list_pop_front(list))) { + parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item; + parsec_data_t* original = gpu_copy->original; + if (NULL != original) { + parsec_data_copy_t *cpu_copy = original->device_copies[0]; + parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); + PARSEC_LIST_ITEM_SINGLETON(item); + + if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) { + count++; + parsec_device_release_gpu_copy(gpu_device, gpu_copy); + PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream, + "Releasing discarded GPU copy %p from data %p", gpu_copy, original); + } else if (ring == NULL) { + ring = item; + } else { + parsec_list_item_ring_push(ring, item); + } + } + } + /* put the ring back into the list */ + if (NULL != ring) { + parsec_list_chain_front(list, ring); + } + parsec_atomic_fetch_sub_int64(&gpu_device->super.nb_discarded, count); + return count; +} + static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_device, parsec_list_t* list) { @@ -755,7 +798,6 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de while(NULL != (item = parsec_list_pop_front(list)) ) { parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item; parsec_data_t* original = gpu_copy->original; - parsec_data_copy_t *cpu_copy = original->device_copies[0]; PARSEC_DEBUG_VERBOSE(35, parsec_gpu_output_stream, "GPU[%d:%s] Release GPU copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p, in map %p", @@ -763,8 +805,7 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de original, (NULL != original ? 
original->dc : NULL)); assert( gpu_copy->device_index == gpu_device->super.device_index ); /* warn about device data that has not been pushed back to the host or was discarded */ - if( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state - && (NULL == cpu_copy || 0 == (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED)) ) { + if( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state) { parsec_warning("GPU[%d:%s] still OWNS the master memory copy for data %d and it is discarding it!", gpu_device->super.device_index, gpu_device->super.name, original->key); } @@ -793,7 +834,11 @@ parsec_device_flush_lru( parsec_device_module_t *device ) { size_t in_use; parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)device; - /* Free all memory on GPU */ + /* Remove discarded data copies */ + parsec_device_memory_release_discarded(gpu_device, &gpu_device->gpu_mem_lru); + parsec_device_memory_release_discarded(gpu_device, &gpu_device->gpu_mem_owned_lru); + assert(gpu_device->super.nb_discarded == 0); + /* Free all remaining memory on GPU */ parsec_device_memory_release_list(gpu_device, &gpu_device->gpu_mem_lru); parsec_device_memory_release_list(gpu_device, &gpu_device->gpu_mem_owned_lru); parsec_device_free_workspace(gpu_device); @@ -956,6 +1001,12 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, #endif parsec_atomic_unlock(&master->lock); return PARSEC_HOOK_RETURN_NEXT; + } else if (NULL != lru_gpu_elem->original) { + /* account for discarded data */ + parsec_data_copy_t* cpu_copy = lru_gpu_elem->original->device_copies[0]; + if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) { + parsec_atomic_fetch_dec_int64(&gpu_device->super.nb_discarded); + } } PARSEC_LIST_ITEM_SINGLETON(lru_gpu_elem); @@ -2345,6 +2396,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, PARSEC_LIST_ITEM_SINGLETON(gpu_copy); /* release the original and */ parsec_device_release_gpu_copy(gpu_device, gpu_copy); + 
parsec_atomic_fetch_dec_int64(&gpu_device->super.nb_discarded); } else if( gpu_task->pushout & (1 << i)) { PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU copy %p [ref_count %d] moved to the read LRU in %s", @@ -2531,6 +2583,11 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, } assert(NULL == progress_task); + /* try to release all discarded copies and try again if successful */ + if (0 < parsec_device_memory_release_discarded(gpu_device, &gpu_device->gpu_mem_owned_lru)) { + goto check_in_deps; + } + /* TODO: check this */ /* If we can extract data go for it, otherwise try to drain the pending tasks */ gpu_task = parsec_gpu_create_w2r_task(gpu_device, es); diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c index f6bb45082..fc639d144 100644 --- a/parsec/mca/device/transfer_gpu.c +++ b/parsec/mca/device/transfer_gpu.c @@ -238,98 +238,58 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, { parsec_gpu_task_t *w2r_task = NULL; parsec_gpu_d2h_task_t *d2h_task = NULL; - parsec_gpu_data_copy_t *fwd_gpu_copy = NULL, *fwd_cpu_copy = NULL, *rev_gpu_copy = NULL, *rev_cpu_copy = NULL; - parsec_list_item_t* fwd = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_next; - parsec_list_item_t* rev = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_prev; + parsec_gpu_data_copy_t *gpu_copy, *cpu_copy; + parsec_list_item_t* item = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_next; int nb_cleaned = 0; - int nb_discarded = 0; - int nb_candidates = 0; - const int max_flows = (parsec_gpu_d2h_max_flows < MAX_PARAM_COUNT) ? parsec_gpu_d2h_max_flows : MAX_PARAM_COUNT; - /* store candidates in an array without unlinking them so we can easily abandon them */ - parsec_gpu_data_copy_t *candidates[MAX_PARAM_COUNT]; /* Find a data copy that has no pending users on the GPU, and can be - * safely moved back on the main memory. 
- * Also look for data that was discarded and can be released immediatly. - * - * Observation: data to be evicted is more likely at the front of the list - * while data that is discarded is more likely at the end - * (since it was likely discarded shortly after being used) - * so we search from the front and the back. */ - while(nb_candidates < max_flows && - /* allow discarding to be disabled */ - (parsec_gpu_d2h_max_discarded == 0 || nb_discarded < parsec_gpu_d2h_max_discarded)) { + * safely moved back on the main memory */ + while(nb_cleaned < parsec_gpu_d2h_max_flows) { /* Break at the end of the list */ - if( fwd == &gpu_device->gpu_mem_owned_lru.ghost_element ) { + if( item == &(gpu_device->gpu_mem_owned_lru.ghost_element) ) { break; } - if (fwd == rev || fwd->list_next == rev) { - /* break at median if we discarded data */ - if (nb_discarded > 0) { - break; - } - /* otherwise stop walking backwards because we already - * looked for discarded data on the way */ - rev = NULL; - rev_gpu_copy = NULL; - rev_cpu_copy = NULL; - } - - fwd_gpu_copy = (parsec_gpu_data_copy_t*)fwd; - fwd_cpu_copy = fwd_gpu_copy->original->device_copies[0]; + gpu_copy = (parsec_gpu_data_copy_t*)item; + cpu_copy = gpu_copy->original->device_copies[0]; + parsec_atomic_lock( &gpu_copy->original->lock ); /* get the next item before altering the next pointer */ - fwd = (parsec_list_item_t*)fwd->list_next; /* conversion needed for volatile */ - if (NULL != rev) { - rev_gpu_copy = (parsec_gpu_data_copy_t*)rev; - rev_cpu_copy = rev_gpu_copy->original->device_copies[0]; - rev = (parsec_list_item_t*)rev->list_prev; // cast for volatile - } - if (parsec_gpu_d2h_max_discarded && fwd_cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) { - release_discarded_data(gpu_device, fwd_gpu_copy); - ++nb_discarded; - } else if( max_flows > nb_candidates && 0 == fwd_gpu_copy->readers ) { - /* store the candidates but leave them in the LRU */ - candidates[nb_candidates] = fwd_gpu_copy; - nb_candidates++; - } - if 
(parsec_gpu_d2h_max_discarded && - NULL != rev_cpu_copy && - rev_cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) { - release_discarded_data(gpu_device, rev_gpu_copy); - ++nb_discarded; + item = (parsec_list_item_t*)item->list_next; /* conversion needed for volatile */ + if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) { + parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); + PARSEC_LIST_ITEM_SINGLETON(gpu_copy); + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, + "D2H[%d:%s] GPU data copy %p of discarded data %p will be released", + gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->original); + parsec_atomic_unlock( &gpu_copy->original->lock ); + parsec_device_release_gpu_copy(gpu_device, gpu_copy); + } else if( 0 == gpu_copy->readers ) { + if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* allocate on-demand */ + d2h_task = (parsec_gpu_d2h_task_t*)parsec_thread_mempool_allocate(es->context_mempool); + if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* we're running out of memory. Bail out. 
*/ + parsec_atomic_unlock( &gpu_copy->original->lock ); + return NULL; + } + PARSEC_OBJ_CONSTRUCT(d2h_task, parsec_task_t); + } + parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); + PARSEC_LIST_ITEM_SINGLETON(gpu_copy); + gpu_copy->readers++; + d2h_task->data[nb_cleaned].data_out = gpu_copy; + gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; /* mark the copy as in transfer */ + parsec_atomic_unlock( &gpu_copy->original->lock ); + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "D2H[%d:%s] task %p:\tdata %d -> %p [%p] readers %d", + gpu_device->super.device_index, gpu_device->super.name, (void*)d2h_task, + nb_cleaned, gpu_copy, gpu_copy->original, gpu_copy->readers); + nb_cleaned++; + if (MAX_PARAM_COUNT == nb_cleaned) + break; + } else { + parsec_atomic_unlock( &gpu_copy->original->lock ); } } - if( nb_discarded > 0 || nb_candidates == 0 ) { - /* we discarded some data, don't bother pushing out */ - return NULL; - } - - d2h_task = (parsec_gpu_d2h_task_t*)parsec_thread_mempool_allocate(es->context_mempool); - if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* we're running out of memory. Bail out. 
*/ + if( 0 == nb_cleaned ) return NULL; - } - PARSEC_OBJ_CONSTRUCT(d2h_task, parsec_task_t); - - for (int i = 0; i < nb_candidates; ++i) { - parsec_gpu_data_copy_t *gpu_copy = candidates[i]; - parsec_atomic_lock( &gpu_copy->original->lock ); - if (PARSEC_UNLIKELY(gpu_copy->readers != 0)) { - /* gained a reader, ignore */ - parsec_atomic_unlock( &gpu_copy->original->lock ); - continue; - } - parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); - PARSEC_LIST_ITEM_SINGLETON(gpu_copy); - gpu_copy->readers++; - d2h_task->data[nb_cleaned].data_out = gpu_copy; - gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; /* mark the copy as in transfer */ - parsec_atomic_unlock( &gpu_copy->original->lock ); - nb_cleaned++; - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "D2H[%d:%s] task %p:\tdata %d -> %p [%p] readers %d", - gpu_device->super.device_index, gpu_device->super.name, (void*)d2h_task, - nb_cleaned, gpu_copy, gpu_copy->original, gpu_copy->readers); - } d2h_task->priority = INT32_MAX; d2h_task->task_class = &parsec_gpu_d2h_task_class; @@ -401,4 +361,4 @@ int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, free(gpu_task); gpu_device->data_avail_epoch++; return 0; -} +} \ No newline at end of file