diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 30e34789f..a5e11c322 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -1227,6 +1227,7 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, if(flow_mask & (1U << i)){ source = task->data[i].data_out; dest = source->original->device_copies[0]; + if (NULL == dest) continue; dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c index 986cda05e..54dd14613 100644 --- a/parsec/mca/device/transfer_gpu.c +++ b/parsec/mca/device/transfer_gpu.c @@ -241,6 +241,25 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_atomic_lock( &gpu_copy->original->lock ); /* get the next item before altering the next pointer */ item = (parsec_list_item_t*)item->list_next; /* conversion needed for volatile */ + + if (gpu_copy->original->device_copies[0] == NULL) { + /* no host-side copy available, release immediately if no readers */ + if (gpu_copy->readers == 0) { + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, + "D2H[%d:%s] CPU data copy discarded, GPU data copy %p [%p] now available", + gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->original); + + parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); + PARSEC_LIST_ITEM_SINGLETON(gpu_copy); + gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; + parsec_list_push_back(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_copy); + gpu_device->data_avail_epoch++; + } + parsec_atomic_unlock( &gpu_copy->original->lock ); + continue; + } + if( 0 == gpu_copy->readers ) { if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* allocate on-demand */ d2h_task = (parsec_gpu_d2h_task_t*)parsec_thread_mempool_allocate(es->context_mempool); @@ -284,6 +303,8 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, w2r_task->stage_out = &parsec_default_gpu_stage_out; w2r_task->complete_stage = NULL; + parsec_atomic_fetch_inc_int32( &(gpu_device->mutex) ); + (void)es; return w2r_task; } @@ -314,20 +335,22 @@ int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, cpu_copy = original->device_copies[0]; - if( cpu_copy->version < gpu_copy->version ) { + if( NULL != cpu_copy && cpu_copy->version < gpu_copy->version ) { /* the GPU version has been acquired by a new task that is waiting for submission */ PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "D2H[%d:%s] task %p:%i GPU data copy %p [%p] has a backup in memory", gpu_device->super.device_index, gpu_device->super.name, (void*)task, i, gpu_copy, gpu_copy->original); } else { gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - cpu_copy->version = gpu_copy->version; - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, - "D2H[%d:%s]: CPU copy %p gets the same version %d as GPU copy %p at %s:%d", - gpu_device->super.device_index, gpu_device->super.name, - cpu_copy, cpu_copy->version, gpu_copy, - __FILE__, __LINE__); + if (NULL != cpu_copy) { + cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + cpu_copy->version = gpu_copy->version; + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, + "D2H[%d:%s]: CPU copy %p gets the same version %d as GPU copy %p at %s:%d", + gpu_device->super.device_index, gpu_device->super.name, + cpu_copy, cpu_copy->version, gpu_copy, + __FILE__, __LINE__); + } PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "D2H[%d:%s] task %p:%i GPU data copy %p [%p] now available", gpu_device->super.device_index, gpu_device->super.name, (void*)task, i, gpu_copy, gpu_copy->original); @@ -338,5 +361,6 @@ int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_thread_mempool_free(es->context_mempool, task); free(gpu_task); gpu_device->data_avail_epoch++; + parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) ); return 0; }