diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index d57ac904e..a661c9a01 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -39,6 +39,13 @@ static int parsec_gpu_profiling_initiated = 0; int parsec_gpu_output_stream = -1; int parsec_gpu_verbosity; +/* The return value of these functions is either a parsec_hook_return_t for <= 0 values, + * or a positive number which represents that something has been scheduled on the gpu_stream + */ +typedef int(*parsec_gpu_step_function_t)(parsec_device_gpu_module_t *gpu_device, + parsec_gpu_task_t *gpu_task, + parsec_gpu_exec_stream_t *gpu_stream); + static inline int parsec_device_check_space_needed(parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *gpu_task) @@ -1461,7 +1468,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]:\tThere is a potential alternative source for data_in %p [ref_count %d] in original %p to go in copy %p [ref_count %d], but it is not ready, falling back on CPU source", gpu_device->super.device_index, gpu_device->super.name, task_data->data_in, task_data->data_in->super.super.obj_reference_count, original, gpu_elem, gpu_elem->super.super.obj_reference_count); - //return PARSEC_HOOK_RETURN_NEXT; + //return PARSEC_HOOK_RETURN_AGAIN; } /* We fall back on the CPU copy */ @@ -1878,7 +1885,7 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, static inline int parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, parsec_gpu_exec_stream_t* stream, - parsec_advance_task_function_t progress_fct, + parsec_gpu_step_function_t progress_fct, parsec_gpu_task_t* task, parsec_gpu_task_t** out_task ) { @@ -1937,6 +1944,7 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, } grab_a_task: + assert(NULL == task); if( NULL == stream->tasks[stream->start] ) { /* there is room on the stream */ task = (parsec_gpu_task_t*)parsec_list_pop_front(stream->fifo_pending); /* get the best task */ } @@ -1949,6 +1957,20 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, schedule_task: rc = progress_fct( gpu_device, task, stream ); + if(0 == rc) { +#if defined(PARSEC_PROF_TRACE) + if( stream->prof_event_track_enable ) { + if( task->prof_key_end != -1 ) { + PARSEC_PROFILING_TRACE(stream->profiling, task->prof_key_end, task->prof_event_id, task->prof_tp_id, NULL); + } + } +#endif + /* If progress_fct added nothing on that stream, we skip scheduling a record on the GPU stream */ + if( task->complete_stage ) + rc = task->complete_stage(gpu_device, &task, stream); + *out_task = task; + return rc; + } if( 0 > rc ) { if( PARSEC_HOOK_RETURN_AGAIN != rc ) { if( PARSEC_HOOK_RETURN_NEXT == rc ) { @@ -1978,7 +2000,6 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, "trigger the task will be handled accordingly", gpu_device->super.device_index, gpu_device->super.name, (void*)task); } - task->last_status = rc; /** * Do not skip the gpu event generation. The problem is that some of the inputs * might be in the pipe of being transferred to the GPU. If we activate this task @@ -2018,7 +2039,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, { parsec_task_t *this_task = gpu_task->ec; const parsec_flow_t *flow; - int i, ret = 0; + int i, ret = 0, how_many = 0; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; #endif @@ -2089,6 +2110,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, gpu_task->last_status = ret; return ret; } + how_many += ret; } PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, @@ -2099,7 +2121,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, #if defined(PARSEC_PROF_TRACE) gpu_task->prof_key_end = -1; /* We do not log that event as the completion of this task */ #endif - return PARSEC_HOOK_RETURN_DONE; + return how_many; } /** @@ -2116,6 +2138,7 @@ parsec_device_kernel_exec( parsec_device_gpu_module_t *gpu_device, { parsec_advance_task_function_t progress_fct = gpu_task->submit; parsec_task_t* this_task = gpu_task->ec; + int rc; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; @@ -2152,7 +2175,9 @@ parsec_device_kernel_exec( parsec_device_gpu_module_t *gpu_device, #endif /* defined(PARSEC_DEBUG_PARANOID) */ (void)this_task; - return progress_fct( gpu_device, gpu_task, gpu_stream ); + rc = progress_fct( gpu_device, gpu_task, gpu_stream ); + gpu_task->last_status = rc; + return 1; } /** @@ -2193,8 +2218,9 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, return_code = PARSEC_HOOK_RETURN_DISABLE; goto release_and_return_error; } + how_many++; } - return PARSEC_HOOK_RETURN_DONE; + return how_many; } PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,