Skip to content

Commit b535ab2

Browse files
BrieucNicolas (Brieuc Nicolas)
authored and committed
co_manager shortcutting the scheduler
Fix for multiple GPUs. Added debug output and some documentation, error tolerance for multiple GPUs and no manager, refactored loop in insert_function, got rid of deadlock.
1 parent 0eb20bd commit b535ab2

5 files changed

Lines changed: 276 additions & 25 deletions

File tree

parsec/interfaces/dtd/insert_function.c

Lines changed: 65 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@
4040

4141
#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
4242
#include "parsec/mca/device/cuda/device_cuda.h"
43+
#include "parsec/sys/tls.h"
44+
45+
extern PARSEC_TLS_DECLARE(co_manager_tls);
46+
4347
#endif /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */
4448

4549
#include "parsec/mca/mca_repository.h"
@@ -224,8 +228,8 @@ static int parsec_dtd_taskpool_leave_wait(parsec_taskpool_t* tp, void*_)
224228
parsec_termdet_open_module(tp, "local");
225229
tp->tdm.module->monitor_taskpool(tp, parsec_taskpool_termination_detected);
226230
tp->tdm.module->taskpool_set_nb_tasks(tp, 0);
227-
tp->tdm.module->taskpool_set_runtime_actions(tp, 0);
228-
231+
tp->tdm.module->taskpool_set_runtime_actions(tp, 0);
232+
229233
/* We are re-attached to the context */
230234
parsec_atomic_fetch_inc_int32(&tp->context->active_taskpools);
231235
return PARSEC_SUCCESS;
@@ -721,7 +725,7 @@ parsec_dtd_add_profiling_info(parsec_taskpool_t *tp,
721725
}
722726

723727
void
724-
parsec_dtd_add_profiling_info_generic(parsec_taskpool_t *tp,
728+
parsec_dtd_add_profiling_info_generic(parsec_taskpool_t *tp,
725729
const char *name,
726730
int *keyin, int *keyout)
727731
{
@@ -1002,7 +1006,7 @@ parsec_dtd_insert_task_class(parsec_dtd_taskpool_t *tp,
10021006
} else {
10031007
char *fc = fill_color(tc->super.task_class_id, PARSEC_DTD_NB_TASK_CLASSES);
10041008
parsec_profiling_add_dictionary_keyword(tc->super.name, fc,
1005-
sizeof(parsec_task_prof_info_t)+info_size,
1009+
sizeof(parsec_task_prof_info_t)+info_size,
10061010
info_str,
10071011
(int *)&PARSEC_PROF_FUNC_KEY_START(&tp->super, tc->super.task_class_id),
10081012
(int *)&PARSEC_PROF_FUNC_KEY_END(&tp->super, tc->super.task_class_id));
@@ -1789,9 +1793,63 @@ parsec_dtd_release_deps(parsec_execution_stream_t *es,
17891793

17901794
/* Scheduling tasks */
17911795
if( action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS ) {
1792-
__parsec_schedule_vp(es, arg.ready_lists, 0);
1793-
}
1796+
int nb_task_rings = es->virtual_process->parsec_context->nb_vp;
1797+
1798+
/* Iterating through the task rings */
1799+
for(int vp = 0; vp < nb_task_rings; vp++ ){
1800+
const parsec_vp_t** vps = (const parsec_vp_t**)es->virtual_process->parsec_context->virtual_processes;
1801+
parsec_execution_stream_t* target_es = vps[vp]->execution_streams[0];
1802+
#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
1803+
parsec_device_module_t** co_manager_tls_val = PARSEC_TLS_GET_SPECIFIC(co_manager_tls);
1804+
1805+
if( co_manager_tls_val != NULL ) {
1806+
/* I am the co-manager */
1807+
1808+
parsec_task_t* task_ring = arg.ready_lists[vp];
1809+
parsec_task_t* current_task = task_ring;
1810+
#if defined(PARSEC_DEBUG_NOISIER)
1811+
char tmp[MAX_TASK_STRLEN];
1812+
#endif
1813+
/* iterate through the single tasks */
1814+
while ( task_ring != NULL )
1815+
{
1816+
task_ring = (parsec_task_t*)parsec_list_item_ring_chop( &current_task->super );
1817+
parsec_list_item_singleton( (parsec_list_item_t*)current_task );
1818+
1819+
if (PARSEC_DTD_FLUSH_TC_ID == current_task->task_class->task_class_id)
1820+
{
1821+
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,"GPU[%s]: Thread %d scheduling task %s at %s:%d",
1822+
((parsec_device_module_t*)*co_manager_tls_val)->name, es->th_id,
1823+
parsec_task_snprintf(tmp, MAX_TASK_STRLEN, current_task), __FILE__, __LINE__);
1824+
__parsec_schedule(target_es, current_task, 0);
1825+
}
1826+
else
1827+
{
1828+
/* try to skip the scheduler */
1829+
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,"GPU[%s]: Thread %d try executing task %s %p at %s:%d",
1830+
((parsec_device_module_t*)*co_manager_tls_val)->name, es->th_id,
1831+
parsec_task_snprintf(tmp, MAX_TASK_STRLEN, current_task), current_task, __FILE__, __LINE__);
1832+
int rc = __parsec_execute(target_es, current_task);
1833+
if( rc != PARSEC_HOOK_RETURN_ASYNC ){
1834+
/* failed to shortcut, scheduling normally */
1835+
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,"GPU[%s]: Thread %d resort to scheduling task %s %p at %s:%d",
1836+
((parsec_device_module_t*)*co_manager_tls_val)->name, es->th_id,
1837+
parsec_task_snprintf(tmp, MAX_TASK_STRLEN, current_task), current_task, __FILE__, __LINE__);
1838+
__parsec_schedule(target_es, current_task, 0);
1839+
}
17941840

1841+
}
1842+
current_task = task_ring;
1843+
}
1844+
arg.ready_lists[vp] = NULL;
1845+
}
1846+
#endif /* PARSEC_HAVE_DEV_CUDA_SUPPORT */
1847+
if(arg.ready_lists[vp] != NULL ) {
1848+
__parsec_schedule(target_es, arg.ready_lists[vp], 0);
1849+
arg.ready_lists[vp] = NULL;
1850+
}
1851+
}
1852+
}
17951853
PARSEC_PINS(es, RELEASE_DEPS_END, this_task);
17961854
return 0;
17971855
}
@@ -2127,7 +2185,7 @@ parsec_dtd_create_task_classv(const char *name,
21272185
(flow_count * sizeof(parsec_dtd_descendant_info_t)) +
21282186
(flow_count * sizeof(parsec_dtd_flow_info_t)) +
21292187
(nb_params * sizeof(parsec_dtd_task_param_t)) +
2130-
total_size_of_param);
2188+
total_size_of_param);
21312189

21322190
parsec_mempool_construct(&dtd_tc->context_mempool,
21332191
PARSEC_OBJ_CLASS(parsec_dtd_task_t), total_size,

parsec/mca/device/cuda/device_cuda_component.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ char* parsec_cuda_lib_path = NULL;
4141

4242
static int cuda_mask, cuda_nvlink_mask;
4343

44+
int parsec_cuda_delegate_task_completion = 0;
4445

4546
/*
4647
* Instantiate the public struct with all of our public information
@@ -201,6 +202,10 @@ static int device_cuda_component_register(void)
201202
false, false, 0, &parsec_device_gpu_one_profiling_stream_per_gpu_stream);
202203
#endif
203204

205+
(void)parsec_mca_param_reg_int_name("device_cuda", "delegate_task_completion",
206+
"Integer to choose the whether task completion should be delegated to a co-manager thread (default is no)",
207+
false, false, 0, &parsec_cuda_delegate_task_completion);
208+
204209
/* If CUDA was not requested avoid initializing the devices */
205210
return (0 == parsec_device_cuda_enabled ? MCA_ERROR : MCA_SUCCESS);
206211
}

0 commit comments

Comments
 (0)