diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c
index bc0a22adc..a8790108e 100644
--- a/parsec/interfaces/dtd/insert_function.c
+++ b/parsec/interfaces/dtd/insert_function.c
@@ -40,6 +40,10 @@
 #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
 #include "parsec/mca/device/cuda/device_cuda.h"
+#include "parsec/sys/tls.h"
+
+extern PARSEC_TLS_DECLARE(co_manager_tls);
+
 #endif  /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */
 
 #include "parsec/mca/mca_repository.h"
@@ -224,8 +228,8 @@ static int parsec_dtd_taskpool_leave_wait(parsec_taskpool_t* tp, void*_)
     parsec_termdet_open_module(tp, "local");
     tp->tdm.module->monitor_taskpool(tp, parsec_taskpool_termination_detected);
     tp->tdm.module->taskpool_set_nb_tasks(tp, 0);
-    tp->tdm.module->taskpool_set_runtime_actions(tp, 0);
-
+    tp->tdm.module->taskpool_set_runtime_actions(tp, 0);
+
     /* We are re-attached to the context */
     parsec_atomic_fetch_inc_int32(&tp->context->active_taskpools);
     return PARSEC_SUCCESS;
@@ -721,7 +725,7 @@ parsec_dtd_add_profiling_info(parsec_taskpool_t *tp,
 }
 
 void
-parsec_dtd_add_profiling_info_generic(parsec_taskpool_t *tp,
+parsec_dtd_add_profiling_info_generic(parsec_taskpool_t *tp,
                                       const char *name,
                                       int *keyin, int *keyout)
 {
@@ -1002,7 +1006,7 @@ parsec_dtd_insert_task_class(parsec_dtd_taskpool_t *tp,
     } else {
         char *fc = fill_color(tc->super.task_class_id, PARSEC_DTD_NB_TASK_CLASSES);
         parsec_profiling_add_dictionary_keyword(tc->super.name, fc,
-                                                sizeof(parsec_task_prof_info_t)+info_size,
+                                                sizeof(parsec_task_prof_info_t)+info_size,
                                                 info_str,
                                                 (int *)&PARSEC_PROF_FUNC_KEY_START(&tp->super, tc->super.task_class_id),
                                                 (int *)&PARSEC_PROF_FUNC_KEY_END(&tp->super, tc->super.task_class_id));
@@ -1789,9 +1793,63 @@ parsec_dtd_release_deps(parsec_execution_stream_t *es,
 
     /* Scheduling tasks */
     if( action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS ) {
-        __parsec_schedule_vp(es, arg.ready_lists, 0);
-    }
+        int nb_task_rings = es->virtual_process->parsec_context->nb_vp;
+
+        /* Iterate through the per-VP task rings */
+        for( int vp = 0; vp < nb_task_rings; vp++ ) {
+            const parsec_vp_t** vps = (const parsec_vp_t**)es->virtual_process->parsec_context->virtual_processes;
+            parsec_execution_stream_t* target_es = vps[vp]->execution_streams[0];
+#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
+            parsec_device_module_t** co_manager_tls_val = PARSEC_TLS_GET_SPECIFIC(co_manager_tls);
+
+            if( co_manager_tls_val != NULL ) {
+                /* I am the co-manager */
+                parsec_task_t* task_ring = arg.ready_lists[vp];
+                parsec_task_t* current_task = task_ring;
+#if defined(PARSEC_DEBUG_NOISIER)
+                char tmp[MAX_TASK_STRLEN];
+#endif
+                /* iterate through the individual tasks of the ring */
+                while( task_ring != NULL )
+                {
+                    task_ring = (parsec_task_t*)parsec_list_item_ring_chop( &current_task->super );
+                    parsec_list_item_singleton( (parsec_list_item_t*)current_task );
+
+                    if( PARSEC_DTD_FLUSH_TC_ID == current_task->task_class->task_class_id )
+                    {
+                        PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]: Thread %d scheduling task %s at %s:%d",
+                                             (*co_manager_tls_val)->name, es->th_id,
+                                             parsec_task_snprintf(tmp, MAX_TASK_STRLEN, current_task), __FILE__, __LINE__);
+                        __parsec_schedule(target_es, current_task, 0);
+                    }
+                    else
+                    {
+                        /* try to skip the scheduler */
+                        PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]: Thread %d trying to execute task %s %p at %s:%d",
+                                             (*co_manager_tls_val)->name, es->th_id,
+                                             parsec_task_snprintf(tmp, MAX_TASK_STRLEN, current_task), current_task, __FILE__, __LINE__);
+                        int rc = __parsec_execute(target_es, current_task);
+                        if( rc != PARSEC_HOOK_RETURN_ASYNC ) {
+                            /* failed to shortcut, schedule normally */
+                            PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]: Thread %d resorts to scheduling task %s %p at %s:%d",
+                                                 (*co_manager_tls_val)->name, es->th_id,
+                                                 parsec_task_snprintf(tmp, MAX_TASK_STRLEN, current_task), current_task, __FILE__, __LINE__);
+                            __parsec_schedule(target_es, current_task, 0);
+                        }
+                    }
+                    current_task = task_ring;
+                }
+                arg.ready_lists[vp] = NULL;
+            }
+#endif  /* PARSEC_HAVE_DEV_CUDA_SUPPORT */
+            if( arg.ready_lists[vp] != NULL ) {
+                __parsec_schedule(target_es, arg.ready_lists[vp], 0);
+                arg.ready_lists[vp] = NULL;
+            }
+        }
+    }
 
     PARSEC_PINS(es, RELEASE_DEPS_END, this_task);
     return 0;
 }
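The release_deps hunk above is where the co-manager short-circuits the scheduler: instead of handing the whole ready ring to __parsec_schedule_vp(), it detaches tasks one by one and tries to execute each inline, falling back to the scheduler whenever the hook declines. The sketch below condenses that ring-walk idiom; drain_ready_ring() is a hypothetical helper, not part of the patch, and the real code additionally routes PARSEC_DTD_FLUSH_TC_ID tasks straight to the scheduler.

    /* A sketch of the ring walk used above, assuming PaRSEC's circular
     * doubly-linked rings: detach one item per iteration so each task can be
     * handled individually. */
    static void drain_ready_ring(parsec_execution_stream_t *target_es,
                                 parsec_task_t *ring)
    {
        parsec_task_t *task = ring;
        while( NULL != ring ) {
            /* unlink 'task'; 'ring' keeps the remainder of the ring (or NULL) */
            ring = (parsec_task_t*)parsec_list_item_ring_chop(&task->super);
            parsec_list_item_singleton((parsec_list_item_t*)task);
            /* try inline execution; only schedule when the hook refuses */
            if( PARSEC_HOOK_RETURN_ASYNC != __parsec_execute(target_es, task) )
                __parsec_schedule(target_es, task, 0);
            task = ring;
        }
    }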
@@ -2127,7 +2185,7 @@ parsec_dtd_create_task_classv(const char *name,
                       (flow_count * sizeof(parsec_dtd_descendant_info_t)) +
                       (flow_count * sizeof(parsec_dtd_flow_info_t)) +
                       (nb_params * sizeof(parsec_dtd_task_param_t)) +
-                      total_size_of_param);
+                      total_size_of_param);
 
     parsec_mempool_construct(&dtd_tc->context_mempool,
                              PARSEC_OBJ_CLASS(parsec_dtd_task_t), total_size,
diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c
index b2cb9ec4b..6e109f3dc 100644
--- a/parsec/mca/device/cuda/device_cuda_component.c
+++ b/parsec/mca/device/cuda/device_cuda_component.c
@@ -41,6 +41,7 @@
 char* parsec_cuda_lib_path = NULL;
 
 static int cuda_mask, cuda_nvlink_mask;
+int parsec_cuda_delegate_task_completion = 0;
 
 /*
  * Instantiate the public struct with all of our public information
@@ -201,6 +202,10 @@ static int device_cuda_component_register(void)
                                        false, false, 0, &parsec_device_gpu_one_profiling_stream_per_gpu_stream);
 #endif
 
+    (void)parsec_mca_param_reg_int_name("device_cuda", "delegate_task_completion",
+                                        "Whether task completion should be delegated to a co-manager thread (default is no)",
+                                        false, false, 0, &parsec_cuda_delegate_task_completion);
+
     /* If CUDA was not requested avoid initializing the devices */
     return (0 == parsec_device_cuda_enabled ? MCA_ERROR : MCA_SUCCESS);
 }
diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c
index c3822e130..74b7db595 100644
--- a/parsec/mca/device/cuda/device_cuda_module.c
+++ b/parsec/mca/device/cuda/device_cuda_module.c
@@ -27,6 +27,10 @@
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 
+#include "parsec/sys/tls.h"
+
+PARSEC_TLS_DECLARE(co_manager_tls) = NULL;
+
 static int parsec_cuda_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int advice);
 /**
  * According to
@@ -45,6 +49,11 @@ parsec_cuda_memory_reserve( parsec_device_cuda_module_t* gpu_device,
 static int parsec_cuda_memory_release( parsec_device_cuda_module_t* gpu_device );
 static int parsec_cuda_flush_lru( parsec_device_module_t *device );
 
+/** MCA parameter that decides task delegation */
+extern int parsec_cuda_delegate_task_completion;
+parsec_hook_return_t
+parsec_cuda_co_manager( parsec_execution_stream_t *es, parsec_device_gpu_module_t* gpu_device );
+
 /* look up how many FMA per cycle in single/double, per cuda MP
  * precision.
  * The following table provides updated values for future archs
@@ -336,7 +345,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
     double fp16, fp32, fp64, tf32;
     struct cudaDeviceProp prop;
 
-    show_caps_index = parsec_mca_param_find("device", NULL, "show_capabilities");
+    show_caps_index = parsec_mca_param_find("device", NULL, "show_capabilities");
     if(0 < show_caps_index) {
         parsec_mca_param_lookup_int(show_caps_index, &show_caps);
     }
@@ -366,6 +375,9 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
     len = asprintf(&gpu_device->super.name, "cuda(%d)", dev_id);
     if(-1 == len) { gpu_device->super.name = NULL; goto release_device; }
 
     gpu_device->data_avail_epoch = 0;
+    gpu_device->mutex = 0;
+    gpu_device->complete_mutex = 0;
+    gpu_device->co_manager_mutex = 0;
 
     gpu_device->max_exec_streams = parsec_cuda_max_streams;
     gpu_device->exec_stream =
@@ -424,7 +436,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
         /* Each 'exec' stream gets its own profiling stream, except IN and OUT stream that share it.
          * It's good to separate the exec streams to know what was submitted to what stream
          * We don't have this issue for the IN and OUT streams because types of event discriminate
-         * what happens where, and separating them consumes memory and increases the number of
+         * what happens where, and separating them consumes memory and increases the number of
          * events that needs to be matched between streams because we cannot differentiate some
          * ends between IN or OUT, so they are all logged on the same stream. */
         if(j == 0 || (parsec_device_gpu_one_profiling_stream_per_gpu_stream == 1 && j != 1))
@@ -477,6 +489,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
     PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_lru, parsec_list_t);
     PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_owned_lru, parsec_list_t);
     PARSEC_OBJ_CONSTRUCT(&gpu_device->pending, parsec_fifo_t);
+    PARSEC_OBJ_CONSTRUCT(&gpu_device->to_complete, parsec_fifo_t);
 
     gpu_device->sort_starting_p = NULL;
     gpu_device->peer_access_mask = 0;  /* No GPU to GPU direct transfer by default */
@@ -537,7 +550,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
 #if defined(PARSEC_PROF_TRACE)
         if( NULL != exec_stream->profiling ) {
             /* No function to clean the profiling stream. If one is introduced
-             * some day, remember that exec streams 0 and 1 always share the same
+             * some day, remember that exec streams 0 and 1 always share the same
              * ->profiling stream, and that all of them share the same
              * ->profiling stream if parsec_device_cuda_one_profiling_stream_per_cuda_stream == 0 */
         }
@@ -569,6 +582,7 @@ parsec_cuda_module_fini(parsec_device_module_t* device)
 
     /* Release pending queue */
     PARSEC_OBJ_DESTRUCT(&gpu_device->pending);
+    PARSEC_OBJ_DESTRUCT(&gpu_device->to_complete);
 
     /* Release all streams */
     for( j = 0; j < gpu_device->num_exec_streams; j++ ) {
@@ -1303,7 +1317,7 @@ parsec_gpu_data_stage_in( parsec_device_cuda_module_t* cuda_device,
                              __FILE__, __LINE__);
     }
 
-    /* If data is from NEW (it doesn't have a source_repo_entry and is not a direct data collection reference),
+    /* If data is from NEW (it doesn't have a source_repo_entry and is not a direct data collection reference),
      * and nobody has touched it yet, then we don't need to pull it in, we have created it already, that's enough.
      */
     /*
      * TODO: this test is not correct for anything but PTG
@@ -1710,7 +1724,7 @@ parsec_gpu_send_transfercomplete_cmd_to_device(parsec_data_copy_t *copy,
     gpu_task->stage_in  = parsec_default_cuda_stage_in;
     gpu_task->stage_out = parsec_default_cuda_stage_out;
     gpu_task->ec->data[0].data_in = copy;   /* We need to set not-null in data_in, so that the fake flow is
-                                             * not ignored when poping the data from the fake task */
+                                             * not ignored when popping the data from the fake task */
     gpu_task->ec->data[0].data_out = copy;  /* We "free" data[i].data_out if its readers reaches 0 */
     gpu_task->ec->data[0].source_repo_entry = NULL;
     gpu_task->ec->data[0].source_repo = NULL;
@@ -2143,7 +2157,7 @@ parsec_cuda_kernel_push( parsec_device_gpu_module_t *gpu_device,
  * setup the profiling information and then calls directly into the task submission body. Upon
  * return from the body handle the state machine of the task, taking care of the special cases
  * such as AGAIN and ASYNC.
- * @returns An error if anything unexpected came out of the task submission body, otherwise
+ * @returns An error if anything unexpected came out of the task submission body, otherwise
  */
 static int
 parsec_cuda_kernel_exec( parsec_device_gpu_module_t *gpu_device,
@@ -2552,8 +2566,9 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
     parsec_device_gpu_module_t* gpu_device;
     parsec_device_cuda_module_t *cuda_device;
     cudaError_t status;
-    int rc, exec_stream = 0;
+    int rc, rc1, exec_stream = 0;
     parsec_gpu_task_t *progress_task, *out_task_submit = NULL, *out_task_pop = NULL;
+    int manager_completing_task = 0;
 #if defined(PARSEC_DEBUG_NOISIER)
     char tmp[MAX_TASK_STRLEN];
 #endif
@@ -2564,6 +2579,8 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
 
     parsec_atomic_fetch_add_int64(&gpu_device->super.device_load, gpu_task->load);
 
+    parsec_device_module_t** co_manager_tls_val = PARSEC_TLS_GET_SPECIFIC(co_manager_tls);
+
 #if defined(PARSEC_PROF_TRACE)
     PARSEC_PROFILING_TRACE_FLAGS( es->es_profile,
                                   PARSEC_PROF_FUNC_KEY_END(gpu_task->ec->taskpool,
@@ -2579,12 +2596,15 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
      * - rc == 0: there is no manager, and at the exit of the while, this thread
      *            made rc go from 0 to 1, so it is the new manager of the GPU and
      *            needs to deal with gpu_task
+     *            or
+     *            there is a manager, but it is waiting for the co-manager to release
+     *            new tasks, so we just append our task and do not become the manager
      * - rc > 0: there is a manager, and at the exit of the while, this thread has
      *           committed new work that the manager will need to do, but the work is
      *           not in the queue yet.
      */
     while(1) {
-        rc = gpu_device->mutex;
+        rc = rc1 = gpu_device->mutex;
         struct timespec delay;
         if( rc >= 0 ) {
             if( parsec_atomic_cas_int32( &gpu_device->mutex, rc, rc+1 ) ) {
@@ -2596,10 +2616,40 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
             nanosleep(&delay, NULL);
         }
     }
-    if( 0 < rc ) {
+    if( 0 < rc || gpu_device->co_manager_mutex > 0 ) {
         parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task );
+
+        if( 1 == parsec_cuda_delegate_task_completion )
+        {
+            /**
+             * The second thread that pushes a task to the device transitions
+             * into a co-manager.
+             *
+             * 'rc1 == 1' is important, or the manager thread itself would
+             * transition into a co-manager. 'co_manager_mutex == 0' ensures
+             * that there is only one co-manager per device.
+             * The TLS value ensures that one device's co-manager does not
+             * become another device's co-manager.
+             */
+            if( rc1 == 1 && gpu_device->co_manager_mutex == 0 && co_manager_tls_val == NULL )
+            {
+                parsec_cuda_co_manager(es, gpu_device);
+            }
+        }
+
         return PARSEC_HOOK_RETURN_ASYNC;
     }
+    /**
+     * If there is no manager, the co-manager cannot bypass the scheduler.
+     */
+    if( co_manager_tls_val != NULL ) {
+        parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) );
+        parsec_atomic_fetch_add_int64(&gpu_device->super.device_load, -gpu_task->load);
+        PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%s]: No manager on device %s, cannot shortcut so back to scheduling %s:%d",
+                             (*co_manager_tls_val)->name, gpu_device->super.name, __FILE__, __LINE__);
+        return PARSEC_HOOK_RETURN_AGAIN;
+    }
 
     PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,"GPU[%s]: Entering GPU management at %s:%d",
                          gpu_device->super.name, __FILE__, __LINE__);
@@ -2670,6 +2720,7 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
             __parsec_reschedule(es, progress_task->ec);
             gpu_task = progress_task;
             progress_task = NULL;
+            manager_completing_task = 1;
             goto remove_gpu_task;
         }
         gpu_task = NULL;
@@ -2727,8 +2778,17 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
         } else {
             pop_null++;
             if( pop_null % 1024 == 1023 ) {
-                PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%s]:\tStill waiting for %d tasks to execute, but poped NULL the last %d times I tried to pop something...",
-                                     gpu_device->super.name, gpu_device->mutex, pop_null);
+                if( gpu_device->mutex > 0 )
+                    PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%s]:\tStill waiting for %d tasks to execute, but popped NULL the last %d times I tried to pop something...",
+                                         gpu_device->super.name, gpu_device->mutex, pop_null);
+                else
+                    PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%s]:\tStill waiting for the co-manager, but popped NULL the last %d times I tried to pop something...",
+                                         gpu_device->super.name, pop_null);
+            }
+            if( gpu_device->mutex == 0 && gpu_device->co_manager_mutex == 0 ) {
+                PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%s]: Leaving GPU management at %s:%d",
+                                     gpu_device->super.name, __FILE__, __LINE__);
+                return PARSEC_HOOK_RETURN_ASYNC;
             }
         }
         goto check_in_deps;
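For context, here is the manager-election protocol that the new rc1 snapshot piggybacks on: every thread that brings work atomically increments gpu_device->mutex, and only the thread that moves it from 0 to 1 becomes the manager; all others have merely registered work. A minimal sketch of the existing loop, with the nanosleep() back-off elided:

    /* Sketch of the election loop above; a negative counter means the device
     * is being disabled, so the thread backs off and retries. */
    int rc;
    while(1) {
        rc = gpu_device->mutex;                      /* snapshot, may be stale */
        if( rc >= 0 && parsec_atomic_cas_int32(&gpu_device->mutex, rc, rc+1) )
            break;      /* counted in: rc == 0 means this thread is now manager */
        /* CAS lost or device disabled: back off and try again */
    }

With delegation enabled, rc1 == 1 at this point identifies the second thread to arrive at a managed device, which is the only thread allowed to turn into its co-manager.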
@@ -2748,24 +2808,60 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
     if (gpu_task->task_type == PARSEC_GPU_TASK_TYPE_D2D_COMPLETE) {
         free( gpu_task->ec );
         gpu_task->ec = NULL;
+        manager_completing_task = 1;
         goto remove_gpu_task;
     }
     parsec_cuda_kernel_epilog( gpu_device, gpu_task );
-    __parsec_complete_execution( es, gpu_task->ec );
     gpu_device->super.executed_tasks++;
+
+    /** The manager completes the task itself */
+    if( parsec_cuda_delegate_task_completion == 0 )
+    {
+        __parsec_complete_execution( es, gpu_task->ec );
+        manager_completing_task = 1;
+    }
+    /** The co-manager completes the task, but first check whether such a
+     *  co-manager is active */
+    else if ( gpu_device->co_manager_mutex > 0 )
+    {
+        parsec_atomic_fetch_inc_int32( &(gpu_device->complete_mutex) );
+        parsec_fifo_push( &(gpu_device->to_complete), (parsec_list_item_t*)gpu_task );
+        manager_completing_task = 0;
+    }
+    /** The co-manager is not ready yet */
+    else
+    {
+        __parsec_complete_execution( es, gpu_task->ec );
+        manager_completing_task = 1;
+    }
+
 remove_gpu_task:
     parsec_atomic_fetch_add_int64(&gpu_device->super.device_load, -gpu_task->load);
-    PARSEC_DEBUG_VERBOSE(3, parsec_gpu_output_stream,"GPU[%s]: gpu_task %p freed at %s:%d", gpu_device->super.name,
-                         gpu_task, __FILE__, __LINE__);
-    free( gpu_task );
+
+    /* free the task here only if the manager is completing it */
+    if( manager_completing_task == 1 )
+    {
+        PARSEC_DEBUG_VERBOSE(3, parsec_gpu_output_stream, "GPU[%s]: gpu_task %p freed at %s:%d", gpu_device->super.name,
+                             gpu_task, __FILE__, __LINE__);
+        free( gpu_task );
+    }
+
     rc = parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) );
+
+    /** Keep the manager from quitting while the co-manager is completing tasks,
+     *  and wait for the co-manager to hand back short-cut tasks.
+     */
+    if( gpu_device->complete_mutex > 0 && gpu_device->mutex == 0 ) {
+        gpu_task = progress_task;
+        goto fetch_task_from_shared_queue;
+    }
+
     if( 1 == rc ) {  /* I was the last one */
 #if defined(PARSEC_PROF_TRACE)
         if( parsec_gpu_trackable_events & PARSEC_PROFILE_GPU_TRACK_OWN )
             PARSEC_PROFILING_TRACE( es->es_profile, parsec_gpu_own_GPU_key_end,
                                     (unsigned long)es, PROFILE_OBJECT_ID_NULL, NULL );
 #endif /* defined(PARSEC_PROF_TRACE) */
-        PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,"GPU[%s]: Leaving GPU management at %s:%d",
+        PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%s]: Leaving GPU management at %s:%d",
                              gpu_device->super.name, __FILE__, __LINE__);
         /* inform the upper layer not to use the task argument, it has been long gone */
         return PARSEC_HOOK_RETURN_ASYNC;
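The completion hunk above gives the manager three choices once the epilog has run: complete inline (delegation off), hand the task to an active co-manager through the to_complete FIFO, or complete inline anyway because no co-manager is up yet. A condensed sketch of that decision; manager_complete() is a hypothetical name, but the counters are the real fields:

    /* Sketch of the completion handoff. 'complete_mutex' counts completions
     * still owed to the co-manager; in the delegated branch the co-manager
     * also owns freeing the gpu_task. */
    static void manager_complete(parsec_execution_stream_t *es,
                                 parsec_device_gpu_module_t *gpu,
                                 parsec_gpu_task_t *gpu_task)
    {
        if( parsec_cuda_delegate_task_completion && gpu->co_manager_mutex > 0 ) {
            parsec_atomic_fetch_inc_int32(&gpu->complete_mutex);
            parsec_fifo_push(&gpu->to_complete, (parsec_list_item_t*)gpu_task);
        } else {
            __parsec_complete_execution(es, gpu_task->ec);
            free(gpu_task);   /* the manager owns the task in this branch */
        }
    }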
While completing the tasks, if any ready tasks are discovered + * they will be given back directly to the manager and will not go through the + * scheduler + */ +parsec_hook_return_t +parsec_cuda_co_manager( parsec_execution_stream_t *es, parsec_device_gpu_module_t* gpu_device ) +{ + int rc = 0; + parsec_task_t* task = NULL; + parsec_gpu_task_t *gpu_task = NULL; + parsec_list_t *gpu_tasks_to_free = NULL; + (void)es; + + if( gpu_device->co_manager_mutex > 0 ) + { + return PARSEC_HOOK_RETURN_ASYNC; + } + else + { + rc = gpu_device->co_manager_mutex; + if( !parsec_atomic_cas_int32( &gpu_device->co_manager_mutex, rc, rc+1 ) ) + { + return PARSEC_HOOK_RETURN_ASYNC; + } + } + + PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,"GPU[%s]: Thread %d Entering GPU co-management at %s:%d", + gpu_device->super.name, es->th_id, __FILE__, __LINE__); + + /** + * Thread-local variable to identify that this thread is the co-manager + * The value allows to differentiate between the different co-managers (debug output mostly) + */ + parsec_device_module_t** co_manager_tls_val = malloc(sizeof(parsec_device_module_t*)); + *co_manager_tls_val = &(gpu_device->super); + PARSEC_TLS_KEY_CREATE(co_manager_tls); + PARSEC_TLS_SET_SPECIFIC(co_manager_tls, co_manager_tls_val); + + gpu_tasks_to_free = PARSEC_OBJ_NEW(parsec_list_t); + + /** + * The co-manager can be created before any task is to be completed, so wait while mutex > 0 + * then complete the tasks. + */ + while( gpu_device->mutex > 0 || gpu_device->complete_mutex > 0 ) + { + gpu_task = NULL; + task = NULL; + + gpu_task = (parsec_gpu_task_t*)parsec_fifo_pop( &(gpu_device->to_complete) ); + if( gpu_task != NULL) + { + task = gpu_task->ec; + __parsec_complete_execution( es, task ); + PARSEC_DEBUG_VERBOSE(4, parsec_gpu_output_stream,"GPU[%s]: gpu_task %p completed by co-manager %d at %s:%d", gpu_device->super.name, + gpu_task, es->th_id, __FILE__, __LINE__); + parsec_atomic_fetch_dec_int32( &(gpu_device->complete_mutex) ); + parsec_list_push_back(gpu_tasks_to_free, (parsec_list_item_t*)gpu_task); + } + } + /* has to be done as soon as possible because freeing the tasks takes a lot of time */ + parsec_atomic_fetch_dec_int32( &(gpu_device->co_manager_mutex) ); + /** + * We free the task delegated to the co-manager + * because the manager doesn't know when we are done with them + */ + while(NULL != (gpu_task = (parsec_gpu_task_t*)parsec_list_pop_front(gpu_tasks_to_free)) ) + { + PARSEC_DEBUG_VERBOSE(4, parsec_gpu_output_stream,"GPU[%s]: gpu_task %p freed by co-manager %d at %s:%d", gpu_device->super.name, + gpu_task, es->th_id, __FILE__, __LINE__); + free(gpu_task); + } + + + + PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,"GPU[%s]: Thread %d Leaving GPU co-management at %s:%d", + gpu_device->super.name, es->th_id, __FILE__, __LINE__); + + PARSEC_OBJ_RELEASE(gpu_tasks_to_free); + PARSEC_TLS_SET_SPECIFIC(co_manager_tls, NULL); + free(co_manager_tls_val); + + return PARSEC_HOOK_RETURN_ASYNC; +} + +#endif /* PARSEC_HAVE_CUDA */ diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index b48276027..bbae8d81d 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -129,6 +129,9 @@ struct parsec_device_gpu_module_s { * the index of the set bit device. */ volatile int32_t mutex; + volatile int32_t complete_mutex; /** tracks the number of tasks to be completed by the + * co-manageer + */ uint64_t data_avail_epoch; /**< Identifies the epoch of the data status on the device. 
diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h
index b48276027..bbae8d81d 100644
--- a/parsec/mca/device/device_gpu.h
+++ b/parsec/mca/device/device_gpu.h
@@ -129,6 +129,9 @@ struct parsec_device_gpu_module_s {
                                           * the index of the set bit device.
                                           */
     volatile int32_t mutex;
+    volatile int32_t complete_mutex;     /**< tracks the number of tasks to be completed
+                                          *   by the co-manager
+                                          */
     uint64_t data_avail_epoch;           /**< Identifies the epoch of the data status on the device. It
                                           * is increased every time a new data is made available, so
                                           * that we know which tasks can be evaluated for submission.
@@ -136,11 +139,13 @@ struct parsec_device_gpu_module_s {
     parsec_list_t gpu_mem_lru;           /* Read-only blocks, and fresh blocks */
     parsec_list_t gpu_mem_owned_lru;     /* Dirty blocks */
     parsec_fifo_t pending;
+    parsec_fifo_t to_complete;           /**< list of tasks to be completed by the co-manager */
     struct zone_malloc_s *memory;
     parsec_list_item_t *sort_starting_p;
     parsec_gpu_exec_stream_t **exec_stream;
     size_t mem_block_size;
     int64_t mem_nb_blocks;
+    volatile int32_t co_manager_mutex;   /**< ensures that there is only one co-manager per device */
 };
 
 struct parsec_gpu_exec_stream_s {
diff --git a/parsec/scheduling.c b/parsec/scheduling.c
index abc7abcf0..eed39fd3c 100644
--- a/parsec/scheduling.c
+++ b/parsec/scheduling.c
@@ -363,7 +363,7 @@ __parsec_schedule(parsec_execution_stream_t* es,
  * If the provided execution stream is NULL, all tasks are delivered to their
  * respective vp.
  *
- * Beware, as the manipulation of next_task is not protected, an exeuction
+ * Beware, as the manipulation of next_task is not protected, an execution
  * stream should never be used concurrently in two call to this function (or
  * a thread should never `borrow` an execution stream for this call).
  */
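The new knob is off by default. A sketch of flipping it on from an application, assuming PaRSEC's usual PARSEC_MCA_<name> environment convention for MCA parameters (an mca-params.conf entry would work equally well where supported):

    /* Hypothetical usage, not part of the patch: enable delegation before
     * parsec_init() registers the CUDA component. */
    #include <stdlib.h>
    #include "parsec.h"

    int main(int argc, char *argv[])
    {
        setenv("PARSEC_MCA_device_cuda_delegate_task_completion", "1", 1);
        parsec_context_t *parsec = parsec_init(-1 /* all cores */, &argc, &argv);
        /* ... insert and run taskpools; GPU task completion is now handed to a
         * co-manager thread whenever one is active ... */
        parsec_fini(&parsec);
        return 0;
    }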