ICLDisco · therault · Jan 22, 2024 · devreal · Jan 22, 2024 · devreal
@@ -530,12 +530,10 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
     gpu_device->memory_free      = parsec_cuda_memory_free;
     gpu_device->find_incarnation = parsec_cuda_find_incarnation;
 
-    if( PARSEC_SUCCESS != parsec_device_memory_reserve(gpu_device,
-                                                           parsec_cuda_memory_percentage,
-                                                           parsec_cuda_memory_number_of_blocks,
-                                                           parsec_cuda_memory_block_size) ) {
-        goto release_device;
-    }
+    /* Device memory initialization is delayed until first device use */
+    gpu_device->memory_percentage = parsec_cuda_memory_percentage;
+    gpu_device->number_blocks     = parsec_cuda_memory_number_of_blocks;
+    gpu_device->eltsize           = parsec_cuda_memory_block_size;
 
     if( show_caps ) {
         parsec_inform("GPU Device %-8s: %s [capability %d.%d] %s\n"

@@ -589,13 +589,9 @@ parsec_device_detach( parsec_device_module_t* device, parsec_context_t* context
  * can be reserved in a single allocation.
  */
 int
-parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
-                              int           memory_percentage,
-                              int           number_blocks,
-                              size_t        eltsize )
+parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device )
 {
     int rc;
-    (void)eltsize;
 
     size_t how_much_we_allocate;
     size_t total_mem, initial_free_mem;
@@ -610,17 +606,17 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
     if(PARSEC_SUCCESS != rc)
         return rc;
 
-    if( number_blocks != -1 ) {
-        if( number_blocks == 0 ) {
+    if( gpu_device->number_blocks != -1 ) {
+        if( gpu_device->number_blocks == 0 ) {
             parsec_warning("GPU[%s] Invalid argument: requesting 0 bytes of memory",
                            gpu_device->super.name);
             return PARSEC_ERROR;
         } else {
-            how_much_we_allocate = number_blocks * eltsize;
+            how_much_we_allocate = gpu_device->number_blocks * gpu_device->eltsize;
         }
     } else {
         /** number_blocks == -1 means memory_percentage is used */
-        how_much_we_allocate = (memory_percentage * initial_free_mem) / 100;
+        how_much_we_allocate = (gpu_device->memory_percentage * initial_free_mem) / 100;
     }
     if( how_much_we_allocate > initial_free_mem ) {
         /** Handle the case of jokers who require more than 100% of memory,
@@ -631,7 +627,7 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
                        gpu_device->super.name, how_much_we_allocate, initial_free_mem);
         how_much_we_allocate = initial_free_mem;
     }
-    if( how_much_we_allocate < eltsize ) {
+    if( how_much_we_allocate < gpu_device->eltsize ) {
         /** Handle another kind of jokers entirely, and cases of
          *  not enough memory on the device
          */
@@ -646,12 +642,12 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
      * We allocate a bunch of tiles that will be used
      * during the computations
      */
-    while( (free_mem > eltsize )
+    while( (free_mem > gpu_device->eltsize )
            && ((total_mem - free_mem) < how_much_we_allocate) ) {
         parsec_gpu_data_copy_t* gpu_elem;
         void *device_ptr;
 
-        rc = gpu_device->memory_allocate(gpu_device, eltsize, &device_ptr);
+        rc = gpu_device->memory_allocate(gpu_device, gpu_device->eltsize, &device_ptr);
         if(PARSEC_SUCCESS != rc) {
             size_t _free_mem, _total_mem;
             gpu_device->memory_info(gpu_device, &_free_mem, &_total_mem );
@@ -689,15 +685,15 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
         void* base_ptr;
         /* We allocate all the memory on the GPU and we use our memory management. */
         /* This computation leads to allocating more than available if we asked for more than GPU memory */
-        mem_elem_per_gpu = (how_much_we_allocate + eltsize - 1 ) / eltsize;
-        size_t total_size = (size_t)mem_elem_per_gpu * eltsize;
+        mem_elem_per_gpu = (how_much_we_allocate + gpu_device->eltsize - 1 ) / gpu_device->eltsize;
+        size_t total_size = (size_t)mem_elem_per_gpu * gpu_device->eltsize;
 
         if (total_size > initial_free_mem) {
             /* Mapping more than 100% of GPU memory is obviously wrong */
             /* Mapping exactly 100% of the GPU memory ends up producing errors about __global__ function call is not configured */
             /* Mapping 95% works with low-end GPUs like 1060, how much to let available for gpu runtime, I don't know how to calculate */
-            total_size = (size_t)((int)(.9*initial_free_mem / eltsize)) * eltsize;
-            mem_elem_per_gpu = total_size / eltsize;
+            total_size = (size_t)((int)(.9*initial_free_mem / gpu_device->eltsize)) * gpu_device->eltsize;
+            mem_elem_per_gpu = total_size / gpu_device->eltsize;
         }
         rc = gpu_device->memory_allocate(gpu_device, total_size, &base_ptr);
         if(PARSEC_SUCCESS != rc) { 
@@ -707,7 +703,7 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
             return PARSEC_ERROR;
         }
 
-        gpu_device->memory = zone_malloc_init( base_ptr, mem_elem_per_gpu, eltsize );
+        gpu_device->memory = zone_malloc_init( base_ptr, mem_elem_per_gpu, gpu_device->eltsize );
 
         if( gpu_device->memory == NULL ) {
             parsec_warning("GPU[%s] Cannot allocate memory on GPU %s. Skip it!",
@@ -716,10 +712,10 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
         }
         PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
                             "GPU[%s] Allocate %u segments of size %d on the GPU memory",
-                             gpu_device->super.name, mem_elem_per_gpu, eltsize );
+                             gpu_device->super.name, mem_elem_per_gpu, gpu_device->eltsize );
     }
 #endif
-    gpu_device->mem_block_size = eltsize;
+    gpu_device->mem_block_size = gpu_device->eltsize;
     gpu_device->mem_nb_blocks = mem_elem_per_gpu;
 
     return PARSEC_SUCCESS;
@@ -789,7 +785,7 @@ parsec_device_flush_lru( parsec_device_module_t *device )
     parsec_device_memory_release_list(gpu_device, &gpu_device->gpu_mem_owned_lru);
     parsec_device_free_workspace(gpu_device);
 #if !defined(PARSEC_GPU_ALLOC_PER_TILE) && !defined(_NDEBUG)
-    if( (in_use = zone_in_use(gpu_device->memory)) != 0 ) {
+    if( NULL != gpu_device->memory && (in_use = zone_in_use(gpu_device->memory)) != 0 ) {
         parsec_warning("GPU[%s] memory leak detected: %lu bytes still allocated on GPU",
                        device->name, in_use);
         assert(0);
@@ -818,12 +814,13 @@ parsec_device_memory_release( parsec_device_gpu_module_t* gpu_device )
     parsec_device_flush_lru(&gpu_device->super);
 
 #if !defined(PARSEC_GPU_ALLOC_PER_TILE)
-    assert( NULL != gpu_device->memory );
-    void* ptr = zone_malloc_fini(&gpu_device->memory);
-    rc = gpu_device->memory_free(gpu_device, ptr);
-    if(PARSEC_SUCCESS != rc) {
-        parsec_warning("Failed to free the GPU backend memory."); 
-        return rc;
+    if( NULL != gpu_device->memory ) {
+        void* ptr = zone_malloc_fini(&gpu_device->memory);
+        rc = gpu_device->memory_free(gpu_device, ptr);
+        if(PARSEC_SUCCESS != rc) {
+            parsec_warning("Failed to free the GPU backend memory."); 
+            return rc;
+        }
     }
 #endif
 
@@ -859,6 +856,13 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
 
     (void)copy_readers_update; // potentially unused
 
+    if(NULL == gpu_device->memory) {
+        if( PARSEC_SUCCESS != parsec_device_memory_reserve(gpu_device) ) {
+            parsec_warning("Could not allocate memory space on device %s", gpu_device->super.name);
+            return PARSEC_HOOK_RETURN_ERROR;
+        }
+    }
+
     /**
      * Parse all the input and output flows of data and ensure all have
      * corresponding data on the GPU available.

@@ -231,6 +231,11 @@ struct parsec_device_gpu_module_s {
                                                    *   is increased every time a new data is made available, so
                                                    *   that we know which tasks can be evaluated for submission.
                                                    */
+
+    int                         memory_percentage; /**< What % of the memory available on the device we want to use*/
+    int                         number_blocks;     /**< In case memory_percentage is not set, how many blocks we want to allocate on the device */
+    size_t                      eltsize;           /**< And what size in byte are these blocks */ 
-    int                         memory_percentage; /**< What % of the memory available on the device we want to use*/
-    int                         number_blocks;     /**< In case memory_percentage is not set, how many blocks we want to allocate on the device */
-    size_t                      eltsize;           /**< And what size in byte are these blocks */ 
+    int                        memory_percentage; /**< What % of the memory available on the device we want to use*/
+    int                        number_blocks;     /**< In case memory_percentage is not set, how many blocks we want to allocate on the device */
+    size_t                     eltsize;           /**< And what size in byte are these blocks */ 
-    int                         memory_percentage; /**< What % of the memory available on the device we want to use*/
-    int                         number_blocks;     /**< In case memory_percentage is not set, how many blocks we want to allocate on the device */
-    size_t                      eltsize;           /**< And what size in byte are these blocks */ 
+    int                        memory_percentage; /**< What % of the memory available on the device we want to use*/
+    int                        number_blocks;     /**< In case memory_percentage is not set, how many blocks we want to allocate on the device */
+    size_t                     eltsize;           /**< And what size in byte are these blocks */ 
+
     parsec_list_t              gpu_mem_lru;   /* Read-only blocks, and fresh blocks */
     parsec_list_t              gpu_mem_owned_lru;  /* Dirty blocks */
     parsec_fifo_t              pending;
@@ -334,10 +339,7 @@ typedef struct {
 
 #endif  /* defined(PROFILING) */
 
-int parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
-                                      int           memory_percentage,
-                                      int           number_blocks,
-                                      size_t        eltsize );
+int parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device );
 int parsec_device_attach( parsec_device_module_t* device, parsec_context_t* context );
 int parsec_device_detach( parsec_device_module_t* device, parsec_context_t* context );
 int parsec_device_taskpool_register(parsec_device_module_t* device, parsec_taskpool_t* tp);

@@ -425,12 +425,10 @@ int parsec_level_zero_module_init( int dev_id, parsec_device_level_zero_driver_t
     gpu_device->memory_free      = parsec_level_zero_memory_free;
     gpu_device->find_incarnation = parsec_level_zero_find_incarnation;
 
-    if( PARSEC_SUCCESS != parsec_device_memory_reserve(gpu_device,
-                                                           parsec_level_zero_memory_percentage,
-                                                           parsec_level_zero_memory_number_of_blocks,
-                                                           parsec_level_zero_memory_block_size) ) {
-        goto release_device;
-    }
+    /* Device memory initialization is delayed until first device use */
+    gpu_device->memory_percentage = parsec_level_zero_memory_percentage;
+    gpu_device->number_blocks     = parsec_level_zero_memory_number_of_blocks;
+    gpu_device->eltsize           = parsec_level_zero_memory_block_size;
 
     if( show_caps ) {
         parsec_inform("LEVEL ZERO GPU Device %d: %s\n"

@@ -186,8 +186,11 @@ int __parsec_execute( parsec_execution_stream_t* es,
                 task->status = PARSEC_TASK_STATUS_COMPLETE;
                 if(PARSEC_DEV_RECURSIVE >= tc->incarnations[chore_id].type) {
                     /* accelerators count their own executed tasks */
+#if 0
+                    /* This is wrong: chore_id is the index in incarnations, but it's not the device id */
                     parsec_device_module_t *dev = parsec_mca_device_get(chore_id);
                     parsec_atomic_fetch_inc_int64((int64_t*)&dev->executed_tasks);
+#endif
                 }
             }
             /* Record EXEC_END event only for incarnation that succeeds */