Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions parsec/mca/device/cuda/device_cuda_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -530,12 +530,10 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
gpu_device->memory_free = parsec_cuda_memory_free;
gpu_device->find_incarnation = parsec_cuda_find_incarnation;

if( PARSEC_SUCCESS != parsec_device_memory_reserve(gpu_device,
parsec_cuda_memory_percentage,
parsec_cuda_memory_number_of_blocks,
parsec_cuda_memory_block_size) ) {
goto release_device;
}
/* Device memory initialization is delayed until first device use */
gpu_device->memory_percentage = parsec_cuda_memory_percentage;
gpu_device->number_blocks = parsec_cuda_memory_number_of_blocks;
gpu_device->eltsize = parsec_cuda_memory_block_size;

if( show_caps ) {
parsec_inform("GPU Device %-8s: %s [capability %d.%d] %s\n"
Expand Down
56 changes: 30 additions & 26 deletions parsec/mca/device/device_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -589,13 +589,9 @@ parsec_device_detach( parsec_device_module_t* device, parsec_context_t* context
* can be reserved in a single allocation.
*/
int
parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
int memory_percentage,
int number_blocks,
size_t eltsize )
parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device )
{
int rc;
(void)eltsize;

size_t how_much_we_allocate;
size_t total_mem, initial_free_mem;
Expand All @@ -610,17 +606,17 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
if(PARSEC_SUCCESS != rc)
return rc;

if( number_blocks != -1 ) {
if( number_blocks == 0 ) {
if( gpu_device->number_blocks != -1 ) {
if( gpu_device->number_blocks == 0 ) {
parsec_warning("GPU[%s] Invalid argument: requesting 0 bytes of memory",
gpu_device->super.name);
return PARSEC_ERROR;
} else {
how_much_we_allocate = number_blocks * eltsize;
how_much_we_allocate = gpu_device->number_blocks * gpu_device->eltsize;
}
} else {
/** number_blocks == -1 means memory_percentage is used */
how_much_we_allocate = (memory_percentage * initial_free_mem) / 100;
how_much_we_allocate = (gpu_device->memory_percentage * initial_free_mem) / 100;
}
if( how_much_we_allocate > initial_free_mem ) {
/** Handle the case of jokers who require more than 100% of memory,
Expand All @@ -631,7 +627,7 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
gpu_device->super.name, how_much_we_allocate, initial_free_mem);
how_much_we_allocate = initial_free_mem;
}
if( how_much_we_allocate < eltsize ) {
if( how_much_we_allocate < gpu_device->eltsize ) {
/** Handle another kind of jokers entirely, and cases of
* not enough memory on the device
*/
Expand All @@ -646,12 +642,12 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
* We allocate a bunch of tiles that will be used
* during the computations
*/
while( (free_mem > eltsize )
while( (free_mem > gpu_device->eltsize )
&& ((total_mem - free_mem) < how_much_we_allocate) ) {
parsec_gpu_data_copy_t* gpu_elem;
void *device_ptr;

rc = gpu_device->memory_allocate(gpu_device, eltsize, &device_ptr);
rc = gpu_device->memory_allocate(gpu_device, gpu_device->eltsize, &device_ptr);
if(PARSEC_SUCCESS != rc) {
size_t _free_mem, _total_mem;
gpu_device->memory_info(gpu_device, &_free_mem, &_total_mem );
Expand Down Expand Up @@ -689,15 +685,15 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
void* base_ptr;
/* We allocate all the memory on the GPU and we use our memory management. */
/* This computation leads to allocating more than available if we asked for more than GPU memory */
mem_elem_per_gpu = (how_much_we_allocate + eltsize - 1 ) / eltsize;
size_t total_size = (size_t)mem_elem_per_gpu * eltsize;
mem_elem_per_gpu = (how_much_we_allocate + gpu_device->eltsize - 1 ) / gpu_device->eltsize;
size_t total_size = (size_t)mem_elem_per_gpu * gpu_device->eltsize;

if (total_size > initial_free_mem) {
/* Mapping more than 100% of GPU memory is obviously wrong */
/* Mapping exactly 100% of the GPU memory ends up producing errors about __global__ function call is not configured */
/* Mapping 95% works with low-end GPUs like 1060, how much to let available for gpu runtime, I don't know how to calculate */
total_size = (size_t)((int)(.9*initial_free_mem / eltsize)) * eltsize;
mem_elem_per_gpu = total_size / eltsize;
total_size = (size_t)((int)(.9*initial_free_mem / gpu_device->eltsize)) * gpu_device->eltsize;
mem_elem_per_gpu = total_size / gpu_device->eltsize;
}
rc = gpu_device->memory_allocate(gpu_device, total_size, &base_ptr);
if(PARSEC_SUCCESS != rc) {
Expand All @@ -707,7 +703,7 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
return PARSEC_ERROR;
}

gpu_device->memory = zone_malloc_init( base_ptr, mem_elem_per_gpu, eltsize );
gpu_device->memory = zone_malloc_init( base_ptr, mem_elem_per_gpu, gpu_device->eltsize );

if( gpu_device->memory == NULL ) {
parsec_warning("GPU[%s] Cannot allocate memory on GPU %s. Skip it!",
Expand All @@ -716,10 +712,10 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
}
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s] Allocate %u segments of size %d on the GPU memory",
gpu_device->super.name, mem_elem_per_gpu, eltsize );
gpu_device->super.name, mem_elem_per_gpu, gpu_device->eltsize );
}
#endif
gpu_device->mem_block_size = eltsize;
gpu_device->mem_block_size = gpu_device->eltsize;
gpu_device->mem_nb_blocks = mem_elem_per_gpu;

return PARSEC_SUCCESS;
Expand Down Expand Up @@ -789,7 +785,7 @@ parsec_device_flush_lru( parsec_device_module_t *device )
parsec_device_memory_release_list(gpu_device, &gpu_device->gpu_mem_owned_lru);
parsec_device_free_workspace(gpu_device);
#if !defined(PARSEC_GPU_ALLOC_PER_TILE) && !defined(_NDEBUG)
if( (in_use = zone_in_use(gpu_device->memory)) != 0 ) {
if( NULL != gpu_device->memory && (in_use = zone_in_use(gpu_device->memory)) != 0 ) {
parsec_warning("GPU[%s] memory leak detected: %lu bytes still allocated on GPU",
device->name, in_use);
assert(0);
Expand Down Expand Up @@ -818,12 +814,13 @@ parsec_device_memory_release( parsec_device_gpu_module_t* gpu_device )
parsec_device_flush_lru(&gpu_device->super);

#if !defined(PARSEC_GPU_ALLOC_PER_TILE)
assert( NULL != gpu_device->memory );
void* ptr = zone_malloc_fini(&gpu_device->memory);
rc = gpu_device->memory_free(gpu_device, ptr);
if(PARSEC_SUCCESS != rc) {
parsec_warning("Failed to free the GPU backend memory.");
return rc;
if( NULL != gpu_device->memory ) {
void* ptr = zone_malloc_fini(&gpu_device->memory);
rc = gpu_device->memory_free(gpu_device, ptr);
if(PARSEC_SUCCESS != rc) {
parsec_warning("Failed to free the GPU backend memory.");
return rc;
}
}
#endif

Expand Down Expand Up @@ -859,6 +856,13 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,

(void)copy_readers_update; // potentially unused

if(NULL == gpu_device->memory) {
if( PARSEC_SUCCESS != parsec_device_memory_reserve(gpu_device) ) {
parsec_warning("Could not allocate memory space on device %s", gpu_device->super.name);
return PARSEC_HOOK_RETURN_ERROR;
}
}

/**
* Parse all the input and output flows of data and ensure all have
* corresponding data on the GPU available.
Expand Down
10 changes: 6 additions & 4 deletions parsec/mca/device/device_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,11 @@ struct parsec_device_gpu_module_s {
* is increased every time a new data is made available, so
* that we know which tasks can be evaluated for submission.
*/

int memory_percentage; /**< What % of the memory available on the device we want to use*/
int number_blocks; /**< In case memory_percentage is not set, how many blocks we want to allocate on the device */
size_t eltsize; /**< And what size in byte are these blocks */
Comment on lines +235 to +237
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indentation seems off by 1

Suggested change
int memory_percentage; /**< What % of the memory available on the device we want to use*/
int number_blocks; /**< In case memory_percentage is not set, how many blocks we want to allocate on the device */
size_t eltsize; /**< And what size in byte are these blocks */
int memory_percentage; /**< What % of the memory available on the device we want to use*/
int number_blocks; /**< In case memory_percentage is not set, how many blocks we want to allocate on the device */
size_t eltsize; /**< And what size in byte are these blocks */


parsec_list_t gpu_mem_lru; /* Read-only blocks, and fresh blocks */
parsec_list_t gpu_mem_owned_lru; /* Dirty blocks */
parsec_fifo_t pending;
Expand Down Expand Up @@ -334,10 +339,7 @@ typedef struct {

#endif /* defined(PROFILING) */

int parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
int memory_percentage,
int number_blocks,
size_t eltsize );
int parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device );
int parsec_device_attach( parsec_device_module_t* device, parsec_context_t* context );
int parsec_device_detach( parsec_device_module_t* device, parsec_context_t* context );
int parsec_device_taskpool_register(parsec_device_module_t* device, parsec_taskpool_t* tp);
Expand Down
10 changes: 4 additions & 6 deletions parsec/mca/device/level_zero/device_level_zero_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -425,12 +425,10 @@ int parsec_level_zero_module_init( int dev_id, parsec_device_level_zero_driver_t
gpu_device->memory_free = parsec_level_zero_memory_free;
gpu_device->find_incarnation = parsec_level_zero_find_incarnation;

if( PARSEC_SUCCESS != parsec_device_memory_reserve(gpu_device,
parsec_level_zero_memory_percentage,
parsec_level_zero_memory_number_of_blocks,
parsec_level_zero_memory_block_size) ) {
goto release_device;
}
/* Device memory initialization is delayed until first device use */
gpu_device->memory_percentage = parsec_level_zero_memory_percentage;
gpu_device->number_blocks = parsec_level_zero_memory_number_of_blocks;
gpu_device->eltsize = parsec_level_zero_memory_block_size;

if( show_caps ) {
parsec_inform("LEVEL ZERO GPU Device %d: %s\n"
Expand Down
3 changes: 3 additions & 0 deletions parsec/scheduling.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,11 @@ int __parsec_execute( parsec_execution_stream_t* es,
task->status = PARSEC_TASK_STATUS_COMPLETE;
if(PARSEC_DEV_RECURSIVE >= tc->incarnations[chore_id].type) {
/* accelerators count their own executed tasks */
#if 0
/* This is wrong: chore_id is the index in incarnations, but it's not the device id */
parsec_device_module_t *dev = parsec_mca_device_get(chore_id);
parsec_atomic_fetch_inc_int64((int64_t*)&dev->executed_tasks);
#endif
Comment on lines +189 to +193
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is that disabled?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because in __parsec_execute (scheduling.c:138-141) we do

   /* Find first bit in chore_mask that is not 0 */
    for(chore_id = 0; NULL != tc->incarnations[chore_id].hook; chore_id++)
        if( 0 != (task->chore_mask & (1<<chore_id)) )
            break;

The way I understand this, this finds the first TYPE of incarnation that we want to execute. If I have X CPUs, Y NVIDIA cards and Z Intel cards, incarnations can hold 3 entries, in any order, not X+Y+Z entries.
Then, once we have chosen the type, the evaluate can decide to skip the type, and then the hook can call get_best_device() to chose which device between the Y NVIDIA cards that are available.
BUT
Later in the file, at line 189, we do

parsec_device_module_t *dev = parsec_mca_device_get(chore_id);
parsec_atomic_fetch_inc_int64((int64_t*)&dev->executed_tasks);

I think this is erroneous, and I think we don't have the device id at this time, it's lost within hook (which calls parsec_get_best_device().

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This counts only CPU and RECURSIVE, not GPUs (it's >=, not <=). The GPU accounting is done separately in device_gpu.c

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. This should be fixed in PR #616 . If you approve PR #616 and we merge it, I'll rebase and remove that part.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see #616 that fixes the underlying problem

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This became incorrect because DTD is allowed to register the chores in any order. If you replace chore_id with tc->incarnations[chore_id].type the problem is solved.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I merged 616, this will need rebasing

}
}
/* Record EXEC_END event only for incarnation that succeeds */
Expand Down