Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ option(PARSEC_DIST_WITH_MPI
if(PARSEC_DIST_WITH_MPI AND 0)
message(FATAL_ERROR "PARSEC_DIST_WITH_MPI and PARSEC_DIST_WITH_OTHER are mutually exclusive, please select only one")
endif()
option(PARSEC_MPI_IS_GPU_AWARE
"Build PaRSEC assuming the MPI library is GPU-aware, aka. can move data directly to and from GPU memory.\
As of today (mid 2024) while most MPI support such an option, they require a single process per GPU" ON)
option(PARSEC_DIST_THREAD
"Use an extra thread to progress the data movements" ON)
option(PARSEC_DIST_PRIORITIES
Expand Down
8 changes: 6 additions & 2 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ cat <<EOF
use the MPI communication library [installed in DIR] (default=autodetect)
--enable-collectives
use asynchronous dataflow collective communication

--enable-mpi-gpu-aware
assume the MPI communication library can send/receive from GPU data buffers directly

--with-cuda[=DIR]
use the CUDA accelerator libray [installed in DIR] (default=autodetect)
Expand Down Expand Up @@ -238,6 +239,8 @@ while [ "x$1" != x ]; do
--without-mpi) with_mpi=no; shift;;
--enable-collectives) enable_collectives=yes; shift;;
--disable-collectives) enable_collectives=no; shift;;
--enable-mpi-gpu-aware) enable_mpi_gpu_aware=yes; shift;;
--disable-mpi-gpu-aware) enable_mpi_gpu_aware=no; shift;;

# Hwloc options
--with-hwloc=*) with_hwloc="${1#*=}"; shift;;
Expand Down Expand Up @@ -527,7 +530,8 @@ x) ;;
esac
[ x$enable_collectives = xyes ] && CMAKE_DEFINES+=" -DPARSEC_DIST_COLLECTIVES=ON"
[ x$enable_collectives = xno ] && CMAKE_DEFINES+=" -DPARSEC_DIST_COLLECTIVES=OFF"

[ x$enable_mpi_gpu_aware = xyes ] && CMAKE_DEFINES+=" -DPARSEC_MPI_IS_GPU_AWARE=ON"
[ x$enable_mpi_gpu_aware = xno ] && CMAKE_DEFINES+=" -DPARSEC_MPI_IS_GPU_AWARE=OFF"

case x$with_cuda in
xno) CMAKE_DEFINES+=" -DPARSEC_GPU_WITH_CUDA=OFF";;
Expand Down
1 change: 1 addition & 0 deletions parsec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ if( BUILD_PARSEC )
$<$<BOOL:${PARSEC_HAVE_OTF2}>:OTF2::OTF2>
$<$<BOOL:${MPI_C_FOUND}>:MPI::MPI_C>
$<$<BOOL:${PARSEC_HAVE_CUDA}>:CUDA::cudart>
$<$<BOOL:${PARSEC_HAVE_CUDA}>:cuda>
$<$<BOOL:${PARSEC_HAVE_HIP}>:hip::host>
${EXTRA_LIBS}
INTERFACE
Expand Down
123 changes: 99 additions & 24 deletions parsec/arena.c
Original file line number Diff line number Diff line change
Expand Up @@ -235,43 +235,118 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
return PARSEC_SUCCESS;
}

parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt)
#include "parsec/utils/zone_malloc.h"
#include "mca/device/device_gpu.h"

#if defined(PARSEC_DEBUG)
static int64_t parsec_countable_incoming_message = 0xF000000000000000;
#endif /* defined(PARSEC_DEBUG) */

/**
 * @brief Create a new data copy for @p device, attached to @p data.
 *
 * For device 0 the copy is created with the ARENA flag and its memory is
 * obtained through parsec_arena_allocate_device_private(). For any other
 * device the memory is taken from the zone allocator of the corresponding
 * GPU module (gpu_device->memory), and the copy is created without the
 * ARENA flag.
 *
 * @param arena  arena providing the element size (and the storage for device 0).
 * @param data   data the new copy is attached to; if NULL a new data is
 *               created with parsec_data_new() and released again on failure.
 * @param count  number of elements of arena->elem_size bytes to allocate.
 * @param device index of the device that will hold the copy.
 * @param dtt    datatype recorded in the copy.
 *
 * @return the new data copy, or NULL if any allocation failed. On failure
 *         nothing allocated by this function is left behind.
 */
static inline parsec_data_copy_t *
parsec_arena_internal_copy_new(parsec_arena_t *arena,
                               parsec_data_t *data,
                               size_t count, int device,
                               parsec_datatype_t dtt)
{
    parsec_data_copy_t *copy = NULL;
    parsec_data_t *ldata = data;
    parsec_device_gpu_module_t *gpu_device = NULL;
    void *device_private = NULL;
    size_t size = 0;

    if( NULL == data ) {
        ldata = parsec_data_new();
        if( NULL == ldata ) {
            return NULL;
        }
#if defined(PARSEC_DEBUG)
        /* Name the data with a default key to facilitate debugging */
        ldata->key = (uint64_t)parsec_atomic_fetch_inc_int64(&parsec_countable_incoming_message);
        ldata->key |= ((uint64_t)device) << 56;
#endif  /* defined(PARSEC_DEBUG) */
    }
    if( 0 == device ) {
        copy = parsec_data_copy_new(ldata, device, dtt,
                                    PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED | PARSEC_DATA_FLAG_ARENA);
        if (NULL == copy) {
            goto free_and_return;
        }
        int rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
        if (PARSEC_SUCCESS != rc) {
            goto free_and_return;
        }
        return copy;
    }
    /**
     * This part is not really nice, it breaks the separation between devices, and how their memory is
     * managed. But, it should give nice performance improvements if the communication layer is
     * capable of sending or receiving data directly to and from the accelerator memory. The only drawback
     * is that once the GPU memory is full, this will fail, so the software will fall back to the
     * prior behavior, going through the CPU memory.
     *
     * The zone deallocation is not symmetric, it will happen in the GPU management, when the data copies
     * are released from the different LRU lists.
     */
    gpu_device = (parsec_device_gpu_module_t *)parsec_mca_device_get(device);
    if (NULL == gpu_device) {
        /* Go through the common cleanup so a locally created data is not leaked */
        goto free_and_return;
    }
    size = count * arena->elem_size;
    device_private = zone_malloc(gpu_device->memory, size);
    if( NULL == device_private ) {
        /* No copy exists yet: report the zone itself, never dereference copy here */
        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed (out of memory)\n",
                             device, size, (void *)gpu_device->memory);
        goto free_and_return;
    }
    copy = parsec_data_copy_new(ldata, device, dtt,
                                PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED);
    if (NULL == copy) {
        /* copy is NULL on this path: report the zone itself */
        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed to allocate copy (out of memory)\n",
                             device, size, (void *)gpu_device->memory);
        zone_free(gpu_device->memory, device_private);
        goto free_and_return;
    }
    copy->dtt = dtt;
    copy->device_private = device_private;
    /* The zone the memory came from is stored in place of an arena chunk */
    copy->arena_chunk = (parsec_arena_chunk_t*)gpu_device->memory;
    PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p, "
                         "data ptr %p",
                         device, size, (void*)copy->arena_chunk, (void*)copy->device_private);
    copy->version = 0;
    copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
    copy->original->owner_device = device;
    copy->original->preferred_device = device;
    return copy;
  free_and_return:
    if( NULL != copy )
        PARSEC_OBJ_RELEASE(copy);
    if( NULL == data)
        PARSEC_OBJ_RELEASE(ldata);  /* release the locally allocated data */
    return NULL;
}

copy = parsec_data_copy_new( data, device, dtt,
PARSEC_DATA_FLAG_ARENA |
PARSEC_DATA_FLAG_PARSEC_OWNED |
PARSEC_DATA_FLAG_PARSEC_MANAGED);
/**
 * @brief Create a new data (not exposed to the caller) and return a data
 *        copy of @p count elements for it, preferably on @p device.
 *
 * A copy is always created on device 0 first. When @p device is not 0, a
 * second copy attached to the same data is requested on @p device; if that
 * allocation fails, the device 0 copy is returned instead.
 *
 * @param arena  arena used to allocate the copy.
 * @param count  number of elements to allocate.
 * @param device index of the device the copy should preferably live on.
 * @param dtt    datatype recorded in the copy.
 *
 * @return a data copy on @p device, or on device 0 as a fallback, or NULL
 *         when even the device 0 copy could not be allocated.
 */
parsec_data_copy_t *
parsec_arena_get_new_copy(parsec_arena_t *arena,
                          size_t count, int device,
                          parsec_datatype_t dtt)
{
    parsec_data_copy_t *dev0_copy, *copy;

    dev0_copy = parsec_arena_internal_copy_new(arena, NULL, count, 0 /* first allocate the copy on the device 0 */, dtt);
    if( NULL == dev0_copy ) {
        return NULL;
    }
    dev0_copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
    dev0_copy->version = 0;  /* start from somewhere */
    if( 0 == device ) {
        /* NOTE(review): unlike the path below, the reference on
         * dev0_copy->original is not released here - confirm this is intended.
         */
        return dev0_copy;
    }

    copy = parsec_arena_internal_copy_new(arena, dev0_copy->original, count, device, dtt);
    if( NULL == copy ) {
        copy = dev0_copy;  /* return the main memory data copy */
    }
    /* This data is going to be released once all copies are released
     * It does not exist without at least a copy, and we don't give the
     * pointer to the user, so we must remove our retain from it
     */
    PARSEC_OBJ_RELEASE(dev0_copy->original);
    return copy;
}

Expand Down
10 changes: 5 additions & 5 deletions parsec/arena.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,15 @@ int parsec_arena_construct_ex(parsec_arena_t* arena,
* enough resource to allocate a new data copy of this type.
*/

parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt);
parsec_data_copy_t *parsec_arena_get_new_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt);

/**
* @brief Allocates memory for a given data copy. This is a function used by
* DSLs to set the memory associated with a data copy they have created.
* It is also used by parsec_arena_get_copy.
*
* It is also used by parsec_arena_get_new_copy.
*
* @param copy the (empty) data copy to allocate memory for. NB: the @p original
* field of this data copy must be set. The operation overwrites the device
* dtt and count of this data copy, as well as the device_private pointer.
Expand Down
2 changes: 2 additions & 0 deletions parsec/class/info.c
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,8 @@ void *parsec_info_get(parsec_info_object_array_t *oa, parsec_info_id_t iid)
if(NULL == ie->constructor)
return ret;
nio = ie->constructor(oa->cons_obj, ie->cons_data);
if( NULL == nio )
return ret;
ret = parsec_info_test_and_set(oa, iid, nio, NULL);
if(ret != nio && NULL != ie->destructor) {
ie->destructor(nio, ie->des_data);
Expand Down
Loading
Loading