diff --git a/.github/CI/github_runner.yaml b/.github/CI/github_runner-cpu.yaml similarity index 100% rename from .github/CI/github_runner.yaml rename to .github/CI/github_runner-cpu.yaml diff --git a/.github/CI/github_runner-gpu_amd.yaml b/.github/CI/github_runner-gpu_amd.yaml new file mode 100644 index 000000000..839868359 --- /dev/null +++ b/.github/CI/github_runner-gpu_amd.yaml @@ -0,0 +1,36 @@ +spack: + definitions: + - pkgs: + - gcc@12.1.0 + - git + - patch + - flex + - bison + - hwloc + - unzip + - python@3 + - py-pip + - py-pandas + - py-matplotlib + - py-tables + - py-networkx + - py-cython + - py-wheel + - cmake + - ninja + - otf2@2.3 + - openmpi + - binutils@2.38+headers + - hip + + view: true + specs: + - matrix: + - [$pkgs] + packages: + binutils: + buildable: false + externals: + - spec: binutils@2.38 + prefix: /usr + diff --git a/.github/CI/github_runner-gpu_nvidia.yaml b/.github/CI/github_runner-gpu_nvidia.yaml new file mode 100644 index 000000000..b5c152b43 --- /dev/null +++ b/.github/CI/github_runner-gpu_nvidia.yaml @@ -0,0 +1,35 @@ +spack: + definitions: + - pkgs: + - gcc@12.1.0 + - git + - patch + - flex + - bison + - hwloc + - unzip + - python@3 + - py-pip + - py-pandas + - py-matplotlib + - py-tables + - py-networkx + - py-cython + - py-wheel + - cmake + - ninja + - otf2@2.3 + - openmpi + - cuda@12 + + view: true + specs: + - matrix: + - [$pkgs] + packages: + binutils: + buildable: false + externals: + - spec: binutils@2.38 + prefix: /usr + diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml index c6b292f12..aee4c5cab 100644 --- a/.github/workflows/build_cmake.yml +++ b/.github/workflows/build_cmake.yml @@ -19,18 +19,22 @@ env: jobs: debug: - runs-on: [self-hosted, Linux] strategy: fail-fast: false matrix: build_type : [ Debug ] shared_type : [ OFF, ON ] profiling : [ ON ] + device : [cpu, gpu_nvidia, gpu_amd] - name: "Type=${{ matrix.build_type }} shared=${{ matrix.shared_type }} profiling=${{matrix.profiling}}" + runs-on: ${{matrix.device}} + + name: "Type=${{ matrix.build_type }} device=${{matrix.device}} shared=${{ matrix.shared_type }} profiling=${{matrix.profiling}}" env: BUILD_DIRECTORY : "${{github.workspace}}/build/${{ matrix.build_type }}/shared_${{matrix.shared_type}}/profile_${{matrix.profiling}}" INSTALL_DIRECTORY : "${{github.workspace}}/install/${{ matrix.build_type }}/shared_${{matrix.shared_type}}/profile_${{matrix.profiling}}" + RUNNER_ENV : github_runner-${{matrix.device}} + DEVICE_ENV : ${{matrix.device}} BUILD_CONFIG : > -G Ninja -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} @@ -40,6 +44,7 @@ jobs: -DPARSEC_PROF_TRACE=${{ matrix.profiling }} -DMPIEXEC_PREFLAGS='--bind-to;none;--oversubscribe' -DCMAKE_INSTALL_PREFIX=$INSTALL_DIRECTORY + -DPARSEC_REQUIRE_DEVICE_TEST=${{matrix.device}} steps: - uses: actions/checkout@v2 @@ -104,18 +109,22 @@ jobs: path: ${{ env.BUILD_DIRECTORY }}/CMakeFiles/CMakeError.log release: needs: debug - runs-on: [self-hosted, Linux] strategy: fail-fast: false matrix: build_type : [ Release ] shared_type : [ ON ] profiling : [ OFF, ON ] + device : [cpu, gpu_nvidia, gpu_amd] + + runs-on: ${{matrix.device}} - name: "Type=${{ matrix.build_type }} shared=${{ matrix.shared_type }} profiling=${{matrix.profiling}}" + name: "Type=${{ matrix.build_type }} device=${{matrix.device}} shared=${{ matrix.shared_type }} profiling=${{matrix.profiling}}" env: BUILD_DIRECTORY : "${{github.workspace}}/build/${{ matrix.build_type }}/shared_${{matrix.shared_type}}/profile_${{matrix.profiling}}" INSTALL_DIRECTORY : 
"${{github.workspace}}/install/${{ matrix.build_type }}/shared_${{matrix.shared_type}}/profile_${{matrix.profiling}}" + RUNNER_ENV : github_runner-${{matrix.device}} + DEVICE_ENV : ${{matrix.device}} BUILD_CONFIG : > -G Ninja -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} @@ -124,6 +133,7 @@ jobs: -DPARSEC_PROF_TRACE=${{ matrix.profiling }} -DMPIEXEC_PREFLAGS='--bind-to;none;--oversubscribe' -DCMAKE_INSTALL_PREFIX=$INSTALL_DIRECTORY + -DPARSEC_REQUIRE_DEVICE_TEST=${{matrix.device}} steps: - uses: actions/checkout@v2 @@ -159,7 +169,14 @@ jobs: # The CMake binaries on the Github Actions machines are (as of this writing) 3.12 run: | source ${{github.workspace}}/.github/CI/spack_setup.sh - cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG + if [ "${{matrix.device}}" == "gpu_amd" ]; then + cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG -DPARSEC_GPU_WITH_HIP=ON -DPARSEC_GPU_WITH_CUDA=OFF + elif [ "${{matrix.device}}" == "gpu_nvidia" ]; then + cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG -DPARSEC_GPU_WITH_HIP=OFF -DPARSEC_GPU_WITH_CUDA=ON + else + cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG -DPARSEC_GPU_WITH_HIP=OFF -DPARSEC_GPU_WITH_CUDA=OFF + fi + - name: Build working-directory: ${{ env.BUILD_DIRECTORY }} diff --git a/CMakeLists.txt b/CMakeLists.txt index a0b0b823e..dcd12eb77 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,9 @@ if(POLICY CMP0098) # CMP0098: New in version 3.17, FindFLEX runs flex in directory CMAKE_CURRENT_BINARY_DIR when executing. cmake_policy(SET CMP0098 NEW) endif(POLICY CMP0098) +if(POLICY CMP0104 AND NOT CUDA_ARCHITECTURES) + set(CUDA_ARCHITECTURES OFF) +endif() set(CMAKE_NO_SYSTEM_FROM_IMPORTED True) # On OSX only find the Apple frameworks is nothing else is available. @@ -75,6 +78,11 @@ include(CTest) # ccmake tunable parameters ##### +# CTest related options +set(PARSEC_REQUIRE_DEVICE_TEST "NONE" CACHE STRING "Make tests fail if specified device support is disabled (default NONE, valid values are HIP or amd, CUDA or nvidia, or NONE or cpu). 
The intended use is to ensure that device tests are passed in CI, and avoid failing silently if there is no GPU on the target system.") +set_property(CACHE PARSEC_REQUIRE_DEVICE_TEST PROPERTY STRINGS "NONE" "HIP" "CUDA" "cpu" "gpu_amd" "gpu_nvidia") +mark_as_advanced(PARSEC_REQUIRE_DEVICE_TEST) + ## Check for the support of additional languages and capabilities option(SUPPORT_FORTRAN "Enable support for Fortran bindings (default ON)" ON) @@ -123,6 +131,9 @@ mark_as_advanced(BUILD_PARSEC) ### Misc options option(BUILD_SHARED_LIBS "Build shared libraries" ON) +if(BUILD_SHARED_LIBS) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) +endif(BUILD_SHARED_LIBS) option(BUILD_64bits "Build 64 bits mode" ON) if(NOT CMAKE_BUILD_TYPE) @@ -717,6 +728,7 @@ int main(int argc, char *argv[]) { if(CMAKE_CUDA_COMPILER) enable_language(CUDA) endif(CMAKE_CUDA_COMPILER) + cmake_pop_check_state() endif (CUDAToolkit_FOUND) set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files") endif( PARSEC_GPU_WITH_CUDA ) @@ -724,9 +736,8 @@ int main(int argc, char *argv[]) { if( PARSEC_GPU_WITH_HIP ) # This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents set(CMAKE_SYSTEM_PREFIX_PATH_save ${CMAKE_SYSTEM_PREFIX_PATH}) - list(APPEND CMAKE_SYSTEM_PREFIX_PATH /opt/rocm) - find_package(HIP 5 QUIET) #quiet because hip-config.cmake is not part of core-cmake and will spam a loud warning when hip/rocm is not installed - set(CMAKE_SYSTEM_PREFIX_PATH ${CMAKE_SYSTEM_PREFIX_PATH_save}) + list(APPEND CMAKE_SYSTEM_PREFIX_PATH $ENV{ROCM_PATH}/lib/cmake) + find_package(HIP QUIET) #quiet because hip-config.cmake is not part of core-cmake and will spam a loud warning when hip/rocm is not installed if(HIP_FOUND AND PARSEC_HAVE_CUDA) # the underlying reason is that the generated ptg code cannot include at the same time # cuda_runtime.h and hip_runtime.h, so we need to modify the dev_cuda.h to not expose any @@ -738,6 +749,8 @@ int main(int argc, char *argv[]) { get_target_property(extra_hip_libs hip::host INTERFACE_LINK_LIBRARIES) list(APPEND EXTRA_LIBS ${extra_hip_libs}) set(HIP_NOT_CUDA_FOUND TRUE) + enable_language(HIP) + set(CMAKE_SYSTEM_PREFIX_PATH ${CMAKE_SYSTEM_PREFIX_PATH_save}) else() set(HIP_NOT_CUDA_FOUND FALSE) endif() @@ -747,8 +760,8 @@ int main(int argc, char *argv[]) { if( PARSEC_GPU_WITH_LEVEL_ZERO ) find_package(level-zero) find_package(DPCPP) - set(PARSEC_HAVE_LEVEL_ZERO ${LEVEL_ZERO_FOUND} CACHE BOOL "True if PaRSEC provide support for Intel level-zero") if (LEVEL_ZERO_FOUND AND PARSEC_HAVE_DPCPP) + set(PARSEC_HAVE_LEVEL_ZERO ${LEVEL_ZERO_FOUND} CACHE BOOL "True if PaRSEC provide support for Intel level-zero") include_directories("${LEVEL_ZERO_INCLUDE_DIR}/level_zero/") set(PARSEC_HAVE_LEVEL_ZERO ${LEVEL_ZERO_FOUND} CACHE BOOL "True if PaRSEC provide support for Intel Level Zero") message(STATUS "Found Intel level-zero ${LEVEL_ZERO_VERSION} in -I${LEVEL_ZERO_INCLUDE_DIR} / -L${LEVEL_ZERO_LIBRARY_DIR}") @@ -939,6 +952,7 @@ add_subdirectory(parsec) # Add dependency to Level-Zero if it is enabled # if(PARSEC_HAVE_LEVEL_ZERO) + message(STATUS "parsec depends on ze_loader") target_link_libraries(parsec PRIVATE level_zero::ze_loader) endif(PARSEC_HAVE_LEVEL_ZERO) diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index 2f0c134aa..eb68be4c1 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -41,6 +41,9 @@ #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) #include 
"parsec/mca/device/cuda/device_cuda.h" #endif /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */ +#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) +#include "parsec/mca/device/hip/device_hip.h" +#endif /* defined(PARSEC_HAVE_DEV_HIP_SUPPORT) */ #include "parsec/mca/mca_repository.h" #include "parsec/constants.h" @@ -1491,9 +1494,8 @@ parsec_dtd_startup(parsec_context_t *context, parsec_device_module_t *device = parsec_mca_device_get(_i); if( NULL == device ) continue; if( !(tp->devices_index_mask & (1 << device->device_index))) continue; /* not supported */ - // If CUDA is enabled, let the CUDA device activated for this - // taskpool. - if( PARSEC_DEV_CUDA == device->type ) continue; + // If a GPU is enabled, let the device be activated for this taskpool. + if( PARSEC_DEV_IS_GPU(device->type) ) continue; if( NULL != device->taskpool_register ) if( PARSEC_SUCCESS != device->taskpool_register(device, (parsec_taskpool_t *)tp)) { @@ -2327,7 +2329,7 @@ static parsec_hook_return_t parsec_dtd_gpu_task_submit(parsec_execution_stream_t } parsec_device_module_t *device = parsec_mca_device_get(dev_index); - assert(NULL != device); + assert(NULL != device); /* We already know the device is a GPU device from the test above */ gpu_task->stage_in = parsec_default_gpu_stage_in; gpu_task->stage_out = parsec_default_gpu_stage_out; @@ -2400,7 +2402,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, } incarnations[i].type = device_type; - if(PARSEC_DEV_CUDA == device_type) { + if(PARSEC_DEV_IS_GPU(device_type)) { incarnations[i].hook = parsec_dtd_gpu_task_submit; dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function; } @@ -2998,11 +3000,11 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) FLOW_OF(last_user.task, last_user.flow_index)->flags &= ~RELEASE_OWNERSHIP_SPECIAL; if( this_task->super.data[flow_index].data_in != NULL) { -/* #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */ +/* #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT) */ /* parsec_atomic_lock(&this_task->super.data[flow_index].data_in->original->lock); */ /* #endif */ (void)parsec_atomic_fetch_dec_int32(&this_task->super.data[flow_index].data_in->readers); -/* #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */ +/* #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT) */ /* parsec_atomic_unlock(&this_task->super.data[flow_index].data_in->original->lock); */ /* #endif */ } @@ -3287,8 +3289,8 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp, __parsec_chore_t **incarnations = (__parsec_chore_t **)&tc->incarnations; (*incarnations)[0].type = device_type; - if( device_type == PARSEC_DEV_CUDA ) { - /* Special case for CUDA: we need an intermediate */ + if( PARSEC_DEV_IS_GPU(device_type) ) { + /* Special case for GPUs: we need an intermediate */ (*incarnations)[0].hook = parsec_dtd_gpu_task_submit; dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)fpointer; } diff --git a/parsec/interfaces/dtd/insert_function_internal.h b/parsec/interfaces/dtd/insert_function_internal.h index b59b63452..7a5ba0429 100644 --- a/parsec/interfaces/dtd/insert_function_internal.h +++ b/parsec/interfaces/dtd/insert_function_internal.h @@ -21,10 +21,6 @@ #include "parsec/execution_stream.h" #include "parsec/mca/device/device_gpu.h" -#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) -#include "parsec/mca/device/cuda/device_cuda.h" -#endif /* PARSEC_HAVE_DEV_CUDA_SUPPORT */ - BEGIN_C_DECLS #define PARSEC_DTD_NB_TASK_CLASSES 25 /*< Max number of task classes allowed */ diff --git 
a/tests/dsl/dtd/dtd_test_cuda_task_insert.c b/tests/dsl/dtd/dtd_test_cuda_task_insert.c
index 9b689db73..2cea48fca 100644
--- a/tests/dsl/dtd/dtd_test_cuda_task_insert.c
+++ b/tests/dsl/dtd/dtd_test_cuda_task_insert.c
@@ -4,6 +4,7 @@
 #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
 #include "parsec/interfaces/dtd/insert_function_internal.h"
 #include "tests/tests_data.h"
+#include "parsec/mca/device/cuda/device_cuda_internal.h"
 
 #if defined(PARSEC_HAVE_MPI)
 #include <mpi.h>
diff --git a/tests/dsl/dtd/dtd_test_new_tile.c b/tests/dsl/dtd/dtd_test_new_tile.c
index 36f6b99da..f4a869925 100644
--- a/tests/dsl/dtd/dtd_test_new_tile.c
+++ b/tests/dsl/dtd/dtd_test_new_tile.c
@@ -9,6 +9,9 @@
 #include "tests/tests_timing.h"
 #include "parsec/interfaces/dtd/insert_function_internal.h"
 #include "parsec/utils/debug.h"
+#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
+#include "parsec/mca/device/cuda/device_cuda_internal.h"
+#endif
 
 #if defined(PARSEC_HAVE_STRING_H)
 #include <string.h>
diff --git a/tests/runtime/CMakeLists.txt b/tests/runtime/CMakeLists.txt
index 563704f90..3431a4e13 100644
--- a/tests/runtime/CMakeLists.txt
+++ b/tests/runtime/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_subdirectory(scheduling)
-add_Subdirectory(cuda)
+add_subdirectory(gpu)
 
 if( MPI_C_FOUND )
   parsec_addtest_executable(C multichain)
diff --git a/tests/runtime/Testings.cmake b/tests/runtime/Testings.cmake
index bec60bba4..b222ec793 100644
--- a/tests/runtime/Testings.cmake
+++ b/tests/runtime/Testings.cmake
@@ -1,2 +1,2 @@
 include(runtime/scheduling/Testings.cmake)
-include(runtime/cuda/Testings.cmake)
+include(runtime/gpu/Testings.cmake)
diff --git a/tests/runtime/cuda/Testings.cmake b/tests/runtime/cuda/Testings.cmake
deleted file mode 100644
index 231eaed3e..000000000
--- a/tests/runtime/cuda/Testings.cmake
+++ /dev/null
@@ -1,13 +0,0 @@
-# commented out tests disabled because they cause CI to fail. They should be re-enabled when underlying issue identified.
-if(PARSEC_HAVE_CUDA) - # parsec_addtest_cmd(runtime/cuda/get_best_device:gpu ${SHM_TEST_CMD_LIST} ${CTEST_CUDA_LAUNCHER_OPTIONS} runtime/cuda/testing_get_best_device -N 400 -t 20 -g 4) - if(TARGET nvlink) - parsec_addtest_cmd(runtime/cuda/nvlink:gpu ${SHM_TEST_CMD_LIST} ${CTEST_CUDA_LAUNCHER_OPTIONS} runtime/cuda/nvlink --mca device_cuda_enable 2) - endif() - if(TARGET stress) - # parsec_addtest_cmd(runtime/cuda/stress:gpu ${SHM_TEST_CMD_LIST} ${CTEST_CUDA_LAUNCHER_OPTIONS} runtime/cuda/stress) - endif() - if(TARGET stage) - # parsec_addtest_cmd(runtime/cuda/stage:gpu ${SHM_TEST_CMD_LIST} ${CTEST_CUDA_LAUNCHER_OPTIONS} runtime/cuda/stage) - endif() -endif() diff --git a/tests/runtime/cuda/CMakeLists.txt b/tests/runtime/gpu/CMakeLists.txt similarity index 64% rename from tests/runtime/cuda/CMakeLists.txt rename to tests/runtime/gpu/CMakeLists.txt index fbb5a5022..6c76a3178 100644 --- a/tests/runtime/cuda/CMakeLists.txt +++ b/tests/runtime/gpu/CMakeLists.txt @@ -25,4 +25,24 @@ if(PARSEC_HAVE_CUDA) parsec_addtest_executable(C testing_get_best_device SOURCES "testing_get_best_device.c") target_include_directories(testing_get_best_device PRIVATE $<$:${CMAKE_CURRENT_SOURCE_DIR}>) target_ptg_sources(testing_get_best_device PRIVATE "get_best_device_check.jdf") + + if(CMAKE_CUDA_COMPILER) + set_source_files_properties(ping_kernel.cu PROPERTIES LANGUAGE CUDA) + parsec_addtest_executable(C dtd_pingpong SOURCES dtd_pingpong.c) + target_sources(dtd_pingpong PRIVATE ping_kernel.cu) + + parsec_addtest_executable(C ptg_pingpong SOURCES ping_kernel.cu) + target_ptg_sources(ptg_pingpong PRIVATE "ptg_pingpong.jdf") + endif(CMAKE_CUDA_COMPILER) endif(PARSEC_HAVE_CUDA) + +if(PARSEC_HAVE_HIP) + if(CMAKE_HIP_COMPILER) + include(ParsecCompilePTG) + set_source_files_properties(ping_kernel.hip.c PROPERTIES LANGUAGE HIP) + parsec_addtest_executable(C dtd_pingpong SOURCES dtd_pingpong.c ping_kernel.hip.c) + + parsec_addtest_executable(C ptg_pingpong SOURCES ping_kernel.hip.c) + target_ptg_sources(ptg_pingpong PRIVATE "ptg_pingpong.jdf") + endif(CMAKE_HIP_COMPILER) +endif(PARSEC_HAVE_HIP) diff --git a/tests/runtime/gpu/Testings.cmake b/tests/runtime/gpu/Testings.cmake new file mode 100644 index 000000000..5b26efba2 --- /dev/null +++ b/tests/runtime/gpu/Testings.cmake @@ -0,0 +1,61 @@ +foreach(_UDEV "HIP" "CUDA") + string(TOLOWER ${_UDEV} _LDEV) + # This is for CI convenience only: github runners are called gpu_nvidia and gpu_amd, and we want to re-use those names + # to simplify writing the build_file.yml. We use those as aliases for possible PARSEC_REQUIRE_DEVICE_TEST + if("${_UDEV}" STREQUAL "CUDA") + set(_GHR_DEVNAME "gpu_nvidia") + endif() + if("${_UDEV}" STREQUAL "HIP") + set(_GHR_DEVNAME "gpu_amd") + endif() + if("${_UDEV}" STREQUAL "NONE") + set(_GHR_DEVNAME "cpu") + endif() + + if("${PARSEC_REQUIRE_DEVICE_TEST}" STREQUAL "${_UDEV}" OR "${PARSEC_REQUIRE_DEVICE_TEST}" STREQUAL "${_GHR_DEVNAME}") + # If we require testing CUDA or HIP, we force to fail if the target device was not detected + if(NOT PARSEC_HAVE_${_UDEV}) + add_test(NAME runtime/gpu/device_support:${_LDEV} COMMAND false) + else(NOT PARSEC_HAVE_${_UDEV}) + add_test(NAME runtime/gpu/device_support:${_LDEV} COMMAND true) + endif(NOT PARSEC_HAVE_${_UDEV}) + set_property(TEST runtime/gpu/device_support:${_LDEV} PROPERTY FIXTURES_SETUP have_${_LDEV}_support) + endif("${PARSEC_REQUIRE_DEVICE_TEST}" STREQUAL "${_UDEV}" OR "${PARSEC_REQUIRE_DEVICE_TEST}" STREQUAL "${_GHR_DEVNAME}") + + # commented out tests disabled because they cause CI to fail. 
They should be re-enabled when the underlying issue is identified.
+  # We sanity-check that we're running on a machine that has at least 1 GPU, and then try to run the tests with up to 8 GPUs / process
+  # The test itself is supposed to adapt to any number between 1 and 8, and fail if the actual number of GPUs does not allow
+  # testing the targeted feature.
+  if(PARSEC_HAVE_${_UDEV})
+    # If we are required to test ${_UDEV} and we run on a machine without any ${_UDEV} device available, this test will fail.
+    # To reduce CI time, no other ${_LDEV} test will be run after this fails.
+    add_test(NAME runtime/gpu/device_present:${_LDEV} COMMAND ${SHM_TEST_CMD_LIST} ${CTEST_${_UDEV}_LAUNCHER_OPTIONS} sh ${PROJECT_SOURCE_DIR}/tests/runtime/gpu/check_nb_devices.sh ${_UDEV} 1)
+    set_property(TEST runtime/gpu/device_present:${_LDEV} PROPERTY FIXTURES_REQUIRED have_${_LDEV}_support)
+    set_property(TEST runtime/gpu/device_present:${_LDEV} PROPERTY FIXTURES_SETUP tester_has_${_LDEV}_device)
+
+    if(TARGET get_best_device)
+      # parsec_addtest_cmd(NAME runtime/gpu/get_best_device:${_LDEV} COMMAND ${SHM_TEST_CMD_LIST} ${CTEST_${_UDEV}_LAUNCHER_OPTIONS} runtime/gpu/testing_get_best_device -N 400 -t 20 -g 4)
+      # set_property(TEST runtime/gpu/get_best_device:${_LDEV} PROPERTY FIXTURES_REQUIRED tester_has_${_LDEV}_device)
+    endif(TARGET get_best_device)
+    if(TARGET dtd_pingpong)
+      parsec_addtest_cmd(runtime/gpu/dtd_pingpong:${_LDEV} COMMAND ${SHM_TEST_CMD_LIST} ${CTEST_${_UDEV}_LAUNCHER_OPTIONS} runtime/gpu/dtd_pingpong --mca device_${_LDEV}_enable 8 --mca device ${_LDEV})
+      set_property(TEST runtime/gpu/dtd_pingpong:${_LDEV} PROPERTY FIXTURES_REQUIRED tester_has_${_LDEV}_device)
+    endif(TARGET dtd_pingpong)
+    if(TARGET ptg_pingpong)
+      parsec_addtest_cmd(runtime/gpu/ptg_pingpong:${_LDEV} COMMAND ${SHM_TEST_CMD_LIST} ${CTEST_${_UDEV}_LAUNCHER_OPTIONS} runtime/gpu/ptg_pingpong --mca device_${_LDEV}_enable 8 --mca device ${_LDEV})
+      set_property(TEST runtime/gpu/ptg_pingpong:${_LDEV} PROPERTY FIXTURES_REQUIRED tester_has_${_LDEV}_device)
+    endif(TARGET ptg_pingpong)
+    if(TARGET stress)
+      # parsec_addtest_cmd(runtime/gpu/stress:${_LDEV} ${SHM_TEST_CMD_LIST} ${CTEST_${_UDEV}_LAUNCHER_OPTIONS} runtime/gpu/stress)
+      # set_property(TEST runtime/gpu/stress:${_LDEV} PROPERTY FIXTURES_REQUIRED tester_has_${_LDEV}_device)
+    endif()
+    if(TARGET stage)
+      # parsec_addtest_cmd(runtime/gpu/stage:${_LDEV} ${SHM_TEST_CMD_LIST} ${CTEST_${_UDEV}_LAUNCHER_OPTIONS} runtime/gpu/stage)
+      # set_property(TEST runtime/gpu/stage:${_LDEV} PROPERTY FIXTURES_REQUIRED tester_has_${_LDEV}_device)
+    endif()
+    if(TARGET nvlink)
+      parsec_addtest_cmd(runtime/gpu/nvlink:${_LDEV} ${SHM_TEST_CMD_LIST} ${CTEST_${_UDEV}_LAUNCHER_OPTIONS} runtime/gpu/nvlink --mca device_${_LDEV}_enable 8 --mca device ${_LDEV})
+      set_property(TEST runtime/gpu/nvlink:${_LDEV} PROPERTY FIXTURES_REQUIRED tester_has_${_LDEV}_device)
+    endif()
+  endif()
+endforeach()
diff --git a/tests/runtime/gpu/check_nb_devices.sh b/tests/runtime/gpu/check_nb_devices.sh
new file mode 100755
index 000000000..7bdc48b8e
--- /dev/null
+++ b/tests/runtime/gpu/check_nb_devices.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+if [ "$1" = "CUDA" ]; then
+    NB=$(nvidia-smi -L | wc -l)
+    if [ "$NB" -ge "$2" ]; then
+        exit 0
+    fi
+fi
+
+if [ "$1" = "HIP" ]; then
+    NB=$(rocm-smi --showuniqueid | grep 'Unique ID:' | wc -l)
+    if [ "$NB" -ge "$2" ]; then
+        exit 0
+    fi
+fi
+
+exit 1
diff --git a/tests/runtime/cuda/cuda_test_internal.h b/tests/runtime/gpu/cuda_test_internal.h
similarity index 100%
rename from tests/runtime/cuda/cuda_test_internal.h
rename to tests/runtime/gpu/cuda_test_internal.h
diff --git a/tests/runtime/gpu/dtd_pingpong.c b/tests/runtime/gpu/dtd_pingpong.c
new file mode 100644
index 000000000..25dcae1a0
--- /dev/null
+++ b/tests/runtime/gpu/dtd_pingpong.c
@@ -0,0 +1,292 @@
+/* parsec things */
+#include "parsec/runtime.h"
+
+/* system and io */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tests/tests_data.h"
+#include "tests/tests_timing.h"
+#include "parsec/interfaces/dtd/insert_function_internal.h"
+#include "parsec/utils/debug.h"
+
+#if defined(PARSEC_HAVE_STRING_H)
+#include <string.h>
+#endif /* defined(PARSEC_HAVE_STRING_H) */
+
+#if defined(PARSEC_HAVE_MPI)
+#include <mpi.h>
+#endif /* defined(PARSEC_HAVE_MPI) */
+
+static int TILE_FULL;
+static int32_t nb_errors = 0;
+static int verbose = 0;
+
+#define NLOOP 8
+
+int cpu_ping( parsec_execution_stream_t *es,
+              parsec_task_t *this_task )
+{
+    (void)es;
+    int *data;
+    int rank;
+    int nb;
+    parsec_dtd_unpack_args(this_task, &rank, &data, &nb);
+
+    if(verbose)
+        fprintf(stderr, "cpu_ping(): on CPU of MPI rank %d data of size %d\n", es->virtual_process->parsec_context->my_rank, nb);
+
+    for(int idx = 0; idx < nb; idx ++)
+        data[idx] = idx;
+
+    return PARSEC_HOOK_RETURN_DONE;
+}
+
+int cpu_pong( parsec_execution_stream_t *es,
+              parsec_task_t *this_task )
+{
+    (void)es;
+    int *data;
+    int rank, idx;
+    parsec_dtd_unpack_args(this_task, &rank, &data, &idx);
+
+    if(verbose)
+        fprintf(stderr, "cpu_pong(%d): on CPU of MPI rank %d\n", idx, es->virtual_process->parsec_context->my_rank);
+
+    data[idx] += idx;
+
+    return PARSEC_HOOK_RETURN_DONE;
+}
+
+#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
+extern void cuda_pong_kernel(int *dev_data, int idx);
+
+int cuda_pong(parsec_device_gpu_module_t *gpu_device,
+              parsec_gpu_task_t *gpu_task,
+              parsec_gpu_exec_stream_t *gpu_stream)
+{
+    (void)gpu_device;
+    (void)gpu_stream;
+
+    int *data;
+    void *dev_data;
+    int rank, idx;
+
+    parsec_task_t *this_task = gpu_task->ec;
+    parsec_dtd_unpack_args(this_task, &rank, &data, &idx);
+
+    if(verbose)
+        fprintf(stderr, "gpu_pong(%d): on GPU %s of MPI rank %d\n", idx, gpu_device->super.name, this_task->taskpool->context->my_rank);
+
+    dev_data = parsec_dtd_get_dev_ptr(this_task, 0);
+
+    cuda_pong_kernel(dev_data, idx);
+
+    return PARSEC_HOOK_RETURN_DONE;
+}
+#endif
+
+#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
+extern void hip_pong_kernel(int *dev_data, int idx);
+
+int hip_pong(parsec_device_gpu_module_t *gpu_device,
+             parsec_gpu_task_t *gpu_task,
+             parsec_gpu_exec_stream_t *gpu_stream)
+{
+    (void)gpu_device;
+    (void)gpu_stream;
+
+    int *data;
+    void *dev_data;
+    int rank, idx;
+
+    parsec_task_t *this_task = gpu_task->ec;
+    parsec_dtd_unpack_args(this_task, &rank, &data, &idx);
+
+    if(verbose)
+        fprintf(stderr, "hip_pong(%d): on GPU %s of MPI rank %d\n", idx, gpu_device->super.name, this_task->taskpool->context->my_rank);
+
+    dev_data = parsec_dtd_get_dev_ptr(this_task, 0);
+
+    hip_pong_kernel(dev_data, idx);
+
+    return PARSEC_HOOK_RETURN_DONE;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+    parsec_context_t* parsec;
+    int rank, world, cores = -1;
+    int nb, rc, nb_gpus = 0;
+    parsec_arena_datatype_t *adt;
+    parsec_device_module_t **gpu_devices = NULL;
+
+#if defined(PARSEC_HAVE_MPI)
+    {
+        int provided;
+        MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+        if(MPI_THREAD_MULTIPLE > provided) {
+            parsec_fatal( "This benchmark requires MPI_THREAD_MULTIPLE because it uses simultaneously MPI within the PaRSEC runtime, and in the main program loop");
+        }
+    }
+
+
MPI_Comm_size(MPI_COMM_WORLD, &world); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#else + world = 1; + rank = 0; +#endif + + parsec = parsec_init( cores, &argc, &argv ); +#if defined(PARSEC_PROF_TRACE) + parsec_profiling_start(); +#endif + + for(unsigned int i = 0; i < parsec_nb_devices; i++) { + parsec_device_module_t *dev = parsec_mca_device_get(i); +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + if( dev->type == PARSEC_DEV_CUDA ) + nb_gpus++; +#endif +#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) + if( dev->type == PARSEC_DEV_HIP ) + nb_gpus++; +#endif + } + if(nb_gpus > 0) { + gpu_devices = (parsec_device_module_t **)malloc(sizeof(parsec_device_module_t*)*nb_gpus); + nb_gpus = 0; + for(unsigned int i = 0; i < parsec_nb_devices; i++) { + parsec_device_module_t *dev = parsec_mca_device_get(i); +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + if( dev->type == PARSEC_DEV_CUDA) { + gpu_devices[nb_gpus] = dev; + nb_gpus++; + } +#endif +#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) + if( dev->type == PARSEC_DEV_HIP) { + gpu_devices[nb_gpus] = dev; + nb_gpus++; + } +#endif + } + } else { + if(0 == rank) { + fprintf(stderr, "Warning: test disabled because there is no GPU detected with this run\n"); + } + parsec_fini(&parsec); + MPI_Finalize(); + return EXIT_SUCCESS; /* So that useless tests don't make the CI fail */ + } + + nb = 3 * NLOOP * world * nb_gpus; + + parsec_taskpool_t *dtd_tp = parsec_dtd_taskpool_new(); + + adt = parsec_dtd_create_arena_datatype(parsec, &TILE_FULL); + parsec_add2arena( adt, parsec_datatype_int32_t, PARSEC_MATRIX_FULL, 0, + nb, 1, nb, PARSEC_ARENA_ALIGNMENT_SSE, -1); + + /* Registering the dtd_handle with PARSEC context */ + rc = parsec_context_add_taskpool( parsec, dtd_tp ); + PARSEC_CHECK_ERROR(rc, "parsec_context_add_taskpool"); + rc = parsec_context_start(parsec); + PARSEC_CHECK_ERROR(rc, "parsec_context_start"); + + parsec_task_class_t *ping_tc = parsec_dtd_create_task_class(dtd_tp, "ping", + sizeof(int), PARSEC_VALUE | PARSEC_AFFINITY | PARSEC_PROFILE_INFO, "rank", + PASSED_BY_REF, PARSEC_OUTPUT | TILE_FULL, + sizeof(int), PARSEC_VALUE | PARSEC_PROFILE_INFO, "nb", + PARSEC_DTD_ARG_END); + parsec_dtd_task_class_add_chore(dtd_tp, ping_tc, PARSEC_DEV_CPU, cpu_ping); + parsec_task_class_t *pong_tc = parsec_dtd_create_task_class(dtd_tp, "pong", + sizeof(int), PARSEC_VALUE | PARSEC_AFFINITY | PARSEC_PROFILE_INFO, "rank", + PASSED_BY_REF, PARSEC_INOUT | TILE_FULL | PARSEC_PUSHOUT, + sizeof(int), PARSEC_VALUE | PARSEC_PROFILE_INFO, "idx", + PARSEC_DTD_ARG_END); + parsec_dtd_task_class_add_chore(dtd_tp, pong_tc, PARSEC_DEV_CPU, cpu_pong); +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + parsec_dtd_task_class_add_chore(dtd_tp, pong_tc, PARSEC_DEV_CUDA, cuda_pong); +#endif +#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) + parsec_dtd_task_class_add_chore(dtd_tp, pong_tc, PARSEC_DEV_HIP, hip_pong); +#endif + + parsec_dtd_tile_t *tile = parsec_dtd_tile_new(dtd_tp, 0); + parsec_dtd_insert_task_with_task_class(dtd_tp, ping_tc, 0, + PARSEC_DEV_CPU, + PARSEC_DTD_EMPTY_FLAG, &rank, + PARSEC_DTD_EMPTY_FLAG, tile, + PARSEC_DTD_EMPTY_FLAG, &nb, + PARSEC_DTD_ARG_END); + + int idx = 0; + for(int loop = 0; loop < NLOOP; loop++) { + for(int rank = 0; rank < world; rank++) { + for(int dev = 0; dev < nb_gpus; dev++) { + parsec_dtd_insert_task_with_task_class(dtd_tp, pong_tc, 0, + PARSEC_DEV_CPU, + PARSEC_DTD_EMPTY_FLAG, &rank, + PARSEC_DTD_EMPTY_FLAG, tile, + PARSEC_DTD_EMPTY_FLAG, &idx, + PARSEC_DTD_ARG_END); + idx+=1; + parsec_advise_data_on_device(tile->data_copy->original, gpu_devices[dev]->device_index, 
PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); + parsec_dtd_insert_task_with_task_class(dtd_tp, pong_tc, 0, + gpu_devices[dev]->type, + PARSEC_DTD_EMPTY_FLAG, &rank, + PARSEC_DTD_EMPTY_FLAG, tile, + PARSEC_DTD_EMPTY_FLAG, &idx, + PARSEC_DTD_ARG_END); + idx+=1; + parsec_dtd_insert_task_with_task_class(dtd_tp, pong_tc, 0, + gpu_devices[dev]->type, + PARSEC_DTD_EMPTY_FLAG, &rank, + PARSEC_DTD_EMPTY_FLAG, tile, + PARSEC_DTD_EMPTY_FLAG, &idx, + PARSEC_DTD_ARG_END); + idx+=1; + } + } + } + + /* Rank 0 keeps the tile alive for checking, the others can flush it out */ + if(0 == rank) { + PARSEC_OBJ_RETAIN(tile); + } + parsec_dtd_data_flush(dtd_tp, tile); + + rc = parsec_taskpool_wait( dtd_tp ); + PARSEC_CHECK_ERROR(rc, "parsec_taskpool_wait"); + rc = parsec_context_wait(parsec); + PARSEC_CHECK_ERROR(rc, "parsec_context_wait"); + + if(0 == rank) { + int32_t *data = parsec_data_copy_get_ptr(tile->data_copy); + for(int n = 0; n < nb; n++) { + if(2*n != data[n]) { + printf("Final value at index %d is %d, expected %d\n", n, data[n], 2*n); + nb_errors++; + } + } + PARSEC_OBJ_RELEASE(tile); + } + + parsec_dtd_task_class_release(dtd_tp, ping_tc); + + parsec_del2arena(adt); + PARSEC_OBJ_RELEASE(adt->arena); + parsec_dtd_destroy_arena_datatype(parsec, TILE_FULL); + + parsec_taskpool_free( dtd_tp ); + parsec_fini(&parsec); + +#ifdef PARSEC_HAVE_MPI + MPI_Finalize(); +#endif + + if(nb_errors > 0) + return EXIT_FAILURE; + return EXIT_SUCCESS; +} diff --git a/tests/runtime/cuda/get_best_device_check.jdf b/tests/runtime/gpu/get_best_device_check.jdf similarity index 100% rename from tests/runtime/cuda/get_best_device_check.jdf rename to tests/runtime/gpu/get_best_device_check.jdf diff --git a/tests/runtime/cuda/nvlink.jdf b/tests/runtime/gpu/nvlink.jdf similarity index 100% rename from tests/runtime/cuda/nvlink.jdf rename to tests/runtime/gpu/nvlink.jdf diff --git a/tests/runtime/cuda/nvlink_main.c b/tests/runtime/gpu/nvlink_main.c similarity index 100% rename from tests/runtime/cuda/nvlink_main.c rename to tests/runtime/gpu/nvlink_main.c diff --git a/tests/runtime/cuda/nvlink_wrapper.c b/tests/runtime/gpu/nvlink_wrapper.c similarity index 100% rename from tests/runtime/cuda/nvlink_wrapper.c rename to tests/runtime/gpu/nvlink_wrapper.c diff --git a/tests/runtime/cuda/nvlink_wrapper.h b/tests/runtime/gpu/nvlink_wrapper.h similarity index 100% rename from tests/runtime/cuda/nvlink_wrapper.h rename to tests/runtime/gpu/nvlink_wrapper.h diff --git a/tests/runtime/gpu/ping_kernel.cu b/tests/runtime/gpu/ping_kernel.cu new file mode 100644 index 000000000..36ee6f766 --- /dev/null +++ b/tests/runtime/gpu/ping_kernel.cu @@ -0,0 +1,13 @@ +extern "C" { +void cuda_pong_kernel(int *dev_data, int idx); +} + +__global__ void pong_kernel(int *dev_data, int idx) +{ + dev_data[idx] += idx; +} + +void cuda_pong_kernel(int *dev_data, int idx) +{ + pong_kernel<<<1, 1>>>(dev_data, idx); +} diff --git a/tests/runtime/gpu/ping_kernel.hip.c b/tests/runtime/gpu/ping_kernel.hip.c new file mode 100644 index 000000000..7ee3ec3d1 --- /dev/null +++ b/tests/runtime/gpu/ping_kernel.hip.c @@ -0,0 +1,15 @@ +#include + +extern "C" { +void hip_pong_kernel(int *dev_data, int idx); +} + +__global__ void pong_kernel(int *dev_data, int idx) +{ + dev_data[idx] += idx; +} + +void hip_pong_kernel(int *dev_data, int idx) +{ + hipLaunchKernelGGL(pong_kernel, 1, 1, 0, 0, dev_data, idx); +} diff --git a/tests/runtime/gpu/ptg_pingpong.jdf b/tests/runtime/gpu/ptg_pingpong.jdf new file mode 100644 index 000000000..aead74fa0 --- /dev/null +++ 
b/tests/runtime/gpu/ptg_pingpong.jdf @@ -0,0 +1,317 @@ +extern "C" %{ +/* + * Copyright (c) 2023 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + */ + +#include "parsec/parsec_config.h" +#include "parsec/utils/mca_param.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" + +#include +#include +#include +#include + +#include "ptg_pingpong.h" + +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) +extern void cuda_pong_kernel(int *dev_data, int idx); +#endif +#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) +extern void hip_pong_kernel(int *dev_data, int idx); +#endif + +int rank_of_token(int k); +int device_of_token(int k); + +%} + +/* + * Globals + */ + + +dist [type = "parsec_tiled_matrix_t*"] +NB_TOKEN [type = "int"] +NB_ERR [type = "int*"] + +INIT(k) + k = 0 .. 0 + +: dist(0, 0) + +WRITE T <- NEW + -> T TOKEN_CPU(0) + +BODY +{ + int *tile = (int*)T; + for(int i = 0; i < 2*NB_TOKEN; i++) tile[i] = i; +} +END + +TOKEN_CPU(k) + k = 0 .. NB_TOKEN-1 + r = %{ return rank_of_token(k); %} + +: dist(r, 0) + +RW T <- k == 0 ? T INIT(0) : T TOKEN_GPU(k-1, 1) + -> T TOKEN_GPU(k, 0) + +BODY +{ + int *tile = (int*)T; + tile[2*k] += 2*k; + tile[2*k+1] += 2*k+1; +} +END + +LOAD_DIST(r, d) + r = 0 .. dist->lmt-1 + d = 0 .. dist->lnt-1 + kn = %{ int k; int n = 0; for(k = 0; k < NB_TOKEN; k++) { if( rank_of_token(k) == r && device_of_token(k) == d ) { n++;} } return n; %} +: dist(r, d) + +D <- dist(r, d) + -> [ik = 0 .. kn-1] 1 ? D TOKEN_GPU(%{ int k; + int i = 0; + for(k = 0; k < NB_TOKEN; k++) { + if(rank_of_token(k) == r && device_of_token(k) == d) { + if(ik == i) + return k; + i++; + } + } + assert(0); + return -1; + %}, 0..1) + +BODY +{ + /* nothing */ +} +END + +TOKEN_GPU(k, l) + k = 0 .. NB_TOKEN-1 + l = 0 .. 1 + r = %{ return rank_of_token(k); %} + d = %{ return device_of_token(k); %} + +: dist(r, d) + +RW D <- D LOAD_DIST(r, d) + -> dist(r, d) +RW T <- l == 0 ? T TOKEN_CPU(k) + <- l == 1 ? T TOKEN_GPU(k, 0) + -> l == 0 ? T TOKEN_GPU(k, 1) + -> l == 1 && k < NB_TOKEN-1 ? T TOKEN_CPU(k+1) + -> l == 1 && k == NB_TOKEN-1 ? T CHECK(0) + +BODY [type=CUDA] +{ + cuda_pong_kernel(T, 2*k+l); +} +END + +BODY [type=HIP] +{ + hip_pong_kernel(T, 2*k+l); +} +END + +CHECK(k) + k = 0 .. 
0 + +: dist(0, 0) + +READ T <- T TOKEN_GPU(NB_TOKEN-1, 1) + +BODY +{ + int *tile = (int*)T; + for(int i = 0; i < 2*NB_TOKEN; i++) { + if(tile[i] != 3*i) { + fprintf(stderr, "Error in element %d: expecting %d got %d\n", i, 3*i, tile[i]); + (*NB_ERR)++; + } + } + if(*NB_ERR) { + fprintf(stderr, "Expected: "); + for(int i = 0; i < 2*NB_TOKEN; i++) { + fprintf(stderr, "%3d ", 3*i); + } + fprintf(stderr, "\nReceived: "); + for(int i = 0; i < 2*NB_TOKEN; i++) { + fprintf(stderr, "%3d ", tile[i]); + } + fprintf(stderr, "\n"); + } +} +END + +extern "C" %{ + +#define NLOOP 8 + +typedef struct { + int rank; + int device; +} token_t; + +static token_t *tokens; + +int rank_of_token(int k) +{ + return tokens[k].rank; +} + +int device_of_token(int k) +{ + return tokens[k].device; +} + +int main( int argc, char** argv ) +{ + parsec_ptg_pingpong_taskpool_t* tp; + parsec_matrix_block_cyclic_t dist; + parsec_arena_datatype_t adt; + parsec_datatype_t dt; + parsec_context_t *parsec; + int nb_gpus = 0; + int nb_token, rc; + int world = 1; + int rank = 0; + int nb_err = 0; + +#ifdef PARSEC_HAVE_MPI + { + int provided; + MPI_Init_thread(NULL, NULL, MPI_THREAD_SERIALIZED, &provided); + MPI_Comm_size(MPI_COMM_WORLD, &world); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + } +#endif + + parsec = parsec_init(-1, &argc, &argv); + if( NULL == parsec ) { + exit(-1); + } + + /** + * Build the gpu_devices that stores the parsec index of the different + * GPUs available, and compute the number of GPUs available on this node + */ + int *gpu_devices; + for(unsigned int i = 0; i < parsec_nb_devices; i++) { + parsec_device_module_t *dev = parsec_mca_device_get(i); +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + if( dev->type == PARSEC_DEV_CUDA ) + nb_gpus++; +#endif +#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) + if( dev->type == PARSEC_DEV_HIP ) + nb_gpus++; +#endif + } + if(0 == nb_gpus) { + if(0 == rank) { + fprintf(stderr, "Warning: test disabled because there is no GPU detected with this run\n"); + } + parsec_fini(&parsec); + MPI_Finalize(); + return EXIT_SUCCESS; /* So that useless tests don't make the CI fail */ + } + gpu_devices = (int*)malloc(sizeof(int)*nb_gpus); + nb_gpus = 0; + for(unsigned int i = 0; i < parsec_nb_devices; i++) { + parsec_device_module_t *dev = parsec_mca_device_get(i); +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + if( dev->type == PARSEC_DEV_CUDA ) + gpu_devices[nb_gpus++] = i; +#endif +#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) + if( dev->type == PARSEC_DEV_HIP ) + gpu_devices[nb_gpus++] = i; +#endif + } + + /** + * Build the token circulation plan + */ + nb_token = NLOOP*nb_gpus*world; + tokens = (token_t*)malloc(nb_token * sizeof(token_t)); + nb_token = 0; + for(int l = 0; l < NLOOP; l++) { + for(int r = 0; r < world; r++) { + for(int d = 0; d < nb_gpus; d++) { + tokens[nb_token].rank = r; + tokens[nb_token].device = gpu_devices[d]; + nb_token++; + } + } + } + + /** + * Create the distribution that is used to force things to run on a specific device + * To avoid creating an additional datatype, we allocate each tile of the dist matrix + * big enough to store the token. But that matrix will not store tokens. 
+ */ + parsec_matrix_block_cyclic_init(&dist, PARSEC_MATRIX_FLOAT, PARSEC_MATRIX_TILE, + rank, + 2*nb_token, 1, + world*2*nb_token, parsec_nb_devices, + 0, 0, + world*2*nb_token, parsec_nb_devices, + 1, 1, + 1, 1, + 0, 0); + assert((size_t)parsec_datadist_getsizeoftype(dist.super.mtype) >= sizeof(int)); + size_t distSize = (size_t)dist.super.nb_local_tiles * + (size_t)dist.super.bsiz * + (size_t)parsec_datadist_getsizeoftype(dist.super.mtype); + dist.mat = parsec_data_allocate(distSize); + memset(dist.mat, 0, distSize); + parsec_translate_matrix_type(PARSEC_MATRIX_FLOAT, &dt); + parsec_add2arena(&adt, dt, PARSEC_MATRIX_FULL, 1, dist.super.mb, dist.super.nb, dist.super.mb, PARSEC_ARENA_ALIGNMENT_SSE, -1); + + /** + * Set which device is preferred for each tile of dist, so the PTG / token plan can + * define which device will hold which token + */ + for(int d = 0; d < (int)parsec_nb_devices; d++) { + parsec_data_t *dta = dist.super.super.data_of(&dist.super.super, rank, d); + parsec_advise_data_on_device(dta, d, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); + } + + /* Start the PaRSEC engine */ + rc = parsec_context_start(parsec); + PARSEC_CHECK_ERROR(rc, "parsec_context_start"); + + tp = parsec_ptg_pingpong_new( (parsec_tiled_matrix_t *)&dist, nb_token, &nb_err ); + assert( NULL != tp ); + tp->arenas_datatypes[PARSEC_ptg_pingpong_DEFAULT_ADT_IDX] = adt; + + rc = parsec_context_add_taskpool( parsec, (parsec_taskpool_t*)tp ); + PARSEC_CHECK_ERROR(rc, "parsec_context_add_taskpool"); + rc = parsec_context_wait(parsec); + parsec_taskpool_free(&tp->super); + PARSEC_CHECK_ERROR(rc, "parsec_context_wait"); + + free(dist.mat); + parsec_del2arena( & adt ); + + parsec_fini( &parsec); + +#ifdef PARSEC_HAVE_MPI + MPI_Finalize(); +#endif + + return nb_err == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} + + +%} diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/gpu/stage_custom.jdf similarity index 100% rename from tests/runtime/cuda/stage_custom.jdf rename to tests/runtime/gpu/stage_custom.jdf diff --git a/tests/runtime/cuda/stage_main.c b/tests/runtime/gpu/stage_main.c similarity index 100% rename from tests/runtime/cuda/stage_main.c rename to tests/runtime/gpu/stage_main.c diff --git a/tests/runtime/cuda/stress.jdf b/tests/runtime/gpu/stress.jdf similarity index 100% rename from tests/runtime/cuda/stress.jdf rename to tests/runtime/gpu/stress.jdf diff --git a/tests/runtime/cuda/stress_main.c b/tests/runtime/gpu/stress_main.c similarity index 100% rename from tests/runtime/cuda/stress_main.c rename to tests/runtime/gpu/stress_main.c diff --git a/tests/runtime/cuda/stress_wrapper.c b/tests/runtime/gpu/stress_wrapper.c similarity index 100% rename from tests/runtime/cuda/stress_wrapper.c rename to tests/runtime/gpu/stress_wrapper.c diff --git a/tests/runtime/cuda/stress_wrapper.h b/tests/runtime/gpu/stress_wrapper.h similarity index 100% rename from tests/runtime/cuda/stress_wrapper.h rename to tests/runtime/gpu/stress_wrapper.h diff --git a/tests/runtime/cuda/testing_get_best_device.c b/tests/runtime/gpu/testing_get_best_device.c similarity index 100% rename from tests/runtime/cuda/testing_get_best_device.c rename to tests/runtime/gpu/testing_get_best_device.c
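Usage sketch (illustrative only): the cmake options and ctest names below come from this patch, while the source/build directory layout, the -S/-B invocation, and the ctest flags are assumptions about a typical local setup on a node with NVIDIA GPUs; swap -DPARSEC_GPU_WITH_CUDA/-DPARSEC_GPU_WITH_HIP and use gpu_amd for an AMD node.

    cmake -S . -B build -G Ninja -DPARSEC_GPU_WITH_CUDA=ON -DPARSEC_GPU_WITH_HIP=OFF -DPARSEC_REQUIRE_DEVICE_TEST=gpu_nvidia
    cmake --build build
    ctest --test-dir build -R 'runtime/gpu/' --output-on-failure

With PARSEC_REQUIRE_DEVICE_TEST set, runtime/gpu/device_support:cuda fails outright when CUDA support was not detected at configure time; otherwise runtime/gpu/device_present:cuda runs check_nb_devices.sh, and the dtd_pingpong, ptg_pingpong, and nvlink tests only run once that fixture has passed.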