diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index a65f13cc17..0c3cb61f9f 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -13,8 +13,10 @@ jobs: os : [ macos-latest, ubuntu-20.04 ] include: - os: ubuntu-20.04 - cxx: /usr/bin/g++-9 + cc: /usr/bin/gcc-10 + cxx: /usr/bin/g++-10 - os: macos-latest + cc: clang cxx: clang++ name: "${{ matrix.os }}: ${{ matrix.cxx }} ${{ matrix.build_type }}" @@ -36,6 +38,7 @@ jobs: -DMPIEXEC_PREFLAGS='--bind-to;none;--allow-run-as-root' -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/install -DTTG_EXAMPLES=ON + -DCMAKE_CXX_STANDARD=20 steps: - uses: actions/checkout@v2 @@ -50,7 +53,7 @@ jobs: wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" sudo apt-get update - sudo apt-get -y install ninja-build g++-9 liblapack-dev libboost-dev libboost-serialization-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison cmake + sudo apt-get -y install ninja-build g++-10 liblapack-dev libboost-dev libboost-serialization-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison cmake - name: Create Build Environment # Some projects don't allow in-source building, so create a separate build directory @@ -99,7 +102,8 @@ jobs: # Note the current convention is to use the -S and -B options here to specify source # and build directories, but this is only available with CMake 3.13 and higher. # The CMake binaries on the Github Actions machines are (as of this writing) 3.12 - run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG + run: | + cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG || (cat CMakeFiles/CMakeOutput.log && cat CMakeFiles/CMakeError.log) - name: Build working-directory: ${{github.workspace}}/build @@ -124,7 +128,7 @@ jobs: working-directory: ${{github.workspace}}/build shell: bash run: | - cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/main -B test_install_devsamp -DCMAKE_PREFIX_PATH=${{github.workspace}}/install + cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/main -B test_install_devsamp -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp/CMakeFiles/CMakeOutput.log && cat test_install_devsamp/CMakeFiles/CMakeError.log) cmake --build test_install_devsamp cmake -E make_directory test_install_userexamples cat > test_install_userexamples/CMakeLists.txt <` will refer to the ``Final`` version. + + +Imported Targets +**************** + +.. imp-target:: std::coroutine + + The ``std::coroutine`` imported target is defined when any requested + version of the C++ coroutine library has been found, whether it is + *Experimental* or *Final*. + + If no version of the coroutine library is available, this target will not + be defined. + + .. note:: + This target has ``cxx_std_20`` as an ``INTERFACE`` + :ref:`compile language standard feature `. Linking + to this target will automatically enable C++20 if no later standard + version is already required on the linking target. + + +.. coro.variables: + +Variables +********* + +.. variable:: CXX_COROUTINE_COMPONENT + + Set to ``Final`` when the :find-component:`coro.Final` version of C++ + coroutine library was found, ``Experimental`` when + the :find-component:`coro.Experimental` version of C++ + coroutine library was found, otherwise not defined. + +.. 
variable:: CXX_COROUTINE_HAVE_CORO + + Set to ``TRUE`` when a coroutine header was found. + +.. variable:: CXX_COROUTINE_HEADER + + Set to either ``coroutine`` or ``experimental/coroutine`` depending on + whether :find-component:`coro.Final` or :find-component:`coro.Experimental` was + found. + +.. variable:: CXX_COROUTINE_NAMESPACE + + Set to either ``std::coroutine`` or ``std::experimental::coroutine`` + depending on whether :find-component:`coro.Final` or + :find-component:`coro.Experimental` was found. + + +Examples +******** + +Using `find_package(Coroutine)` with no component arguments: + +.. code-block:: cmake + + find_package(Coroutine REQUIRED) + + add_executable(my-program main.cpp) + target_link_libraries(my-program PRIVATE std::coroutine) + + +#]=======================================================================] + + +if(TARGET std::coroutine) + # This module has already been processed. Don't do it again. + return() +endif() + +include(CMakePushCheckState) +include(CheckIncludeFileCXX) +include(CheckCXXSourceCompiles) + +cmake_push_check_state() + +set(CMAKE_REQUIRED_QUIET ${CXXStdCoroutine_FIND_QUIETLY}) + +# Normalize and check the component list we were given +set(CXXStdCoroutines_want_components ${CXXStdCoroutine_FIND_COMPONENTS}) +if(CXXStdCoroutine_FIND_COMPONENTS STREQUAL "") + set(CXXStdCoroutines_want_components Final) +endif() + +# Warn on any unrecognized components +set(CXXStdCoroutines_extra_components ${CXXStdCoroutines_want_components}) +list(REMOVE_ITEM CXXStdCoroutines_extra_components Final Experimental) +foreach(component IN LISTS CXXStdCoroutines_extra_components) + message(WARNING "Extraneous find_package component for CXXStdCoroutine: ${component}") +endforeach() + +# clang may need to use -stdlib=c++ to have coroutines +# gcc/libstdc++ needs -fcoroutines +set(CXXStdCoroutines_find_options "" "-stdlib=libc++" "-fcoroutines") +set(CXXStdCoroutines_std_options "" "-std=c++20" "-std=c++2a") +set(CXXStdCoroutines_want_components_ordered "${CXXStdCoroutines_want_components}") +list(SORT CXXStdCoroutines_want_components_ordered ORDER DESCENDING) # Final before Experimental + +foreach(component IN LISTS CXXStdCoroutines_want_components_ordered) + if(component STREQUAL "Final") + set(_coro_header coroutine) + set(_coro_namespace std) + else() + set(_coro_header experimental/coroutine) + set(_coro_namespace std::experimental) + endif() + foreach(option IN LISTS CXXStdCoroutines_find_options) + foreach(stdoption IN LISTS CXXStdCoroutines_std_options) + cmake_push_check_state() + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${option} ${stdoption}") + + string(CONFIGURE [[ + #include <@_coro_header@> + + int main() { + auto x = @_coro_namespace@::suspend_always{}; + return 0; + } + ]] code @ONLY) + + check_cxx_source_compiles("${code}" HAVE_USABLE_${_coro_header}) + mark_as_advanced(HAVE_USABLE_${_coro_header}) + cmake_pop_check_state() + if(HAVE_USABLE_${_coro_header}) + add_library(std::coroutine INTERFACE IMPORTED GLOBAL) + target_compile_features(std::coroutine INTERFACE cxx_std_20) + if (option) + target_compile_options(std::coroutine INTERFACE "${option}") + endif() + set(CXX_COROUTINE_COMPONENT "${component}" CACHE STRING "The component of CXXStdCoroutine package found") + # break out of this loop + break() + else() + unset(HAVE_USABLE_${_coro_header} CACHE) + endif() + endforeach() # stdoption + if (TARGET std::coroutine) + break() + endif() + endforeach() # option + if (TARGET std::coroutine) + break() + endif() +endforeach() # components + 
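+# At this point the detection loops are done: if any (component, compiler-flag,
+# -std flag) combination compiled, the std::coroutine target exists and
+# _coro_header / _coro_namespace name the header and namespace that worked.
+# The cache variables below publish that result; if nothing compiled they record
+# the last attempted combination and CXXStdCoroutine_FOUND ends up FALSE.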
+set(CXX_COROUTINE_HAVE_CORO ${HAVE_USABLE_${_coro_header}} CACHE BOOL "TRUE if we have usable C++ coroutine headers") +set(CXX_COROUTINE_HEADER ${_coro_header} CACHE STRING "The header that should be included to obtain the coroutine APIs") +set(CXX_COROUTINE_NAMESPACE ${_coro_namespace} CACHE STRING "The C++ namespace that contains the coroutine APIs") + +cmake_pop_check_state() + +set(CXXStdCoroutine_FOUND ${HAVE_USABLE_${_coro_header}} CACHE BOOL "TRUE if we have usable C++ coroutine headers" FORCE) + +if(CXXStdCoroutine_FIND_REQUIRED AND NOT TARGET std::coroutine) + message(FATAL_ERROR "Cannot discover std::coroutine headers and/or compile simple program using std::coroutine") +endif() diff --git a/cmake/modules/FindCXXStdExecution.cmake b/cmake/modules/FindCXXStdExecution.cmake index 059368d1f1..acb6b1730d 100644 --- a/cmake/modules/FindCXXStdExecution.cmake +++ b/cmake/modules/FindCXXStdExecution.cmake @@ -9,8 +9,8 @@ FindCXXStdExecution ############## -This module supports the C++17 standard library's execution utilities. Use the -:imp-target:`std::execution` imported target to +This module supports the C++17 standard library's execution utilities. Link your target to the +:imp-target:`std::execution` imported target to provide standard C++ execution API. Imported Targets **************** @@ -69,9 +69,6 @@ cmake_push_check_state() set(CMAKE_REQUIRED_QUIET ${CXXStdExecution_FIND_QUIETLY}) -# All of our tests required C++17 or later -set(CMAKE_CXX_STANDARD 17) - set(CXXStdExecution_FOUND FALSE) # We have execution header, but how do we use it? Do link checks @@ -87,7 +84,7 @@ string(CONFIGURE [[ } ]] code @ONLY) -# Try to compile a simple filesystem program without any linker flags +# Try to compile a simple execution program without any linker flags check_cxx_source_compiles("${code}" CXX_EXECUTION_NO_LINK_NEEDED) set(CXXStdExecution_CAN_LINK ${CXX_EXECUTION_NO_LINK_NEEDED}) diff --git a/cmake/modules/FindOrFetchPARSEC.cmake b/cmake/modules/FindOrFetchPARSEC.cmake index 1d18b66b5b..e2854e591a 100644 --- a/cmake/modules/FindOrFetchPARSEC.cmake +++ b/cmake/modules/FindOrFetchPARSEC.cmake @@ -16,7 +16,7 @@ if (NOT TARGET PaRSEC::parsec) FetchContent_Declare( PARSEC - GIT_REPOSITORY https://github.com/TESSEOrg/parsec.git + GIT_REPOSITORY https://github.com/therault/parsec.git GIT_TAG ${TTG_TRACKED_PARSEC_TAG} ) FetchContent_MakeAvailable(PARSEC) diff --git a/cmake/ttg-config.cmake.in b/cmake/ttg-config.cmake.in index b00a9c692f..41663c806b 100644 --- a/cmake/ttg-config.cmake.in +++ b/cmake/ttg-config.cmake.in @@ -7,11 +7,16 @@ set(TTG_EXT_VERSION "@TTG_EXT_VERSION@") set(PaRSEC_CONFIG "@PaRSEC_CONFIG@") set(MADNESS_CONFIG "@MADNESS_CONFIG@") set(Boost_CONFIG "@Boost_CONFIG@") +set(CXX_COROUTINE_COMPONENT "@CXX_COROUTINE_COMPONENT@") set(TTG_TRACKED_BOOST_VERSION "@TTG_TRACKED_BOOST_VERSION@") set(TTG_IGNORE_BUNDLED_EXTERNALS @TTG_IGNORE_BUNDLED_EXTERNALS@) +# make TTG CMake modules discoverable + load AddTTGExecutable by default +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules") +include(AddTTGExecutable) + @PACKAGE_INIT@ if (NOT TARGET MADworld AND MADNESS_CONFIG) @@ -24,12 +29,17 @@ if (NOT TARGET PaRSEC::parsec AND PaRSEC_CONFIG) find_package(PaRSEC CONFIG QUIET REQUIRED COMPONENTS parsec PATHS "${PaRSEC_CONFIG_DIR}" NO_DEFAULT_PATH) endif() -# N.B. 
load Boost
+# if Boost was discovered and used at TTG configure time, discover Boost at the same path
 if (NOT TARGET Boost::boost AND Boost_CONFIG)
   get_filename_component(Boost_CONFIG_DIR "${Boost_CONFIG}" DIRECTORY)
   find_package(Boost ${TTG_TRACKED_BOOST_VERSION} CONFIG QUIET REQUIRED OPTIONAL_COMPONENTS serialization PATHS "${Boost_CONFIG_DIR}" NO_DEFAULT_PATH)
 endif()

+# if C++ coroutines were used, discover the same version of them
+if (NOT TARGET std::coroutine AND CXX_COROUTINE_COMPONENT)
+  find_package(CXXStdCoroutine MODULE QUIET REQUIRED COMPONENTS "${CXX_COROUTINE_COMPONENT}")
+endif()
+
 # Include library IMPORT targets
 if(NOT TARGET ttg)
   include("${CMAKE_CURRENT_LIST_DIR}/ttg-targets.cmake")
@@ -47,8 +57,4 @@ if (NOT TARGET Boost::boost)
   endif()
 endif()

-# load CMake modules
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules")
-include(AddTTGExecutable)
-
 set(TTG_FOUND TRUE)
diff --git a/doc/dox/contrib/CI-Administration-Notes.md b/doc/dox/contrib/CI-Administration-Notes.md
index 32542b9709..c5dd9430af 100644
--- a/doc/dox/contrib/CI-Administration-Notes.md
+++ b/doc/dox/contrib/CI-Administration-Notes.md
@@ -4,7 +4,7 @@
 * TTG uses GitHub Actions (GHA) for its CI service
 * GHA CI configuration is in file `.github/workflows/cmake.yml`. Only Linux and MacOS builds are currently supported.
 * Unlike earlier CI setups, there is no need to cache TTG prerequisites; default system-wide packages are used for most prerequisites, and the rest is compiled from source every time.
-* Doxygen documentation deployment uses a Github token that is defined as variable `GH_TTG_TOKEN` in GHA's TTG repo settings' [secrets](https://github.com/TESSEorg/ttg/settings/secrets/actions).
+* Doxygen documentation deployment uses a GitHub token that is defined as variable `GH_TTG_TOKEN` in GHA's TTG repo settings' [secrets](https://github.com/TESSEorg/ttg/settings/secrets/actions).

 # Debugging GitHub Actions jobs
diff --git a/doc/dox/contrib/Design-Device.md b/doc/dox/contrib/Design-Device.md
new file mode 100644
index 0000000000..2c40f64fda
--- /dev/null
+++ b/doc/dox/contrib/Design-Device.md
@@ -0,0 +1,175 @@
+# Device Task Design {#Design-Device}
+
+## problem statement
+TTG must be able to execute general user-defined graphs on machines with heterogeneous execution and address spaces, e.g., using multiple processes each having multiple CPU threads + device streams, with each thread/stream preferring or limited to a specific address range.
+
+## key concerns
+The key issues are how to manage:
+- the asynchrony of the device programming models, and
+- the heterogeneity of the address space.
+
+There are multiple "solutions" to each issue, hence there are many possible designs. I'll discuss each issue first, then outline the aggregate designs we are pursuing.
+
+### Memory
+- *Unified Memory (UM)*: where available, use a single address space (unified memory visible to both host and device executors; it is also possible to use pinned host memory for device calls)
+  - pro: simplifies memory management by removing the capacity limitation
+  - con: still needs user cooperation: all compute data must be allocated on the UM heap, which impacts the design of user data types, e.g. making them allocator aware, etc.
+  - con: the user will likely need to use pooled memory management for efficiency reasons (e.g., TiledArray uses Umpire)
+  - con: still necessary to provide hints to the kernel driver managing UM.
+  - con: reasoning about UM driver performance is difficult; its details are opaque and platform dependent.
+- *Device Memory (DM)*: using "native" device memory.
+  - pro: simpler performance model due to the greatest amount of control (by the runtime) over execution
+  - pro: can work with stack-capable data types
+  - con: the amount is limited, hence this memory must be explicitly managed (akin to how a cache is managed).
+
+Additional memory-related concerns common to both models:
+- only partial state needs to be transferred to/from the device
+  - which part of the state will differ from algorithm to algorithm, hence encoding/constructing such a representation cannot use constexpr code (such as traits)
+  - the need for _explicit_ handling of an object's partial state is shared by both models
+    - UM: such optimization may seem automatic (only the pages of the data actually used on the device are transferred) but in practice the data must be explicitly prefetched, hence partial state transfers are not automatic; furthermore, the unit of UM transfer is a page (4 KiB or more), which is too coarse for many applications
+    - DM: serialization of an entire object can leverage standard RDMA-like serialization, but transferring partial state requires explicit annotation
+  - hence it makes sense to make the representation of an object's partial state (`View`) a first-class concept in both models.
+
+### Asynchrony
+- *Continuations/stages*: decompose tasks into _continuations_ (stages), with runtime-managed scheduling of continuations for managing the asynchrony of the actions initiated by each continuation
+  - pro: most explicit, easiest to reason about, fewest performance implications
+  - con: most verbose; device-capable tasks look very different from host tasks
+  - con: limited composability
+    - difficult to support general computation patterns (e.g., generator continuations)
+- *"Threads"*: use threads to deal with the asynchrony (in principle could use user-space threads = fibers)
+  - pro: least host/device dichotomy
+    - tasks are ordinary (synchronous) functions
+    - fully composable
+  - con: performance implications
+    - due to the need to context switch to "yield" to other tasks
+    - thus even fully synchronous computations will suffer
+  - con: asynchrony artifacts still appear
+    - asynchronous calls must in general be annotated (to force synchronous execution and/or to provide hints to the thread scheduler)
+- *"Coroutines"*: use C++20 coroutines
+  - pro: less host/device dichotomy compared to continuations
+    - task functions "look" like ordinary functions (and can be made almost like normal functions using macros) but return a custom return object (containing the return status + a handle to the coroutine) instead of void
+    - fully composable
+  - performance implications
+    - pro: no impact on synchronous tasks
+    - con: coroutine implementation details are complex and usually involve heap allocation
+      - pro: custom allocators can be introduced to elide heap allocation (at the cost of limited generality)
+  - con: asynchrony artifacts still appear
+    - co_await annotates the spots where execution may need to be suspended
+  - con: less mature due to the need for C++20
+    - GCC (10+) and LLVM (8+) support coroutines
+    - TTG and all of its dependencies will be impacted by the raised standard requirement
+
+### other considerations
+
+- it's not possible to manage memory from device code, hence all program logic, including _device-capable_ tasks, must execute on host executors.
In principle, if we restricted ourselves to a single-source language (SYCL-extended C++) we could write device-capable tasks directly as device code, but current language limitations mandate wrapping everything into host code.
+- the runtime is still responsible for managing the executor-space heterogeneity (controlling where to launch a task) and asynchrony (events/host callbacks).
+
+## Current designs
+- *UM+threads*: use UM for memory management + threads for asynchrony
+- *DM+stages*: use PaRSEC's device memory pool manager + stage-decomposed tasks
+- *?M+coroutines*: UM/DM for memory + C++20 coroutines for handling the asynchrony
+
+### Example code: threads vs continuations vs coroutines
+
+How should we map the following host task onto the device?
+```cpp
+make_tt([](auto& key, auto& data1, auto& data2) -> void {
+  double data3 = blap::dot(data1.data(), data2.data());
+  if (data3 >= 0.)
+    send<0>(data1);
+  else
+    send<0>(data2);
+});
+```
+
+Ideally the task will receive `data1` and `data2` already transferred to the memory space(s) accessible from the device execution space:
+```cpp
+make_device_tt([](auto& key, auto& data1, auto& data2) -> void {
+  double data3 = blap::device_dot(data1.data(), data2.data());
+  if (data3 >= 0.)
+    send<0>(data1);
+  else
+    send<0>(data2);
+});
+```
+But now `data3` lives in host memory, so in general we must manage its transfer from the device. Hence either:
+- all intermediate data must be managed explicitly within the task, or
+- user types must be aware of multiple memory spaces (but this makes the state of such types asynchronous).
+
+Here are the tentative device versions of this task in each of the 3 approaches (the memory details are omitted).
+
+#### Threads
+```cpp
+make_tt([](auto& key, auto& data1, auto& data2) -> void {
+  // stage 1
+  ConstView view1(data1);
+  ConstView view2(data2);
+  double data3;
+  View view3(data3, NewView | SyncView_D2H);
+  // depending on the memory model we may need to wait here for the transfers to complete;
+  // could build the waits into the View ctors, or require an explicit await()
+
+  // stage 2
+  cublasDdot(view1.device_ptr(), view2.device_ptr(), view3.device_ptr());
+  // if an async function was called, an explicit await() is needed here
+  // also: who/how will view3 be synchronized?
+
+  if (data3 >= 0.)
+    send<0>(data1);
+  else
+    send<0>(data2);
+});
+```
+N.B. `make_tt`: this is a regular task.
+
+#### Continuations
+```cpp
+make_device_tt(
+  // stage 1
+  [](auto& key, auto& data1, auto& data2) {
+    ConstView view1(data1);
+    ConstView view2(data2);
+    double data3;
+    View view3(data3, NewView | SyncView_D2H);
+    return std::make_tuple(view1, view2, view3);
+  },
+  // stage 2
+  [](auto& key, auto& views) {
+    auto& [view1, view2, view3] = views;
+    cublasDdot(view1.device_ptr(), view2.device_ptr(), view3.device_ptr());
+  },
+  // stage 3
+  [](auto& key, auto& views) {
+    auto& [view1, view2, view3] = views;
+    if (*view3.host_ptr() >= 0.)
+      send<0>(view1.get_host_object());  // i.e. data1
+    else
+      send<0>(view2.get_host_object());  // i.e. data2
+  }
+);
+```
+N.B. `make_device_tt` vs `make_tt`: this is a special task.
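+
+The `View` machinery is deliberately left unspecified above. For orientation only, here is a minimal sketch of the shape such a type could take; the member names are assumptions for exposition (roughly consistent with how the sketches here and the mock example use it), not the actual TTG API:
+```cpp
+#include <cstddef>
+
+// Hypothetical sketch: pairs a host object with a typed span of memory that the
+// runtime mirrors on the device. device_ptr() becomes valid once the runtime
+// has completed (or been told to skip) the host-to-device transfer.
+template <typename HostT, typename T>
+class View {
+ public:
+  View(HostT& host, T* host_ptr, std::size_t count)
+      : host_(&host), host_ptr_(host_ptr), count_(count) {}
+  HostT& get_host_object() const { return *host_; }
+  T* host_ptr() const { return host_ptr_; }      // address valid on the host
+  T* device_ptr() const { return device_ptr_; }  // address valid on the device
+  std::size_t size() const { return count_; }
+ private:
+  HostT* host_;
+  T* host_ptr_;
+  T* device_ptr_ = nullptr;  // set by the runtime when the view is synced in
+  std::size_t count_;
+};
+```
+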
+
+#### Coroutines
+```cpp
+make_tt([](auto& key, auto& data1, auto& data2) -> ttg::resumable_task {
+  // stage 1
+  ConstView view1(data1);
+  ConstView view2(data2);
+  double data3;
+  View view3(data3, NewView | SyncView_D2H);
+  co_await sync_views(view1, view2, view3); // creates list of transfers to be fulfilled by the runtime
+
+  // stage 2
+  cublasDdot(view1.device_ptr(), view2.device_ptr(), view3.device_ptr());
+  co_await sync_views(view3); // syncs view3; since transfers and kernels execute in different streams the runtime will sync the kernel stream, then launch the transfers, then resume here
+
+  if (data3 >= 0.)
+    send<0>(data1);
+  else
+    send<0>(data2);
+  co_return; // processes sends and destroys the coroutine
+}, ...);
+```
+N.B. `make_tt` and `ttg::resumable_task`: this is a regular task but with a special return type.
diff --git a/doc/Pull-terminal-design-doc.md b/doc/dox/contrib/Pull-terminal-design-doc.md
similarity index 98%
rename from doc/Pull-terminal-design-doc.md
rename to doc/dox/contrib/Pull-terminal-design-doc.md
index 377c42b221..1af4ed11b7 100644
--- a/doc/Pull-terminal-design-doc.md
+++ b/doc/dox/contrib/Pull-terminal-design-doc.md
@@ -1,4 +1,4 @@
-# Pull Terminals - Design Notes
+# Pull Terminals Design Notes {#Design-Pull}

 ### Motivation

@@ -49,7 +49,3 @@
 - Should Pull Op be able to send data to multiple successors? Use cases?
   - Cholesky - why pull ops are needed?
-
-
-
-
diff --git a/doc/dox/contrib/top.md b/doc/dox/contrib/top.md
index 87fd1edd82..3cf224b2f4 100644
--- a/doc/dox/contrib/top.md
+++ b/doc/dox/contrib/top.md
@@ -1,7 +1,12 @@
 # Contributor Guide {#contribguide}

-* [TTG Build Intrastructure](@ref TTG-Build-Infrastructure)
+* Development
+  - [TTG Build Infrastructure](@ref TTG-Build-Infrastructure)
+  - [Documenting TTG](@ref Documenting-TTG)
+  - [Recommended Workflow Elements](@ref Recommended-Workflow-Elements)
+  - [Coding Standard](@ref Coding-Standard)
+* Design Notes
+  - [Pull Terminals](@ref Design-Pull)
+  - [Device Tasks](@ref Design-Device)
+* Maintenance
+  - [Managing Continuous Integration (CI)](@ref CI-Administration-Notes)
-* [Documenting TTG](@ref Documenting-TTG)
-* [Recommended Workflow Elements](@ref Recommended-Workflow-Elements)
-* [CodingStandard](@ref Coding-Standard)
diff --git a/examples/device_mock/device_mock.cc b/examples/device_mock/device_mock.cc
new file mode 100644
index 0000000000..8722591e70
--- /dev/null
+++ b/examples/device_mock/device_mock.cc
@@ -0,0 +1,188 @@
+// clang-format off
+
+#include
+#include
+#include "../matrixtile.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "ttg/util/meta.h"
+
+#include
+
+using Key2 = ttg::MultiIndex<2>;
+using Key3 = ttg::MultiIndex<3>;
+
+/* number of tiles */
+#define KT 100
+
+template <typename T>
+auto make_gemm(ttg::Edge<Key3, MatrixTile<T>>& A,
+               ttg::Edge<Key3, MatrixTile<T>>& B,
+               ttg::Edge<Key2, MatrixTile<T>>& output_result)
+{
+
+  ttg::Edge<Key3, MatrixTile<T>> C;
+  auto f_cpu = [=](const Key3& key,
+                   const MatrixTile<T>& A,
+                   const MatrixTile<T>& B,
+                   MatrixTile<T>& C,
+                   std::tuple<ttg::Out<Key2, MatrixTile<T>>,
+                              ttg::Out<Key3, MatrixTile<T>>>& out)
+  {
+    int m = key[0];
+    int n = key[1];
+    int k = key[2];
+
+    /*
+    if(k == 0) {
+      dlprng(C.data(), 1789, A.mb()*B.nb());
+    }
+    dgemm(A.data(), A.mb(), A.nb(),
+          B.data(), B.mb(), B.nb(),
+          1.0,
+          C.data(), C.nb(), C.nb());
+    */
+
+    if( k == KT-1 || C.data()[0] < 1e-9 ) {
+      ttg::send<0>(Key2{m, n}, std::move(C));
+    } else {
+      ttg::send<1>(Key3{m, n, k+1}, std::move(C));
+    }
+  };
+
+  auto f_gpu_host_views = [=](const Key3& key,
+                              const MatrixTile<T>& A,
+                              const MatrixTile<T>& B,
+                              MatrixTile<T>&& C)
+  {
+    // The default ViewScope::SyncIn scope tells the runtime that the data should be copied
// to the device before the kernel callable is invoked. + ttg::View, const T> dev_A = ttg::make_view( A, ttg::ViewSpan(A.data(), A.size()) ); + ttg::View, const T> dev_B = ttg::make_view( B, ttg::ViewSpan(B.data(), B.size()) ); + ttg::View, T> dev_C; + ttg::View dev_tmp; + T *host_tmp = new(T); + // ViewScope::SyncOut tells the runtime system that the view should be synchronized back to the + // host before invoking the output callable. + dev_tmp = ttg::make_view( *host_tmp, ttg::ViewSpan(host_tmp, 1, ttg::ViewScope::SyncOut) ); + + int k = key[2]; + if(0 == k) { + // ViewScope::Allocate tells the runtime system that the device view needs to be allocated but doesn't need to be + // initialized with C.data(). However, C.data() is still associated with the device memory, so if the + // runtime system evicts that data from the device, it will be first copied back into C.data(). + dev_C = ttg::make_view( C, ttg::ViewSpan(C.data(), C.size(), ttg::ViewScope::Allocate) ); + } else { + dev_C = ttg::make_view( C, ttg::ViewSpan(C.data(), C.size()) ); + } + + return std::make_tuple(dev_A, dev_B, dev_C, dev_tmp); + }; + + auto f_gpu_kernel = [=](const Key3& key, + ttg::View, const T>& dev_A, + ttg::View, const T>& dev_B, + ttg::View, T>& dev_C, + ttg::View& dev_tmp) + { + int k = key[2]; + + const MatrixTile& A = dev_A.get_host_object(); + const MatrixTile& B = dev_B.get_host_object(); + MatrixTile& C = dev_C.get_host_object(); + T& host_tmp = dev_tmp.get_host_object(); + auto beta = 1.0; + if(k == 0) { + //cublasDplrng(dev_C.get(0), C.mb(), C.nb()); + } + + /* + cublasDgemm(dev_A.get(0), A.mb(), A.nb(), + dev_B.get(0), B.mb(), B.nb(), + beta, + dev_C.get(0), C.mb(), C.nb()); + + cudaMemcpyAsync(&dev_C.get(0)[0], host_tmp, sizeof(T), cudaDeviceToHost); + */ + }; + + auto f_gpu_output_flows = [=](const Key3& key, + const MatrixTile& A, + const MatrixTile& B, + MatrixTile& C, + T& host_tmp) + { + int m = key[0]; + int n = key[1]; + int k = key[2]; + + if( k == KT-1 || host_tmp < 1e-9 ) { + ttg::send<0>(Key2{m, n}, std::move(C)); + } else { + ttg::send<1>(Key3{m, n, k+1}, std::move(C)); + } + delete &host_tmp; + }; + + //ttg::meta::type_printer x; + + /* If we only have GPU */ + auto gemm_tt = ttg::make_device_tt(f_gpu_host_views, f_gpu_kernel, f_gpu_output_flows, ttg::ExecutionSpace::CUDA, + ttg::edges(A, B, C), ttg::edges(output_result, C), + "GEMM", {"A", "B", "C"}, {"output_result", "C"}); + +#if 0 + /* Alternative: to get both type of tasklets: */ + auto gemm_tt = ttg::make_device_tt(f_cpu, f_gpu_host_views, f_gpu_kernel, f_gpu_output_flows, ttg::ExecutionSpace::CUDA, + ttg::edges(A, B), ttg::edges(output_result, C), + "GEMM", {"A", "B"}, {"output_result", "C"}); +#endif + return gemm_tt; +} + +int main(int argc, char **argv) +{ + + std::chrono::time_point beg, end; + int N = 1024; + int M = N; + int NB = 128; + int check = 0; + int nthreads = -1; + const char* prof_filename = nullptr; + + if (argc > 1) { + N = M = atoi(argv[1]); + } + + if (argc > 2) { + NB = atoi(argv[2]); + } + + if (argc > 3) { + check = atoi(argv[3]); + } + + if (argc > 4) { + nthreads = atoi(argv[4]); + } + + ttg::initialize(argc, argv, nthreads); + + auto world = ttg::default_execution_context(); + + ttg::Edge> edge_a, edge_b; + ttg::Edge> edge_out; + + auto gemm_tt = make_gemm(edge_a, edge_b, edge_out); + + ttg::fence(); + + ttg::finalize(); + return 0; +} diff --git a/examples/madness/mrattg.cc b/examples/madness/mrattg.cc index 47bce1ce94..2b2d6c113b 100644 --- a/examples/madness/mrattg.cc +++ b/examples/madness/mrattg.cc 
@@ -124,6 +124,46 @@ auto make_project(functorT& f, return ttg::make_tt(F, edges(fuse(refine, ctl)), edges(refine, result), name, {"control"}, {"refine", "result"}); } + +/// Returns an std::unique_ptr to the object +template +auto make_project_device(functorT& f, + const T thresh, /// should be scalar value not complex + ctlEdge& ctl, rnodeEdge& result, const std::string& name = "project") { + auto F = [f, thresh](const Key& key, std::tuple, rnodeOut>& out) { + FunctionReconstructedNode node(key); // Our eventual result + auto& coeffs = node.coeffs; // Need to clean up OO design + bool is_leaf; + + if (key.level() < initial_level(f)) { + for (auto child : children(key)) ttg::sendk<0>(child, out); + coeffs = T(1e7); // set to obviously bad value to detect incorrect use + is_leaf = false; + } else if (is_negligible(f, Domain::template bounding_box(key), + truncate_tol(key, thresh))) { + coeffs = T(0.0); + is_leaf = true; + } else { + auto node_view = ttg::make_view(node, ttg::ViewScope::Out); // no need to move node onto the device + auto is_leaf_view = ttg::make_view(is_leaf, ttg::ViewScope::Out); + co_await ttg::device::wait_views{}; + fcoeffs(f, key, thresh, + node_view.get_device_ptr<0>(), + is_leaf_view.get_device_ptr<0>()); // cannot deduce K + co_await ttg::device::wait_kernel{}; + if (!is_leaf) { + for (auto child : children(key)) ttg::sendk<0>(child, out); // should be broadcast ? + } + } + node.is_leaf = is_leaf; + ttg::send<1>(key, node, out); // always produce a result + }; + ctlEdge refine("refine"); + return ttg::make_tt(F, edges(fuse(refine, ctl)), edges(refine, result), name, {"control"}, {"refine", "result"}); +} + + + namespace detail { template struct tree_types {}; diff --git a/examples/potrf/pmw.h b/examples/potrf/pmw.h index 14fc6ce5b1..076f1fffca 100644 --- a/examples/potrf/pmw.h +++ b/examples/potrf/pmw.h @@ -23,13 +23,13 @@ struct type2matrixtype template<> struct type2matrixtype { - static constexpr const matrix_type value = matrix_type::matrix_RealFloat; + static constexpr const parsec_matrix_type_t value = parsec_matrix_type_t::PARSEC_MATRIX_FLOAT; }; template<> struct type2matrixtype { - static constexpr const matrix_type value = matrix_type::matrix_RealDouble; + static constexpr const parsec_matrix_type_t value = parsec_matrix_type_t::PARSEC_MATRIX_DOUBLE; }; template @@ -97,8 +97,8 @@ class PaRSECMatrixWrapper { } bool in_matrix(int row, int col) const { - return (pm->uplo == matrix_Lower && col <= row) || - (pm->uplo == matrix_Upper && col >= row); + return (pm->uplo == PARSEC_MATRIX_LOWER && col <= row) || + (pm->uplo == PARSEC_MATRIX_UPPER && col >= row); } PaRSECMatrixT* parsec() { diff --git a/examples/potrf/testing_dlauum.cc b/examples/potrf/testing_dlauum.cc index 0cd7c863bf..7176000943 100644 --- a/examples/potrf/testing_dlauum.cc +++ b/examples/potrf/testing_dlauum.cc @@ -6,6 +6,7 @@ #endif // TTG_USE_PARSEC #include +#include #include "lauum.h" #include "plgsy.h" @@ -71,10 +72,10 @@ int main(int argc, char **argv) std::cout << "Creating 2D block cyclic matrix with NB " << NB << " N " << N << " M " << M << " P " << P << std::endl; - sym_two_dim_block_cyclic_t dcA; - sym_two_dim_block_cyclic_init(&dcA, matrix_type::matrix_RealDouble, - world.size(), world.rank(), NB, NB, N, M, - 0, 0, N, M, P, matrix_Lower); + parsec_matrix_sym_block_cyclic_t dcA; + parsec_matrix_sym_block_cyclic_init(&dcA, parsec_matrix_type_t::PARSEC_MATRIX_DOUBLE, + world.rank(), NB, NB, N, M, + 0, 0, N, M, P, Q, PARSEC_MATRIX_LOWER); dcA.mat = 
parsec_data_allocate((size_t)dcA.super.nb_local_tiles * (size_t)dcA.super.bsiz * (size_t)parsec_datadist_getsizeoftype(dcA.super.mtype)); @@ -134,7 +135,7 @@ int main(int argc, char **argv) //delete A; /* cleanup allocated matrix before shutting down PaRSEC */ parsec_data_free(dcA.mat); dcA.mat = NULL; - parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcA); + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA); world.dag_off(); world.profile_off(); @@ -142,3 +143,35 @@ int main(int argc, char **argv) ttg::finalize(); return ret; } + +static void +dplasma_dprint_tile( int m, int n, + const parsec_tiled_matrix_t* descA, + const double *M ) +{ + int tempmm = ( m == descA->mt-1 ) ? descA->m - m*descA->mb : descA->mb; + int tempnn = ( n == descA->nt-1 ) ? descA->n - n*descA->nb : descA->nb; + int ldam = BLKLDD( descA, m ); + + int ii, jj; + + fflush(stdout); + for(ii=0; ii +#include +#include #include "plgsy.h" #include "pmw.h" @@ -115,10 +117,15 @@ int main(int argc, char **argv) std::cout << "Creating 2D block cyclic matrix with NB " << NB << " N " << N << " M " << M << " P " << P << std::endl; } - sym_two_dim_block_cyclic_t dcA; - sym_two_dim_block_cyclic_init(&dcA, matrix_type::matrix_RealDouble, - world.size(), world.rank(), NB, NB, N, M, - 0, 0, N, M, P, matrix_Lower); + parsec_matrix_sym_block_cyclic_t dcA; + parsec_matrix_sym_block_cyclic_init(&dcA, parsec_matrix_type_t::PARSEC_MATRIX_DOUBLE, + world.rank(), + NB, NB, + N, M, + 0, 0, + N, M, + P, Q, + PARSEC_MATRIX_LOWER); dcA.mat = parsec_data_allocate((size_t)dcA.super.nb_local_tiles * (size_t)dcA.super.bsiz * (size_t)parsec_datadist_getsizeoftype(dcA.super.mtype)); @@ -387,7 +394,7 @@ int main(int argc, char **argv) //delete A; /* cleanup allocated matrix before shutting down PaRSEC */ parsec_data_free(dcA.mat); dcA.mat = NULL; - parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcA); + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA); ttg::finalize(); return ret; diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index 365a10ddce..d5686042fc 100644 --- a/examples/potrf/testing_dpotrf.cc +++ b/examples/potrf/testing_dpotrf.cc @@ -1,4 +1,5 @@ #include +#include #include "plgsy.h" #include "pmw.h" @@ -75,10 +76,10 @@ int main(int argc, char **argv) std::cout << "Creating 2D block cyclic matrix with NB " << NB << " N " << N << " M " << M << " P " << P << std::endl; - sym_two_dim_block_cyclic_t dcA; - sym_two_dim_block_cyclic_init(&dcA, matrix_type::matrix_RealDouble, - world.size(), world.rank(), NB, NB, N, M, - 0, 0, N, M, P, matrix_Lower); + parsec_matrix_sym_block_cyclic_t dcA; + parsec_matrix_sym_block_cyclic_init(&dcA, parsec_matrix_type_t::PARSEC_MATRIX_DOUBLE, + world.rank(), NB, NB, N, M, + 0, 0, N, M, P, Q, PARSEC_MATRIX_LOWER); dcA.mat = parsec_data_allocate((size_t)dcA.super.nb_local_tiles * (size_t)dcA.super.bsiz * (size_t)parsec_datadist_getsizeoftype(dcA.super.mtype)); @@ -202,7 +203,7 @@ int main(int argc, char **argv) /* cleanup allocated matrix before shutting down PaRSEC */ parsec_data_free(dcA.mat); dcA.mat = NULL; - parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcA); + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA); world.profile_off(); @@ -210,6 +211,38 @@ int main(int argc, char **argv) return ret; } +static void +dplasma_dprint_tile( int m, int n, + const parsec_tiled_matrix_t* descA, + const double *M ) +{ + int tempmm = ( m == descA->mt-1 ) ? descA->m - m*descA->mb : descA->mb; + int tempnn = ( n == descA->nt-1 ) ? 
descA->n - n*descA->nb : descA->nb; + int ldam = BLKLDD( descA, m ); + + int ii, jj; + + fflush(stdout); + for(ii=0; ii +#include #include "plgsy.h" #include "pmw.h" @@ -92,10 +93,10 @@ int main(int argc, char **argv) std::cout << "Creating 2D block cyclic matrix with NB " << NB << " N " << N << " M " << M << " P " << P << std::endl; - sym_two_dim_block_cyclic_t dcA; - sym_two_dim_block_cyclic_init(&dcA, matrix_type::matrix_RealDouble, - world.size(), world.rank(), NB, NB, N, M, - 0, 0, N, M, P, uplo == lapack::Uplo::Lower ? matrix_Lower : matrix_Upper); + parsec_matrix_sym_block_cyclic_t dcA; + parsec_matrix_sym_block_cyclic_init(&dcA, parsec_matrix_type_t::PARSEC_MATRIX_DOUBLE, + world.rank(), NB, NB, N, M, + 0, 0, N, M, P, Q, uplo == lapack::Uplo::Lower ? PARSEC_MATRIX_LOWER : PARSEC_MATRIX_UPPER); dcA.mat = parsec_data_allocate((size_t)dcA.super.nb_local_tiles * (size_t)dcA.super.bsiz * (size_t)parsec_datadist_getsizeoftype(dcA.super.mtype)); @@ -222,7 +223,7 @@ int main(int argc, char **argv) } parsec_data_free(dcA.mat); dcA.mat = NULL; - parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcA); + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA); world.dag_off(); world.profile_off(); @@ -231,6 +232,38 @@ int main(int argc, char **argv) return ret; } +static void +dplasma_dprint_tile( int m, int n, + const parsec_tiled_matrix_t* descA, + const double *M ) +{ + int tempmm = ( m == descA->mt-1 ) ? descA->m - m*descA->mb : descA->mb; + int tempnn = ( n == descA->nt-1 ) ? descA->n - n*descA->nb : descA->nb; + int ldam = BLKLDD( descA, m ); + + int ii, jj; + + fflush(stdout); + for(ii=0; ii>>(buffer, scratch); + +} + +#endif // TTG_HAVE_CUDA \ No newline at end of file diff --git a/tests/unit/cuda_kernel.h b/tests/unit/cuda_kernel.h new file mode 100644 index 0000000000..4fec87a999 --- /dev/null +++ b/tests/unit/cuda_kernel.h @@ -0,0 +1,4 @@ +#include "ttg/config.h" +#include + +void increment_buffer(double* buffer, std::size_t buffer_size, double* scratch, std::size_t scratch_size); \ No newline at end of file diff --git a/tests/unit/device_coro.cc b/tests/unit/device_coro.cc new file mode 100644 index 0000000000..5b51b0724d --- /dev/null +++ b/tests/unit/device_coro.cc @@ -0,0 +1,220 @@ +#include + +#include "ttg.h" +#include "ttg/view.h" + +#include "cuda_kernel.h" + +struct value_t { + ttg::buffer db; // TODO: rename + int quark; + + template + void ttg_serialize(Archive& ar) { + ar& quark; + ar& db; // input: + } +}; + +/* devicebuf is non-POD so provide serialization + * information for members not a devicebuf */ +namespace madness::archive { + template + struct ArchiveSerializeImpl { + static inline void serialize(const Archive& ar, value_t& obj) { ar& obj.quark & obj.db; }; + }; +} // namespace madness::archive + + +TEST_CASE("Device", "coro") { + + SECTION("devicebuf") { + + ttg::Edge edge; + auto fn = [&](const int& key, value_t&& val) -> ttg::device_task { + ttg::print("device_task key ", key); + /* wait for the view to be available on the device */ + co_await ttg::to_device(val.db); + /* once we're back here the data has been transferred */ + CHECK(val.db.current_device_ptr() != nullptr); + + /* NO KERNEL */ + + /* here we suspend to wait for a kernel to complete */ + co_await ttg::wait_kernel(); + + /* we're back, the kernel executed and we can send */ + if (key < 10) { + /* TODO: should we move the view in here if we want to get the device side data */ + //ttg::send<0>(key+1, std::move(val)); + co_await ttg::device::forward(ttg::device::send<0>(key+1, 
std::move(val))); + } + }; + + //ptr.get_view(device_id); + + auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge), + "device_task", {"edge_in"}, {"edge_out"}); + make_graph_executable(tt); + if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{}); + ttg::ttg_fence(ttg::default_execution_context()); + } + + SECTION("scratch") { + + ttg::Edge edge; + auto fn = [&](const int& key, value_t&& val) -> ttg::device_task { + double scratch = 0.0; + ttg::devicescratch ds = ttg::make_scratch(&scratch, ttg::scope::Allocate); + + /* wait for the view to be available on the device */ + co_await ttg::to_device(ds, val.db); + /* once we're back here the data has been transferred */ + CHECK(ds.device_ptr() != nullptr); + + /* call a kernel */ +#ifdef TTG_HAVE_CUDA + increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size()); +#endif // TTG_HAVE_CUDA + + /* here we suspend to wait for a kernel to complete */ + co_await ttg::wait_kernel(ds); + +#ifdef TTG_HAVE_CUDA + /* buffer is increment once per task, so it should be the same as key */ + CHECK((static_cast(scratch)-1) == key); +#endif // 0 + + /* we're back, the kernel executed and we can send */ + if (key < 10) { + /* TODO: should we move the view in here if we want to get the device side data */ + //ttg::send<0>(key+1, std::move(val)); + /* NOTE: we use co_await here instead of co_return because co_return destroys all local variables first; + * we will not return from this co_await!*/ + co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(val))); + } + }; + + auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge), + "device_task", {"edge_in"}, {"edge_out"}); + make_graph_executable(tt); + if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{}); + ttg::ttg_fence(ttg::default_execution_context()); + } + + SECTION("ptr") { + + ttg::Edge edge; + ttg::Ptr ptr; + int last_key = 0; + auto fn = [&](const int& key, value_t&& val) -> ttg::device_task { + double scratch = 1.0; + ttg::devicescratch ds = ttg::make_scratch(&scratch, ttg::scope::SyncIn); + + /* wait for the view to be available on the device */ + co_await ttg::to_device(ds, val.db); + /* once we're back here the data has been transferred */ + CHECK(ds.device_ptr() != nullptr); + + /* KERNEL */ +#ifdef TTG_HAVE_CUDA + increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size()); +#endif // TTG_HAVE_CUDA + + /* here we suspend to wait for a kernel and the out-transfer to complete */ + co_await ttg::wait_kernel(val.db, ds); + +#ifdef TTG_HAVE_CUDA + /* buffer is increment once per task, so it should be the same as key */ + CHECK(static_cast(scratch) == key+1); + CHECK(static_cast(*val.db.host_ptr()) == key+1); +#endif // TTG_HAVE_CUDA + + /* we're back, the kernel executed and we can send */ + if (key < 10 || scratch < 0.0) { + //ttg::send<0>(key+1, std::move(val)); + co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(val))); + } else { + /* exfiltrate the value */ + /* TODO: what consistency do we expect from get_ptr? 
*/ + ptr = ttg::get_ptr(val); + last_key = key; + } + }; + + //ptr.get_view(device_id); + + auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge), + "device_task", {"edge_in"}, {"edge_out"}); + make_graph_executable(tt); + if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{}); + ttg::ttg_fence(ttg::default_execution_context()); + CHECK(ptr.is_valid()); + + /* feed the ptr back into a graph */ + if (ttg::default_execution_context().rank() == 0) tt->invoke(last_key+1, ptr); + ttg::ttg_fence(ttg::default_execution_context()); + + ptr.reset(); + } + +#if 0 + /* TODO: enabel this test once we control the PaRSEC state machine! */ + SECTION("device_host_tasks") { + + ttg::Edge h2d, d2h; + + auto host_fn = [&](const int& key, value_t&& val) { + /* check that the data has been synced back */ + CHECK(static_cast(*val.db.host_ptr()) == key); + + /* modify the data */ + *val.db.host_ptr() += 1.0; + CHECK(static_cast(*val.db.host_ptr()) == key+1); + + /* send back to the device */ + ttg::send<0>(key+1, std::move(val)); + }; + auto htt = ttg::make_tt(host_fn, ttg::edges(d2h), ttg::edges(h2d), + "host_task", {"d2h"}, {"h2d"}); + + auto device_fn = [&](const int& key, value_t&& val) -> ttg::device_task { + double scratch = 0.0; + ttg::devicescratch ds = ttg::make_scratch(&scratch, ttg::scope::SyncOut); + + /* wait for the view to be available on the device */ + co_await ttg::to_device(ds, val.db); + /* once we're back here the data has been transferred */ + CHECK(ds.device_ptr() != nullptr); + + /* call a kernel */ +#ifdef TTG_HAVE_CUDA + increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size()); +#endif // TTG_HAVE_CUDA + + /* here we suspend to wait for a kernel to complete */ + co_await ttg::wait_kernel(); + +#ifdef TTG_HAVE_CUDA + /* buffer is increment once per task, so it should be the same as key */ + CHECK((static_cast(scratch)-1) == key); +#endif // 0 + + /* we're back, the kernel executed and we can send */ + if (key < 10) { + /* TODO: should we move the view in here if we want to get the device side data */ + ttg::send<0>(key+1, std::move(val)); + } + }; + + auto dtt = ttg::make_tt(device_fn, ttg::edges(h2d), ttg::edges(d2h), + "device_task", {"h2d"}, {"d2h"}); + make_graph_executable(dtt); + if (ttg::default_execution_context().rank() == 0) htt->invoke(0, value_t{}); + ttg::ttg_fence(ttg::default_execution_context()); + } + + +#endif // 0 + +} \ No newline at end of file diff --git a/tests/unit/fibonacci-coro.cc b/tests/unit/fibonacci-coro.cc new file mode 100644 index 0000000000..e8ed5d1ba2 --- /dev/null +++ b/tests/unit/fibonacci-coro.cc @@ -0,0 +1,122 @@ +#include + +#include "ttg.h" + +#include "ttg/serialization/std/pair.h" +#include "ttg/util/hash/std/pair.h" + +#include "ttg/util/coroutine.h" + +constexpr int64_t N = 1000; + +TEST_CASE("Fibonacci-coroutines", "[fib][core]") { + // compute the reference result + int reference_result = 0; + { + // recursive lambda pattern from http://pedromelendez.com/blog/2015/07/16/recursive-lambdas-in-c14/ + auto compute_reference_result = [&reference_result](int f_np1, int f_n) { + auto impl = [&reference_result](int f_np1, int f_n, const auto &impl_ref) -> void { + assert(f_n < N); + reference_result += f_n; + if (f_np1 < N) { + const auto f_np2 = f_np1 + f_n; + impl_ref(f_np2, f_np1, impl_ref); + } + }; + impl(f_np1, f_n, impl); + }; + compute_reference_result(1, 0); + } + + SECTION("shared-memory") { + if (ttg::default_execution_context().size() == 1) { + ttg::Edge F2F; + ttg::Edge F2P; + 
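+      // F2F recycles each new Fibonacci value back into this TT; F2P streams
+      // values to the printer TT, whose input reducer accumulates their sum.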
+ // N.B. wrap a trivial (nonsuspending) coroutine using make_tt! + auto fib_op = ttg::make_tt( + // computes next value: F_{n+2} = F_{n+1} + F_{n}, seeded by F_1 = 1, F_0 = 0 + // N.B. can't autodeduce return type, must explicitly declare the return type + [](const int &F_n_plus_1, const int &F_n) -> ttg::resumable_task { + // on 1 process the right order of sends can avoid the race iff reductions are inline (on-current-thread) + // and not async (nthread>1): + // - send<1> will call wc->set_arg which will eagerly reduce the argument + // - send<0> then will call wa->set_arg which will create task for key F_np2 ... that can potentially call + // finalize<1> in the other clause + // - reversing the order of sends will create a race between wc->set_arg->send<1> executing on this thread + // and wa->set_arg->finalize<1> executing in thread pool + // - there is no way to detect the "undesired" outcome of the race without keeping expired TTArgs from the + // cache there is no way currently to avoid race if there is more than 1 process ... need to track the + // number of messages that the reducing terminal will receive, that's what distributed example demonstrates. + // The order of operations will still matter. + if (F_n_plus_1 < N) { + const auto F_n_plus_2 = F_n_plus_1 + F_n; + // cool, if there are no events to wait for co_await is no-op + co_await ttg::resumable_task_events{}; + ttg::sendv<1>(F_n_plus_1); + ttg::send<0>(F_n_plus_2, F_n_plus_1); + } else + ttg::finalize<1>(); + + // to test coro-based task lifecycle introduce fake events + ttg::event null_event; + co_await ttg::resumable_task_events{null_event}; + + // N.B. return void just as normal TT op + co_return; + }, + ttg::edges(F2F), ttg::edges(F2F, F2P)); + auto print_op = ttg::make_tt( + [reference_result](const int &value, std::tuple<> &out) { + ttg::print("sum of Fibonacci numbers up to ", N, " = ", value); + CHECK(value == reference_result); + }, + ttg::edges(F2P), ttg::edges()); + print_op->set_input_reducer<0>([](int &a, const int &b) { a = a + b; }); + make_graph_executable(fib_op); + if (ttg::default_execution_context().rank() == 0) fib_op->invoke(1, 0); + ttg::ttg_fence(ttg::default_execution_context()); + } + } + + // in distributed memory we must count how many messages the reducer will receive + SECTION("distributed-memory") { + ttg::Edge> F2F; + ttg::Edge F2P; + const auto nranks = ttg::default_execution_context().size(); + + auto fib_op = ttg::make_tt( + // computes next value: F_{n+2} = F_{n+1} + F_{n}, seeded by F_1 = 1, F_0 = 0 + [](const int &n, const std::pair &F_np1_n) { + const auto &[F_n_plus_1, F_n] = F_np1_n; + if (F_n_plus_1 < N) { + const auto F_n_plus_2 = F_n_plus_1 + F_n; + ttg::print("sent ", F_n_plus_1, " to fib reducer"); + ttg::sendv<1>(F_n_plus_1); + ttg::send<0>(n + 1, std::make_pair(F_n_plus_2, F_n_plus_1)); + } else { + // how many messages the reducer should expect to receive + ttg::set_size<1>(n); + ttg::print("fib reducer will expect ", n, " messages"); + } + }, + ttg::edges(F2F), ttg::edges(F2F, F2P)); + auto print_op = ttg::make_tt( + [reference_result](const int &value, std::tuple<> &out) { + ttg::print("sum of Fibonacci numbers up to ", N, " = ", value); + CHECK(value == reference_result); + }, + ttg::edges(F2P), ttg::edges()); + // move all fib tasks to last rank, all reductions will happen on 0 => for some reason no reductions occur! 
+ fib_op->set_keymap([=](const auto &key) { return nranks - 1; }); + fib_op->set_trace_instance(true); + print_op->set_input_reducer<0>([](int &a, const int &b) { + ttg::print("fib reducer: current value = ", a, ", incremented by ", b, " set to ", a + b); + a = a + b; + }); + make_graph_executable(fib_op); + ttg::ttg_fence(ttg::default_execution_context()); + if (ttg::default_execution_context().rank() == 0) fib_op->invoke(0, std::make_pair(1, 0)); + ttg::ttg_fence(ttg::default_execution_context()); + } +} // TEST_CAST("Fibonacci") diff --git a/tests/unit/tt.cc b/tests/unit/tt.cc index f77a483ef9..def14684a5 100644 --- a/tests/unit/tt.cc +++ b/tests/unit/tt.cc @@ -146,7 +146,7 @@ namespace tt_i_iv { template void func0(K &key, D1 &datum1, D2 &&datum2) { - abort(); + ttg::abort(); } } // namespace tt_i_iv @@ -298,6 +298,9 @@ TEST_CASE("TemplateTask", "[core]") { // OK: all of {auto&&, auto&, const auto&} bind to const T& static_assert(std::is_invocable &>::value); + static_assert(std::is_same_v &>, + void>); // OK: ditto static_assert(std::is_void_v(), std::declval(), std::declval()))>); @@ -323,14 +326,15 @@ TEST_CASE("TemplateTask", "[core]") { ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist &>>{})), - ttg::typelist<>>); + ttg::typelist, ttg::typelist<>>>); static_assert( std::is_same_v< decltype(compute_arg_binding_types( func0, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist &>>{})), - ttg::typelist &>>); + ttg::typelist, + ttg::typelist &>>>); // voids are skipped static_assert( std::is_same_v< @@ -339,7 +343,8 @@ TEST_CASE("TemplateTask", "[core]") { ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist &>, ttg::typelist>{})), - ttg::typelist &, void>>); + ttg::typelist, ttg::typelist &, void>>>); // test introspection of generic arguments by the runtime (i.e. 
contents of TT::input_args_type) and // the deduced types inside the function body @@ -425,10 +430,10 @@ TEST_CASE("TemplateTask", "[core]") { static_assert(ttg::meta::is_generic_callable_v); auto [f_is_generic, f_args_t_v] = ttg::meta::callable_args; CHECK(!f_is_generic); - static_assert(std::is_same_v>); + static_assert(std::is_same_v, ttg::typelist>>); auto [g_is_generic, g_args_t_v] = ttg::meta::callable_args; CHECK(g_is_generic); - static_assert(std::is_same_v>); + static_assert(std::is_same_v, ttg::typelist<>>>); { static_assert(!ttg::meta::is_generic_callable_v); diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 8d2213e75c..5830147dd0 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -17,6 +17,7 @@ set(ttg-util-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/future.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/hash.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/hash/std/pair.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/iovec.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/macro.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/meta.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/meta/callable.h @@ -37,14 +38,22 @@ set(ttg-base-headers file(GLOB_RECURSE ttg-external-headers $<$:CONFIGURE_DEPENDS> ${CMAKE_CURRENT_SOURCE_DIR}/ttg/external/boost/* ) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/config.in.h + ${CMAKE_CURRENT_BINARY_DIR}/ttg/config.h +) set(ttg-impl-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/broadcast.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/buffer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescope.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescratch.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/edge.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/execution.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/func.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/fwd.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/impl_selector.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/tt.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/ptr.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/reduce.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/run.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/runtimes.h @@ -53,6 +62,9 @@ set(ttg-impl-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/traverse.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/world.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/make_tt.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/make_device_tt.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/view.h + ${CMAKE_CURRENT_BINARY_DIR}/ttg/config.h ) set(ttg-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg.h @@ -68,6 +80,7 @@ set(ttg-sources ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/env.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/version.cc ) + # extract git metadata include(GetGitMetadata) vgkit_cmake_git_metadata() @@ -75,16 +88,13 @@ vgkit_cmake_git_metadata() set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/version.cc PROPERTIES COMPILE_DEFINITIONS "TTG_GIT_REVISION=\"${TTG_GIT_REVISION}\";TTG_GIT_DESCRIPTION=\"${TTG_GIT_DESCRIPTION}\"") -set(ttg-public-headers ${ttg-headers};${ttg-impl-headers};${ttg-base-headers};${ttg-util-headers}) -if (NOT TTG_IGNORE_BUNDLED_EXTERNALS) - list(APPEND ttg-sources ${ttg-external-headers}) - list(APPEND ttg-public-headers ${ttg-external-headers}) -endif() + # optional dependencies if (TARGET Boost::boost) list(APPEND ttg-deps Boost::boost) else () # if Boost::boost is missing must use bundled Boost.CallableTraits list(APPEND ttg-defs "$") + list(APPEND ttg-incs ttg/external/) endif () if (TARGET TTG_Libunwind) list(APPEND ttg-deps TTG_Libunwind) @@ -92,6 +102,23 @@ endif(TARGET TTG_Libunwind) if (TTG_ENABLE_TRACE) list(APPEND ttg-defs "TTG_ENABLE_TRACE=1") endif (TTG_ENABLE_TRACE) +if (TARGET std::coroutine) + list(APPEND ttg-deps std::coroutine) + list(APPEND ttg-defs 
"TTG_HAS_COROUTINE=1") + list(APPEND ttg-util-headers + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/coroutine.h + ) +endif () +if (TARGET CUDA::cudart) + list(APPEND ttg-deps CUDA::cudart) + list(APPEND ttg-defs "TTG_HAVE_CUDART=1") +endif (TARGET CUDA::cudart) + +set(ttg-public-headers ${ttg-headers};${ttg-impl-headers};${ttg-base-headers};${ttg-util-headers}) +if (NOT TTG_IGNORE_BUNDLED_EXTERNALS) + list(APPEND ttg-sources ${ttg-external-headers}) + list(APPEND ttg-public-headers ${ttg-external-headers}) +endif() add_ttg_library(ttg "${ttg-sources}" PUBLIC_HEADER "${ttg-public-headers}" LINK_LIBRARIES "${ttg-deps}" INCLUDE_DIRECTORIES "${ttg-incs}" COMPILE_DEFINITIONS "${ttg-defs}") @@ -184,8 +211,13 @@ endif(TARGET MADworld) ######################## if (TARGET PaRSEC::parsec) set(ttg-parsec-headers + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/buffer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/devicescratch.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/fwd.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/import.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ptr.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/task.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/thread_local.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ttg.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ttg_data_copy.h ) diff --git a/ttg/ttg.h b/ttg/ttg.h index e0fa9a7029..09ccfaf902 100644 --- a/ttg/ttg.h +++ b/ttg/ttg.h @@ -27,6 +27,11 @@ #include "ttg/edge.h" +#include "ttg/ptr.h" +#include "ttg/buffer.h" +#include "ttg/devicescratch.h" +#include "ttg/devicescope.h" + #if defined(TTG_USE_PARSEC) #include "ttg/parsec/ttg.h" #elif defined(TTG_USE_MADNESS) diff --git a/ttg/ttg/base/tt.h b/ttg/ttg/base/tt.h index e59db15e67..4fa975b40e 100644 --- a/ttg/ttg/base/tt.h +++ b/ttg/ttg/base/tt.h @@ -144,11 +144,11 @@ namespace ttg { virtual ~TTBase() = default; /// Use this to create a task that takes no data "manually" - /// @warning calls std::abort() if the derived class TT did not override this; + /// @warning calls ttg::abort() if the derived class TT did not override this; /// only makes sense to override this if the derived TT uses void for key or data virtual void invoke() { std::cerr << "TTBase::invoke() invoked on a TT that did not override it" << std::endl; - abort(); + ttg::abort(); } /// Sets trace for all operations to value and returns previous setting. diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h new file mode 100644 index 0000000000..1868b7e0c8 --- /dev/null +++ b/ttg/ttg/buffer.h @@ -0,0 +1,31 @@ +#ifndef TTG_BUFFER_H +#define TTG_BUFFER_H + +#include +#include "ttg/impl_selector.h" + + +namespace ttg { + +template +using buffer = TTG_IMPL_NS::buffer; + +namespace detail { + template + struct is_buffer : std::false_type + { }; + + template + struct is_buffer> : std::true_type + { }; + + template + constexpr bool is_buffer_v = is_buffer::value; + + static_assert(is_buffer_v>); + static_assert(is_buffer_v>); +} // namespace detail + +} // namespace ttg + +#endif // TTG_buffer_H \ No newline at end of file diff --git a/ttg/ttg/config.in.h b/ttg/ttg/config.in.h new file mode 100644 index 0000000000..8f208c6115 --- /dev/null +++ b/ttg/ttg/config.in.h @@ -0,0 +1,16 @@ +// +// Created by Eduard Valeyev on 10/31/22. 
+// + +#ifndef TTG_CONFIG_IN_H +#define TTG_CONFIG_IN_H + +/** the C++ header containing the coroutine API */ +#define TTG_CXX_COROUTINE_HEADER <@CXX_COROUTINE_HEADER@> + +/** the C++ namespace containing the coroutine API */ +#define TTG_CXX_COROUTINE_NAMESPACE @CXX_COROUTINE_NAMESPACE@ + +#cmakedefine TTG_HAVE_CUDA + +#endif // TTG_CONFIG_IN_H diff --git a/ttg/ttg/device.h b/ttg/ttg/device.h new file mode 100644 index 0000000000..d1338bd222 --- /dev/null +++ b/ttg/ttg/device.h @@ -0,0 +1,14 @@ +#ifndef TTG_DEVICE_H +#define TTG_DEVICE_H + +#include "ttg/fwd.h" +#include "ttg/execution.h" + +namespace ttg { + namespace device { + using DeviceAllocator = TTG_IMPL_NS::device::DeviceAllocator; + std::size_t nb_devices() { return TTG_IMPL_NS::device::nb_devices(); } + } +} + +#endif /* TTG_DEVICE_H */ \ No newline at end of file diff --git a/ttg/ttg/device/send.h b/ttg/ttg/device/send.h new file mode 100644 index 0000000000..d125c87f2c --- /dev/null +++ b/ttg/ttg/device/send.h @@ -0,0 +1,26 @@ +#ifndef TTG_DEVICE_SEND_H +#define TTG_DEVICE_SEND_H + + +namespace ttg { + + namespace detail { + + /* structure holding references to the key and value, awaitable */ + template + struct await_send { + const Key& m_key; + Value& m_value; + + await_send(const Key& key, Value& val) + : m_key(key) + , m_value(val) + { } + }; + + } // namespace ttg + +} // namespace ttg + + +#endif // TTG_DEVICE_SEND_H \ No newline at end of file diff --git a/ttg/ttg/devicescope.h b/ttg/ttg/devicescope.h new file mode 100644 index 0000000000..594e6db0b3 --- /dev/null +++ b/ttg/ttg/devicescope.h @@ -0,0 +1,11 @@ +#ifndef TTG_DEVICESCOPE_H +#define TTG_DEVICESCOPE_H + +namespace ttg { + enum class scope { + Allocate = 0x0, //< memory allocated as scratch, but not moved in or out + SyncIn = 0x2, //< memory allocated as scratch and data transferred to device + }; +} // namespace ttg + +#endif // TTG_DEVICESCOPE_H \ No newline at end of file diff --git a/ttg/ttg/devicescratch.h b/ttg/ttg/devicescratch.h new file mode 100644 index 0000000000..37510436fd --- /dev/null +++ b/ttg/ttg/devicescratch.h @@ -0,0 +1,34 @@ +#ifndef TTG_DEVICESCRATCH_H +#define TTG_DEVICESCRATCH_H + +#include "ttg/devicescope.h" +#include "ttg/impl_selector.h" + +namespace ttg { + +template +using devicescratch = TTG_IMPL_NS::devicescratch; + +template +auto make_scratch(T* val, ttg::scope scope, std::size_t count = 1) { + return devicescratch(val, scope, 1); +} + +namespace detail { + + template + struct is_devicescratch : std::false_type + { }; + + template + struct is_devicescratch> : std::true_type + { }; + + template + constexpr bool is_devicescratch_v = is_devicescratch::value; + +} // namespace detail + +} // namespace ttg + +#endif // TTG_DEVICESCRATCH_H \ No newline at end of file diff --git a/ttg/ttg/func.h b/ttg/ttg/func.h index 61615e6314..f647734dd1 100644 --- a/ttg/ttg/func.h +++ b/ttg/ttg/func.h @@ -515,7 +515,7 @@ namespace ttg { template inline void set_size(const std::size_t size) { - set_size(size); + set_size(i, size); } /// \brief Finalize streaming input terminals connecting to the given output terminal for tasks @@ -570,7 +570,7 @@ namespace ttg { template inline void finalize() { - finalize(); + finalize(i); } } // namespace ttg diff --git a/ttg/ttg/fwd.h b/ttg/ttg/fwd.h index df32505d04..f9b8d1c0f8 100644 --- a/ttg/ttg/fwd.h +++ b/ttg/ttg/fwd.h @@ -47,6 +47,7 @@ namespace ttg { template void initialize(int argc, char **argv, int num_threads = -1, RestOfArgs &&...); void finalize(); + [[noreturn]] void abort(); World 
default_execution_context(); void execute(ttg::World world); diff --git a/ttg/ttg/madness/fwd.h b/ttg/ttg/madness/fwd.h index abcb771c54..469a251d12 100644 --- a/ttg/ttg/madness/fwd.h +++ b/ttg/ttg/madness/fwd.h @@ -45,6 +45,11 @@ namespace ttg_madness { template inline void ttg_broadcast(ttg::World world, T &data, int source_rank); + namespace device { + class DeviceAllocator; + std::size_t nb_devices(); + } + } // namespace ttg_madness #endif // TTG_MADNESS_FWD_H diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h index 58373d5edd..6cd484d4a1 100644 --- a/ttg/ttg/madness/ttg.h +++ b/ttg/ttg/madness/ttg.h @@ -22,7 +22,11 @@ #include "ttg/util/meta.h" #include "ttg/util/meta/callable.h" #include "ttg/util/void.h" +#include "ttg/view.h" #include "ttg/world.h" +#ifdef TTG_HAS_COROUTINE +#include "ttg/util/coroutine.h" +#endif #include #include @@ -273,6 +277,9 @@ namespace ttg_madness { derivedT *derived; // Pointer to derived class instance bool pull_terminals_invoked = false; std::conditional_t, ttg::Void, keyT> key; // Task key +#ifdef TTG_HAS_COROUTINE + void *suspended_task_address = nullptr; // if not null the function is suspended +#endif /// makes a tuple of references out of tuple of template @@ -297,28 +304,83 @@ namespace ttg_madness { } virtual void run(::madness::World &world) override { - // ttg::print("starting task"); - using ttg::hash; ttT::threaddata.key_hash = hash{}(key); ttT::threaddata.call_depth++; - if constexpr (!ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { - derived->op(key, this->make_input_refs(), - derived->output_terminals); // !!! NOTE converting input values to refs - } else if constexpr (!ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { - derived->op(key, derived->output_terminals); - } else if constexpr (ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { - derived->op(this->make_input_refs(), - derived->output_terminals); // !!! NOTE converting input values to refs - } else if constexpr (ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { - derived->op(derived->output_terminals); - } else - abort(); + void *suspended_task_address = +#ifdef TTG_HAS_COROUTINE + this->suspended_task_address; // non-null = need to resume the task +#else + nullptr; +#endif + if (suspended_task_address == nullptr) { // task is a coroutine that has not started or an ordinary function + // ttg::print("starting task"); + if constexpr (!ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN( + suspended_task_address, + derived->op(key, this->make_input_refs(), + derived->output_terminals)); // !!! NOTE converting input values to refs + } else if constexpr (!ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN(suspended_task_address, derived->op(key, derived->output_terminals)); + } else if constexpr (ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN( + suspended_task_address, + derived->op(this->make_input_refs(), + derived->output_terminals)); // !!! 
NOTE converting input values to refs + } else if constexpr (ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN(suspended_task_address, derived->op(derived->output_terminals)); + } else // unreachable + ttg::abort(); + } else { // resume suspended coroutine +#ifdef TTG_HAS_COROUTINE + auto ret = static_cast(ttg::coroutine_handle<>::from_address(suspended_task_address)); + assert(ret.ready()); + ret.resume(); + if (ret.completed()) { + ret.destroy(); + suspended_task_address = nullptr; + } else { // not yet completed + // leave suspended_task_address as is + } + this->suspended_task_address = suspended_task_address; +#else + ttg::abort(); // should not happen +#endif + } ttT::threaddata.call_depth--; - // ttg::print("finishing task",ttT::threaddata.call_depth); + // if (suspended_task_address == nullptr) { + // ttg::print("finishing task",ttT::threaddata.call_depth); + // } + +#ifdef TTG_HAS_COROUTINE + if (suspended_task_address) { + // TODO implement handling of suspended coroutines properly + + // right now can events are not properly implemented, we are only testing the workflow with dummy events + // so mark the events finished manually, parsec will rerun this task again and it should complete the second + // time + auto events = + static_cast(ttg::coroutine_handle<>::from_address(suspended_task_address)).events(); + for (auto &event_ptr : events) { + event_ptr->finish(); + } + assert(ttg::coroutine_handle<>::from_address(suspended_task_address).promise().ready()); + + // resume the coroutine + auto ret = static_cast(ttg::coroutine_handle<>::from_address(suspended_task_address)); + assert(ret.ready()); + ret.resume(); + if (ret.completed()) { + ret.destroy(); + suspended_task_address = nullptr; + } else { // not yet completed + ttg::abort(); + } + } +#endif // TTG_HAS_COROUTINE } virtual ~TTArgs() {} // Will be deleted via TaskInterface* @@ -564,7 +626,7 @@ namespace ttg_madness { } else if constexpr (ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { static_cast(this)->op(output_terminals); // Runs immediately } else - abort(); + ttg::abort(); ttT::threaddata.call_depth--; } else { @@ -944,7 +1006,7 @@ namespace ttg_madness { auto finalize_callback = [this]() { finalize_argstream(); }; input.set_callback(send_callback, send_callback, {}, setsize_callback, finalize_callback); } else - abort(); + ttg::abort(); } template @@ -1070,7 +1132,7 @@ namespace ttg_madness { for (std::size_t i = 0; i < numins; i++) std::cerr << (item.second->nargs[i] == 0 ? 
"T" : "F") << " "; std::cerr << ")" << std::endl; } - abort(); + ttg::abort(); } } @@ -1215,6 +1277,28 @@ namespace ttg_madness { #include "ttg/make_tt.h" + namespace device { + class DeviceAllocator { + public: + DeviceAllocator(int did) { + if(did != 0) { + throw std::out_of_range("TTG MADNESS Backend: current implementation only supports CPU devices") + } + } + void *allocate(std::size_t size) { + return ::malloc(size); + }; + void free(void *ptr) { + ::free(ptr); + } + ::ttg::ExecutionSpace executionSpace() { + return ::ttg::ExecutionSpace::Host; + } + }; + + std::size_t nb_devices() { return 1; } + } + } // namespace ttg_madness #include "ttg/madness/watch.h" diff --git a/ttg/ttg/make_device_tt.h b/ttg/ttg/make_device_tt.h new file mode 100644 index 0000000000..09f096d838 --- /dev/null +++ b/ttg/ttg/make_device_tt.h @@ -0,0 +1,281 @@ +// clang-format off +#ifndef TTG_MAKE_DEVICE_TT_H +#define TTG_MAKE_DEVICE_TT_H + +// to be #include'd within runtime::ttg namespace + + +namespace detail { +#ifdef TTG_HAVE_CUDA + inline thread_local cudaStream_t* ts_stream = nullptr; +#endif // TTG_HAVE_CUDA + + template + inline void invoke_with_unpacked_views(FuncT&& func, const keyT& key, std::tuple& views, std::index_sequence) { +#ifdef TTG_HAVE_CUDA + func(key, std::get(views)..., ts_stream); +#else // TTG_HAVE_CUDA + func(key, std::get(views)...); +#endif // TTG_HAVE_CUDA + } + + /* TODO: extract host objects from views */ + template + struct host_obj_type; + + template + struct host_obj_type> { + using type = std::tuple; + }; + + template + using host_obj_type_t = typename host_obj_type::type; + + template + inline void invoke_out_with_unpacked_views(FuncT&& func, const keyT& key, std::tuple views, std::index_sequence) { + func(key, std::get(views).get_host_object()...); + } + + template + inline void create_view_on_device(const ttg::View& view, + std::tuple...>& dev_spans, + std::index_sequence) { + + /* fill in pointers for the device -- we're relying on managed memory for this simple wrapper */ + typename std::tuple_element_t::span_tuple_type>::element_type *ptr; + size_t size; + ptr = view.template get_device_ptr(); + size = view.template get_device_size(); + //cudaMalloc(&ptr, span.size_bytes()); + std::get(dev_spans) = ttg::ViewSpan(ptr, size, view.template get_scope()); + + /* copy data to device */ + //cudaMemcpy(ptr, span.data(), span.size_bytes(), cudaMemcpyHostToDevice); + if (view.template get_span().is_sync_in()) { +#if defined(TTG_HAVE_CUDA) && defined(TTG_USE_CUDA_PREFETCH) + cudaMemPrefetchAsync(span.data(), span.size_bytes(), 0, *ts_stream); +#endif // TTG_USE_CUDA_PREFETCH + } + + if constexpr(sizeof...(Is) > 0) { + create_view_on_device(view, dev_spans, std::index_sequence{}); + } + } + + template + auto make_view_from_tuple(HostT& obj, std::tuple...>& spans, std::index_sequence) { + return ttg::make_view(obj, std::get(spans)...); + } + + template + inline void create_on_device(std::tuple& views, std::tuple& dev_views, std::index_sequence) { + + using view_tuple_t = typename std::tuple; + auto& view = std::get(views); + typename std::tuple_element_t::span_tuple_type dev_spans; + create_view_on_device(view, dev_spans, std::make_index_sequence::size()>()); + + /* set the view for the device */ + std::get(dev_views) = make_view_from_tuple(view.get_host_object(), dev_spans, std::make_index_sequence>{}); + if constexpr(sizeof...(Is) > 0) { + create_on_device(views, dev_views, std::index_sequence{}); + } + } + + template + inline void sync_view_to_host(ttg::View& dev_view, 
std::index_sequence) { + /* prefetch back to host */ + auto span = dev_view.template get_span(); + + /* prefetch data from device */ + if (span.is_sync_out()) { +#if defined(TTG_HAVE_CUDA) && defined(TTG_USE_CUDA_PREFETCH) + cudaMemPrefetchAsync(span.data(), span.size_bytes(), cudaCpuDeviceId, *ts_stream); +#endif // TTG_USE_CUDA_PREFETCH + } + + if constexpr(sizeof...(Is) > 0) { + sync_view_to_host(dev_view, std::index_sequence{}); + } + } + + template + inline void sync_back_to_host(std::tuple& dev_views, std::index_sequence) { + + sync_view_to_host(std::get(dev_views), std::make_index_sequence>::size()>()); + + if constexpr(sizeof...(Is) > 0) { + sync_back_to_host(dev_views, std::index_sequence{}); + } + } + + template + auto make_device_tt_helper(DevViewFuncT &&view_func, + DevKernelFuncT &&kernel_func, + DevOutFuncT &&out_func, + ttg::ExecutionSpace space, + const std::tuple...> &inedges, + const std::tuple &outedges, + const std::string &name, + const std::vector &innames, + const std::vector &outnames, + const ttg::typelist& full_input_args) { + + using output_terminals_type = typename ttg::edges_to_output_terminals>::type; + + auto taskfn = [=](const keyT& key, Args... args) mutable { + +#ifdef TTG_HAVE_CUDA + if (nullptr == ts_stream) { + ts_stream = new cudaStream_t(); + cudaStreamCreate(ts_stream); + } +#endif // TTG_HAVE_CUDA + + auto views = view_func(key, std::forward(args)...); + using view_tuple_t = std::remove_reference_t; + constexpr std::size_t view_tuple_size = std::tuple_size_v; + /* 1) allocate memory on device */ + auto device_views = views; + /* 2) move data from views to device */ + if constexpr(std::tuple_size_v > 0) { + create_on_device(views, device_views, std::make_index_sequence()); + } + /* 3) call kernel function */ + detail::invoke_with_unpacked_views(kernel_func, key, device_views, std::make_index_sequence()); + /* 4) move data back out into host objects */ + if constexpr(std::tuple_size_v > 0) { + sync_back_to_host(device_views, std::make_index_sequence()); + } + #ifdef TTG_HAVE_CUDA + /* wait for the kernel and transfers to complete */ + cudaStreamSynchronize(*ts_stream); + #endif // TTG_HAVE_CUDA + /* 5) call output function */ + detail::invoke_out_with_unpacked_views(out_func, key, views, std::make_index_sequence()); + }; + + using wrapT = typename CallableWrapTTArgsAsTypelist>::type; + + return std::make_unique(std::move(taskfn), inedges, outedges, name, innames, outnames); + + } + + +} // namespace detail + + +template +auto make_device_tt(DevViewFuncT &&view_func, + DevKernelFuncT &&kernel_func, + DevOutFuncT &&out_func, + ttg::ExecutionSpace space, + const std::tuple...> &inedges, + const std::tuple &outedges, const std::string &name = "wrapper", + const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), + const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { + + using output_terminals_type = typename ttg::edges_to_output_terminals>::type; + + constexpr auto void_key = ttg::meta::is_void_v; + + // gross list of candidate argument types + using gross_candidate_func_args_t = ttg::meta::typelist< + ttg::meta::candidate_argument_bindings_t>, + ttg::meta::candidate_argument_bindings_t::value_type>..., + ttg::meta::typelist>; + + // net list of candidate argument types excludes the empty typelists for void arguments + using candidate_func_args_t = ttg::meta::filter_t; + + // gross argument typelist for invoking func, can include void for optional args + constexpr static auto func_is_generic = ttg::meta::is_generic_callable_v; + using
gross_func_args_t = decltype(ttg::meta::compute_arg_binding_types_r(view_func, candidate_func_args_t{})); + constexpr auto DETECTED_HOW_TO_INVOKE_GENERIC_FUNC = + func_is_generic ? !std::is_same_v> : true; + static_assert(DETECTED_HOW_TO_INVOKE_GENERIC_FUNC, + "ttg::make_tt(func, inedges, ...): could not detect how to invoke generic callable func, either the " + "signature of func " + "is faulty, or inedges does not match the expected list of types, or both"); + + // net argument typelist + using func_args_t = ttg::meta::drop_void_t; + constexpr auto num_args = std::tuple_size_v; + + // if given task id, make sure it's passed via const lvalue ref + constexpr bool TASK_ID_PASSED_AS_CONST_LVALUE_REF = + !void_key ? ttg::meta::probe_first_v : true; + static_assert(TASK_ID_PASSED_AS_CONST_LVALUE_REF, + "ttg::make_tt(func, ...): if given to func, the task id must be passed by const lvalue ref"); + + // if given out-terminal tuple, make sure it's passed via nonconst lvalue ref + constexpr bool have_outterm_tuple = + func_is_generic ? !ttg::meta::is_last_void_v + : ttg::meta::probe_last_v; + constexpr bool OUTTERM_TUPLE_PASSED_AS_NONCONST_LVALUE_REF = + have_outterm_tuple ? ttg::meta::probe_last_v : true; + static_assert( + OUTTERM_TUPLE_PASSED_AS_NONCONST_LVALUE_REF, + "ttg::make_tt(func, ...): if given to func, the output terminal tuple must be passed by nonconst lvalue ref"); + + // TT needs actual types of arguments to func ... extract them and pass to CallableWrapTTArgs + using input_edge_value_types = ttg::meta::typelist...>; + // input_args_t = {input_valuesT&&...} + using input_args_t = typename ttg::meta::take_first_n< + typename ttg::meta::drop_first_n::type, + std::tuple_size_v - (void_key ? 0 : 1) - (have_outterm_tuple ? 1 : 0)>::type; + constexpr auto NO_ARGUMENTS_PASSED_AS_NONCONST_LVALUE_REF = + !ttg::meta::is_any_nonconst_lvalue_reference_v; + static_assert( + NO_ARGUMENTS_PASSED_AS_NONCONST_LVALUE_REF, + "ttg::make_tt(func, inedges, outedges): one or more arguments to func can only be passed by nonconst lvalue " + "ref; this is illegal, should only pass arguments as const lvalue ref or (nonconst) rvalue ref"); + using decayed_input_args_t = ttg::meta::decayed_typelist_t; + // 3.
full_input_args_t = edge-types with non-void types replaced by input_args_t + using full_input_args_t = ttg::meta::replace_nonvoid_t; + + return detail::make_device_tt_helper(std::forward(view_func), + std::forward(kernel_func), + std::forward(out_func), + space, inedges, outedges, name, innames, outnames, + full_input_args_t{}); +} + +#if 0 +template +auto make_device_tt(HostFuncT &&host_func, + DevViewFuncT &&view_func, + DevKernelFuncT &&kernel_func, + DevOutFuncT &&out_func, + ttg::ExecutionSpace space, + const std::tuple...> &inedges, + const std::tuple &outedges, const std::string &name = "wrapper", + const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), + const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { + + /* drop the host function */ + return make_device_tt(view_func, kernel_func, out_func, space, inedges, outedges, name, innames, outnames); +} +#endif // 0 +#endif // TTG_MAKE_DEVICE_TT_H + +// clang-format on diff --git a/ttg/ttg/make_tt.h b/ttg/ttg/make_tt.h index 79da690df4..6ae210cef6 100644 --- a/ttg/ttg/make_tt.h +++ b/ttg/ttg/make_tt.h @@ -8,7 +8,8 @@ // case 1 (keyT != void): void op(auto&& key, std::tuple&&, std::tuple&) // case 2 (keyT == void): void op(std::tuple&&, std::tuple&) // -template class CallableWrapTT : public TT struct CallableWrapTTUnwrapTypelist; -template -struct CallableWrapTTUnwrapTypelist> { - using type = CallableWrapTT...>; }; @@ -122,15 +127,17 @@ struct CallableWrapTTUnwrapTypelist&) -// case 2 (keyT == void): void op(input_valuesT&&..., std::tuple&) +// case 1 (keyT != void): returnT op(auto&& key, input_valuesT&&..., std::tuple&) +// case 2 (keyT == void): returnT op(input_valuesT&&..., std::tuple&) // -template +// returnT is void for funcT = synchronous (ordinary) function and the appropriate return type for funcT=coroutine +template class CallableWrapTTArgs - : public TT, - ttg::typelist> { + : public TT< + keyT, output_terminalsT, + CallableWrapTTArgs, + ttg::typelist> { using baseT = typename CallableWrapTTArgs::ttT; using input_values_tuple_type = typename baseT::input_values_tuple_type; @@ -141,18 +148,93 @@ class CallableWrapTTArgs using noref_funcT = std::remove_reference_t; std::conditional_t, std::add_pointer_t, noref_funcT> func; + using op_return_type = +#ifdef TTG_HAS_COROUTINE + std::conditional_t, + ttg::coroutine_handle<>, + std::conditional_t, + ttg::device_task::base_type, + void>>; +#else // TTG_HAS_COROUTINE + void; +#endif // TTG_HAS_COROUTINE + +public: + static constexpr bool have_cuda_op = (space == ttg::ExecutionSpace::CUDA); + +protected: + + /// @return coroutine handle<> (if funcT is a coroutine), else void template - void call_func(Key &&key, Tuple &&args_tuple, output_terminalsT &out, std::index_sequence) { + auto call_func(Key &&key, Tuple &&args_tuple, output_terminalsT &out, std::index_sequence) { using func_args_t = ttg::meta::tuple_concat_t, input_refs_tuple_type, output_edges_type>; - if constexpr (funcT_receives_outterm_tuple) - func(std::forward(key), - baseT::template get>(std::forward(args_tuple))..., out); - else { + + auto process_return = [&out](auto &&ret) { + static_assert(std::is_same_v, returnT>, + "CallableWrapTTArgs: returnT does not match the actual return type of funcT"); + if constexpr (!std::is_void_v) { // protect from compiling for void returnT +#ifdef TTG_HAS_COROUTINE + if constexpr (std::is_same_v) { + ttg::coroutine_handle<> coro_handle; + // if task completed destroy it + if (ret.completed()) { + ret.destroy(); + } else { // if 
task is suspended return the coroutine promise ptr + coro_handle = ret; + } + return coro_handle; + } else if constexpr (std::is_same_v) { + ttg::device_task::base_type coro_handle = ret; + return coro_handle; + } + if constexpr (!(std::is_same_v || std::is_same_v)) +#endif + { + static_assert(std::tuple_size_v> == 1, + "CallableWrapTTArgs <= 2, + "CallableWrapTTArgs == 0) + std::get<0>(out).sendv(std::move(ret)); + else if constexpr (std::tuple_size_v == 1) + std::get<0>(out).sendk(std::move(std::get<0>(ret))); + else if constexpr (std::tuple_size_v == 2) + std::get<0>(out).send(std::move(std::get<0>(ret)), std::move(std::get<1>(ret))); + return; + } + } + }; + + if constexpr (funcT_receives_outterm_tuple) { + if constexpr (std::is_void_v) { + func(std::forward(key), + baseT::template get>(std::forward(args_tuple))..., out); + return; + } else { + auto ret = func( + std::forward(key), + baseT::template get>(std::forward(args_tuple))..., out); + + return process_return(std::move(ret)); + } + } else { auto old_output_tls_ptr = this->outputs_tls_ptr_accessor(); this->set_outputs_tls_ptr(); - func(std::forward(key), - baseT::template get>(std::forward(args_tuple))...); - this->set_outputs_tls_ptr(old_output_tls_ptr); + if constexpr (std::is_void_v) { + func(std::forward(key), + baseT::template get>(std::forward(args_tuple))...); + this->set_outputs_tls_ptr(old_output_tls_ptr); + return; + } else { + auto ret = + func(std::forward(key), + baseT::template get>(std::forward(args_tuple))...); + this->set_outputs_tls_ptr(old_output_tls_ptr); + return process_return(std::move(ret)); + } } } @@ -214,54 +296,55 @@ class CallableWrapTTArgs template std::enable_if_t && !ttg::meta::is_empty_tuple_v && !ttg::meta::is_void_v, - void> + op_return_type> op(Key &&key, ArgsTuple &&args_tuple, output_terminalsT &out) { assert(&out == &baseT::get_output_terminals()); - call_func(std::forward(key), std::forward(args_tuple), out, - std::make_index_sequence>{}); + return call_func(std::forward(key), std::forward(args_tuple), out, + std::make_index_sequence>{}); }; template std::enable_if_t && !ttg::meta::is_empty_tuple_v && ttg::meta::is_void_v, - void> + op_return_type> op(ArgsTuple &&args_tuple, output_terminalsT &out) { assert(&out == &baseT::get_output_terminals()); - call_func(std::forward(args_tuple), out, std::make_index_sequence>{}); + return call_func(std::forward(args_tuple), out, + std::make_index_sequence>{}); }; template - std::enable_if_t && !ttg::meta::is_void_v, void> op( + std::enable_if_t && !ttg::meta::is_void_v, op_return_type> op( Key &&key, output_terminalsT &out) { assert(&out == &baseT::get_output_terminals()); - call_func(std::forward(key), out); + return call_func(std::forward(key), out); }; template - std::enable_if_t && ttg::meta::is_void_v, void> op( + std::enable_if_t && ttg::meta::is_void_v, op_return_type> op( output_terminalsT &out) { assert(&out == &baseT::get_output_terminals()); - call_func(out); + return call_func(out); }; }; -template +template struct CallableWrapTTArgsAsTypelist; -template -struct CallableWrapTTArgsAsTypelist +struct CallableWrapTTArgsAsTypelist> { - using type = CallableWrapTTArgs...>; }; -template -struct CallableWrapTTArgsAsTypelist +struct CallableWrapTTArgsAsTypelist> { - using type = CallableWrapTTArgs...>; }; @@ -418,7 +501,9 @@ auto make_tt_tpl(funcT &&func, const std::tuple +template auto make_tt(funcT &&func, const std::tuple...> &inedges = std::tuple<>{}, const std::tuple &outedges = std::tuple<>{}, const std::string &name = "wrapper", const 
std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), @@ -444,7 +529,10 @@ auto make_tt(funcT &&func, const std::tuple. // gross argument typelist for invoking func, can include void for optional args constexpr static auto func_is_generic = ttg::meta::is_generic_callable_v; - using gross_func_args_t = decltype(ttg::meta::compute_arg_binding_types_r(func, candidate_func_args_t{})); + using return_type_typelist_and_gross_func_args_t = + decltype(ttg::meta::compute_arg_binding_types(func, candidate_func_args_t{})); + using func_return_t = std::tuple_element_t<0, std::tuple_element_t<0, return_type_typelist_and_gross_func_args_t>>; + using gross_func_args_t = std::tuple_element_t<1, return_type_typelist_and_gross_func_args_t>; constexpr auto DETECTED_HOW_TO_INVOKE_GENERIC_FUNC = func_is_generic ? !std::is_same_v> : true; static_assert(DETECTED_HOW_TO_INVOKE_GENERIC_FUNC, @@ -487,12 +575,21 @@ auto make_tt(funcT &&func, const std::tuple. using decayed_input_args_t = ttg::meta::decayed_typelist_t; // 3. full_input_args_t = edge-types with non-void types replaced by input_args_t using full_input_args_t = ttg::meta::replace_nonvoid_t; - using wrapT = typename CallableWrapTTArgsAsTypelist::type; + using wrapT = typename CallableWrapTTArgsAsTypelist::type; return std::make_unique(std::forward(func), inedges, outedges, name, innames, outnames); } +template +auto make_tt(funcT &&func, const std::tuple...> &inedges = std::tuple<>{}, + const std::tuple &outedges = std::tuple<>{}, const std::string &name = "wrapper", + const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), + const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { + return make_tt(std::forward(func), inedges, outedges, name, innames, outnames); +} + template [[deprecated("use make_tt_tpl instead")]] inline auto wrapt( funcT &&func, const std::tuple...> &inedges, @@ -511,4 +608,6 @@ template (std::forward(func), inedges, outedges, name, innames, outnames); } +//#include "ttg/make_device_tt.h" + #endif // TTG_MAKE_TT_H diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h new file mode 100644 index 0000000000..8a293738ec --- /dev/null +++ b/ttg/ttg/parsec/buffer.h @@ -0,0 +1,417 @@ +#ifndef TTG_PARSEC_BUFFER_H +#define TTG_PARSEC_BUFFER_H + +// TODO: replace with short vector +#define TTG_PARSEC_MAX_NUM_DEVICES 4 + +#include +#include +#include +#include +#include +#include "ttg/parsec/ttg_data_copy.h" +#include "ttg/util/iovec.h" + +namespace ttg_parsec { + + +namespace detail { + // fwd decl + template + parsec_data_t* get_parsec_data(const ttg_parsec::buffer& db); +} // namespace detail + +/** + * A buffer that is mirrored between host memory + * and different devices. The runtime is free to + * move data between device and host memory based + * on where the tasks are executing. + * + * Note that a buffer is movable and should not + * be shared between two objects (e.g., through a pointer) + * in order for TTG to properly facilitate ownership + * tracking of the containing object. 
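+ * + * A minimal usage sketch (illustrative only; it uses just the members declared below): + * + *   ttg_parsec::buffer<double> buf(1024);      // owning host allocation of 1024 doubles + *   double* host = buf.host_ptr();             // the host copy (device 0, see cpu_device) + *   double* dev  = buf.current_device_ptr();   // copy on the last-updated device + *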
+ */ +template +struct buffer { + + using element_type = std::decay_t; + + static_assert(std::is_trivially_copyable_v, + "Only trivially copyable types are supported for devices."); + static_assert(std::is_default_constructible_v, + "Only default constructible types are supported for devices."); + +private: + using delete_fn_t = std::add_pointer_t; + + using parsec_data_ptr = std::unique_ptr; + using host_data_ptr = std::unique_ptr; + parsec_data_ptr m_data; + host_data_ptr m_host_data; + std::size_t m_count = 0; + detail::ttg_data_copy_t *m_ttg_copy = nullptr; + + static void delete_owned(element_type *ptr) { + delete[] ptr; + } + + static void delete_non_owned(element_type *ptr) { + // nothing to be done, we don't own the memory + } + + static void delete_parsec_data(parsec_data_t *data) { + std::cout << "delete parsec_data " << data << std::endl; + parsec_data_destroy(data); + } + + static void delete_null_parsec_data(parsec_data_t *) { + // nothing to be done, only used for nullptr + } + + void create_host_copy() { + /* create a new copy for the host object */ + parsec_data_copy_t* copy; + copy = parsec_data_copy_new(m_data.get(), 0, parsec_datatype_int8_t, PARSEC_DATA_FLAG_PARSEC_MANAGED); + copy->device_private = m_host_data.get(); + copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + copy->version = 1; // this version is valid + m_data->nb_elts = sizeof(element_type)*m_count; + m_data->owner_device = 0; + /* register the new data with the host copy */ + if (nullptr != m_ttg_copy) { + m_ttg_copy->add_device_data(m_data.get()); + } + } + + void reset() { + if (m_data) { + if (nullptr != m_ttg_copy) { + m_ttg_copy->remove_device_data(m_data.get()); + } + m_data.reset(); + m_count = 0; + } + } + + friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::buffer&); + +public: + + /* The device ID of the CPU. */ + static constexpr int cpu_device = 0; + + buffer() : buffer(1) + { } + + buffer(std::size_t count) + : m_data(parsec_data_new(), &delete_parsec_data) + , m_host_data(new element_type[count](), &delete_owned) + , m_count(count) + , m_ttg_copy(detail::ttg_data_copy_container()) + { + create_host_copy(); + } + + /* Constructing a buffer using application-managed memory. + * The memory pointed to by ptr must be accessible during + * the life-time of the buffer. */ + buffer(element_type* ptr, std::size_t count = 1) + : m_data(parsec_data_new(), &parsec_data_destroy) + , m_host_data(ptr, &delete_non_owned) + , m_count(count) + , m_ttg_copy(detail::ttg_data_copy_container()) + { + create_host_copy(); + } + + ~buffer() { + unpin(); // make sure the copies are not pinned + /* remove the tracked copy */ + if (nullptr != m_ttg_copy && m_data) { + m_ttg_copy->remove_device_data(m_data.get()); + } + } + + /* allow moving device buffers */ + buffer(buffer&& db) + : m_data(std::move(db.m_data)) + , m_host_data(std::move(db.m_host_data)) + , m_count(db.m_count) + , m_ttg_copy(db.m_ttg_copy) + { + db.m_count = 0; + + if (nullptr == m_ttg_copy && nullptr != detail::ttg_data_copy_container()) { + m_ttg_copy = detail::ttg_data_copy_container(); + /* register with the new ttg_copy */ + m_ttg_copy->add_device_data(m_data.get()); + } + } + + /* explicitly disable copying of buffers + * TODO: should we allow this? What data to use? + */ + buffer(const buffer& db) = delete; +#if 0 + /* copy the host data but leave the devices untouched */ + buffer(const buffer& db) + : m_data(db.m_count ? parsec_data_new() : nullptr, + db.m_count ? 
&parsec_data_destroy : &delete_null_parsec_data) + , m_host_data(db.m_count ? new element_type[db.m_count] : nullptr, + db.m_count ? &delete_owned : delete_non_owned) + , m_count(db.m_count) + , m_ttg_copy(detail::ttg_data_copy_container()) + { + /* copy host data */ + std::copy(db.m_host_data.get(), + db.m_host_data.get() + m_count, + m_host_data.get()); + /* create the host copy with the allocated memory */ + create_host_copy(); + } +#endif // 0 + + /* allow moving device buffers */ + buffer& operator=(buffer&& db) { + m_data = std::move(db.m_data); + m_host_data = std::move(db.m_host_data); + m_count = db.m_count; + db.m_count = 0; + /* don't update the ttg_copy, we keep the connection */ + } + + /* explicitly disable copying of buffers + * TODO: should we allow this? What data to use? + */ + buffer& operator=(const buffer& db) = delete; + +#if 0 + /* copy the host buffer content but leave the devices untouched */ + buffer& operator=(const buffer& db) { + if (db.m_count == 0) { + m_data = parsec_data_ptr(nullptr, &delete_null_parsec_data); + m_host_data = host_data_ptr(nullptr, &delete_non_owned); + } else { + m_data = parsec_data_ptr(parsec_data_new(), &parsec_data_destroy); + m_host_data = host_data_ptr(new element_type[db.m_count], &delete_owned); + /* copy host data */ + std::copy(db.m_host_data.get(), + db.m_host_data.get() + db.m_count, + m_host_data.get()); + /* create the host copy with the allocated memory */ + create_host_copy(); + } + m_count = db.m_count; + } +#endif // 0 + + /* set the current device, useful when a device + * buffer was modified outside of a TTG */ + void set_current_device(int device_id) { + assert(is_valid()); + /* make sure it's a valid device */ + assert(parsec_nb_devices > device_id); + /* make sure it's a valid copy */ + assert(m_data->device_copies[device_id] != nullptr); + m_data->owner_device = device_id; + } + + /* get the current device ID, i.e., the last updated + * device buffer. */ + int get_current_device() const { + assert(is_valid()); + return m_data->owner_device; + } + + /* get the current device pointer */ + element_type* current_device_ptr() { + assert(is_valid()); + return static_cast(m_data->device_copies[m_data->owner_device]->device_private); + } + + /* get the current device pointer */ + const element_type* current_device_ptr() const { + assert(is_valid()); + return static_cast(m_data->device_copies[m_data->owner_device]->device_private); + } + + /* get the device pointer at the given device + * \sa cpu_device + */ + element_type* device_ptr_on(int device_id) { + assert(is_valid()); + return static_cast(parsec_data_get_ptr(m_data.get(), device_id)); + } + + /* get the device pointer at the given device + * \sa cpu_device + */ + const element_type* device_ptr_on(int device_id) const { + assert(is_valid()); + return static_cast(parsec_data_get_ptr(m_data.get(), device_id)); + } + + element_type* host_ptr() { + return device_ptr_on(cpu_device); + } + + const element_type* host_ptr() const { + return device_ptr_on(cpu_device); + } + + bool is_valid_on(int device_id) const { + assert(is_valid()); + return (parsec_data_get_ptr(m_data.get(), device_id) != nullptr); + } + + void allocate_on(int device_id) { + /* TODO: need exposed PaRSEC memory allocator */ + } + + /* TODO: can we do this automatically? + * Pin the memory on all devices we currently track. + * Pinned memory won't be released by PaRSEC and can be used + * at any time. 
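+ * (The loops below start at device 1 because device 0 is the host copy -- see + * cpu_device above -- which never needs pinning.)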
+ */ + void pin() { + for (int i = 1; i < parsec_nb_devices; ++i) { + pin_on(i); + } + } + + /* Unpin the memory on all devices we currently track. */ + void unpin() { + if (!is_valid()) return; + for (int i = 1; i < parsec_nb_devices; ++i) { + unpin_on(i); + } + } + + /* Pin the memory on a given device */ + void pin_on(int device_id) { + /* TODO: how can we pin memory on a device? */ + } + + /* Unpin the memory on a given device */ + void unpin_on(int device_id) { + /* TODO: how can we unpin memory on a device? */ + } + + bool is_valid() const { + return !!m_data; + } + + operator bool() const { + return is_valid(); + } + + std::size_t size() const { + return m_count; + } + + /* Reallocate the buffer with count elements */ + void reset(std::size_t count) { + /* TODO: can we resize if count is smaller than m_count? */ + /* drop the current data and reallocate */ + reset(); + if (count == 0) { + m_data = parsec_data_ptr(nullptr, &delete_null_parsec_data); + m_host_data = host_data_ptr(nullptr, &delete_non_owned); + } else { + m_data = parsec_data_ptr(parsec_data_new(), &parsec_data_destroy); + m_host_data = host_data_ptr(new element_type[count], &delete_owned); + /* create the host copy with the allocated memory */ + create_host_copy(); + } + m_count = count; + /* don't touch the ttg_copy, we still belong to the same container */ + } + + /* Reset the buffer to use the ptr to count elements */ + void reset(T* ptr, std::size_t count = 1) { + /* TODO: can we resize if count is smaller than m_count? */ + /* drop the current data and reallocate */ + reset(); + if (nullptr == ptr) { + m_data = parsec_data_ptr(nullptr, &delete_null_parsec_data); + m_host_data = host_data_ptr(nullptr, &delete_non_owned); + m_count = 0; + } else { + m_data = parsec_data_ptr(parsec_data_new(), &parsec_data_destroy); + m_host_data = host_data_ptr(ptr, &delete_non_owned); + /* create the host copy with the allocated memory */ + create_host_copy(); + m_count = count; + } + /* don't touch the ttg_copy, we still belong to the same container */ + } + + /* serialization support */ + +#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL + template <typename Archive> + std::enable_if_t || + std::is_base_of_v> + serialize(Archive& ar) { + if constexpr (ttg::detail::is_output_archive_v) { // braces were missing around this multi-statement branch + std::size_t s = size(); + assert(m_ttg_copy != nullptr); // only tracked objects allowed + m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); + ar(s); + } else { + std::size_t s; + ar(s); + reset(s); + assert(m_ttg_copy != nullptr); // only tracked objects allowed + m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); + } + // the payload itself travels via the registered iovec, as in the MADNESS variant below + } +#endif // TTG_SERIALIZATION_SUPPORTS_CEREAL + +#ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS + template <typename Archive> + std::enable_if_t || + std::is_base_of_v> + serialize(Archive& ar) { + if constexpr (ttg::detail::is_output_archive_v) { + std::size_t s = size(); + ar & s; + assert(m_ttg_copy != nullptr); // only tracked objects allowed + m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); + } else { + std::size_t s; + ar & s; + /* initialize internal pointers and then reset */ + reset(s); + assert(m_ttg_copy != nullptr); // only tracked objects allowed + m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); + } + } +#endif // TTG_SERIALIZATION_SUPPORTS_MADNESS + + +}; + +template <typename T> +struct is_buffer : std::false_type +{ }; + +template <typename T> +struct is_buffer<buffer<T>> : std::true_type +{ }; + +template <typename T> +constexpr static const bool is_buffer_v = is_buffer<T>::value; + +namespace detail { + template +
parsec_data_t* get_parsec_data(const ttg_parsec::buffer& db) { + return const_cast(db.m_data.get()); + } +} // namespace detail + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_BUFFER_H \ No newline at end of file diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h new file mode 100644 index 0000000000..f509138f9b --- /dev/null +++ b/ttg/ttg/parsec/devicefunc.h @@ -0,0 +1,192 @@ +#ifndef TTG_PARSEC_DEVICEFUNC_H +#define TTG_PARSEC_DEVICEFUNC_H + +#if defined(TTG_HAVE_CUDART) +#include +#endif + +#include "ttg/parsec/task.h" +#include + +#if defined(PARSEC_HAVE_CUDA) +#include +#endif // PARSEC_HAVE_CUDA + +namespace ttg_parsec { + namespace detail { + template + inline bool register_device_memory(std::tuple &views, std::index_sequence) { + static_assert(I < MAX_PARAM_COUNT, + "PaRSEC only supports MAX_PARAM_COUNT device input/outputs. " + "Increase MAX_PARAM_COUNT and recompile PaRSEC/TTG."); + using view_type = std::remove_reference_t>>; + auto& view = std::get(views); + bool is_current = false; + static_assert(ttg::is_buffer_v || ttg_parsec::is_devicescratch_v); + /* get_parsec_data is overloaded for buffer and devicescratch */ + parsec_data_t* data = detail::get_parsec_data(view); + /* TODO: check whether the device is current */ + + auto flow_flags = PARSEC_FLOW_ACCESS_RW; + bool pushout = false; + if constexpr (std::is_const_v) { + flow_flags = PARSEC_FLOW_ACCESS_READ; + } else if constexpr (ttg_parsec::is_devicescratch_v) { + switch(view.scope()) { + case ttg::scope::Allocate: + flow_flags = PARSEC_FLOW_ACCESS_NONE; + break; + case ttg::scope::SyncIn: + flow_flags = PARSEC_FLOW_ACCESS_READ; + break; + } + } + assert(nullptr != detail::parsec_ttg_caller->dev_ptr); + parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task; + parsec_flow_t *flows = detail::parsec_ttg_caller->dev_ptr->flows; + + std::cout << "register_device_memory task " << detail::parsec_ttg_caller << " data " << I << " " + << data << " size " << data->nb_elts << std::endl; + + /* build the flow */ + /* TODO: reuse the flows of the task class? How can we control the sync direction then? */ + flows[I] = parsec_flow_t{.name = nullptr, + .sym_type = PARSEC_SYM_INOUT, + .flow_flags = static_cast(flow_flags), + .flow_index = I, + .flow_datatype_mask = ~0 }; + + gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes + gpu_task->flow[I] = &flows[I]; + + if (pushout) { + std::cout << "PUSHOUT " << I << std::endl; + gpu_task->pushout |= 1<parsec_task.data[I].data_in = data->device_copies[data->owner_device]; + detail::parsec_ttg_caller->parsec_task.data[I].data_in = data->device_copies[0]; + detail::parsec_ttg_caller->parsec_task.data[I].source_repo_entry = NULL; + + if constexpr (sizeof...(Is) > 0) { + is_current |= register_device_memory(views, std::index_sequence{}); + } + return is_current; + } + } // namespace detail + + /* Takes a tuple of ttg::Views or ttg::buffers and register them + * with the currently executing task. Returns true if all memory + * is current on the target device, false if transfers are required. 
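+ * + * Illustrative call pattern from inside a device task (a and b are hypothetical + * ttg::buffer arguments of that task): + * + *   auto views = std::tie(a, b); + *   bool is_current = ttg_parsec::register_device_memory(views); + *   // if false, the runtime must stage the data to the device before launching the kernel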
*/ + template + inline bool register_device_memory(std::tuple &views) { + if (nullptr == detail::parsec_ttg_caller) { + throw std::runtime_error("register_device_memory may only be invoked from inside a task!"); + } + + if (nullptr == detail::parsec_ttg_caller->dev_ptr) { + throw std::runtime_error("register_device_memory called inside a non-gpu task!"); + } + + bool is_current = detail::register_device_memory(views, std::index_sequence_for{}); + + /* reset all entries in the current task */ + for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) { + detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr; + detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE; + detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i; + detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i]; + detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0; + } + + return is_current; + } + + namespace detail { + template + inline void mark_device_out(std::tuple &views, std::index_sequence) { + + using view_type = std::remove_reference_t>>; + auto& view = std::get(views); + + /* get_parsec_data is overloaded for buffer and devicescratch */ + parsec_data_t* data = detail::get_parsec_data(view); + /* find the data copy and mark it as pushout */ + int i = 0; + parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task; + parsec_gpu_exec_stream_t *stream = detail::parsec_ttg_caller->dev_ptr->stream; + /* enqueue the transfer into the compute stream to come back once the compute and transfer are complete */ + +#if defined(TTG_HAVE_CUDART) && defined(PARSEC_HAVE_CUDA) + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)stream; + cudaMemcpyAsync(data->device_copies[0]->device_private, + data->device_copies[data->owner_device]->device_private, + data->nb_elts, cudaMemcpyDeviceToHost, cuda_stream->cuda_stream); +#else + static_assert(DeviceAvail, "No device implementation detected!"); +#endif // defined(PARSEC_HAVE_CUDA) + +#if 0 + while (detail::parsec_ttg_caller->parsec_task.data[i].data_in != nullptr) { + if (detail::parsec_ttg_caller->parsec_task.data[i].data_in == data->device_copies[0]) { + gpu_task->pushout |= 1< 0) { + // recursion + mark_device_out(views, std::index_sequence{}); + } + } + } // namespace detail + + template + inline void mark_device_out(std::tuple &b) { + + if (nullptr == detail::parsec_ttg_caller) { + throw std::runtime_error("mark_device_out may only be invoked from inside a task!"); + } + + if (nullptr == detail::parsec_ttg_caller->dev_ptr) { + throw std::runtime_error("mark_device_out called inside a non-gpu task!"); + } + + detail::mark_device_out(b, std::index_sequence_for{}); + } + + namespace detail { + + template + inline void post_device_out(std::tuple &views, std::index_sequence) { + + using view_type = std::remove_reference_t>>; + + if constexpr (!std::is_const_v) { + auto& view = std::get(views); + + /* get_parsec_data is overloaded for buffer and devicescratch */ + parsec_data_t* data = detail::get_parsec_data(view); + + data->device_copies[0]->version++; + data->owner_device = 0; + } + + if constexpr (sizeof...(Is) > 0) { + // recursion + post_device_out(views, std::index_sequence{}); + } + } + } // namespace detail + template + inline void post_device_out(std::tuple &b) { + detail::post_device_out(b, std::index_sequence_for{}); + } + + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_DEVICEFUNC_H \ No newline at end of file diff --git 
a/ttg/ttg/parsec/devicescratch.h b/ttg/ttg/parsec/devicescratch.h new file mode 100644 index 0000000000..1c0487cdfb --- /dev/null +++ b/ttg/ttg/parsec/devicescratch.h @@ -0,0 +1,139 @@ +#ifndef TTG_PARSEC_DEVICESCRATCH_H +#define TTG_PARSEC_DEVICESCRATCH_H + +// TODO: replace with short vector +#define TTG_PARSEC_MAX_NUM_DEVICES 4 + +#include +#include +#include +#include +#include + +namespace ttg_parsec { + +namespace detail { + // fwd decl + template + parsec_data_t* get_parsec_data(const ttg_parsec::devicescratch&); +} // namespace detail + +/** + * Scratch-space for task-local variables. + * TTG will allocate memory on the device + * and transfer data in and out based on the scope. + */ +template +struct devicescratch { + + using element_type = std::decay_t; + + static_assert(std::is_trivially_copyable_v, + "Only trivially copyable types are supported for devices."); + static_assert(std::is_default_constructible_v, + "Only default constructible types are supported for devices."); + +private: + + parsec_data_t* m_data = nullptr; + parsec_data_copy_t m_data_copy; + ttg::scope m_scope; + + void create_host_copy(element_type *ptr, std::size_t count) { + /* TODO: is the construction call necessary? */ + /* TODO: handle the scope */ + PARSEC_OBJ_CONSTRUCT(&m_data_copy, parsec_data_copy_t); + m_data_copy.device_index = 0; + //m_data_copy.original = &m_data; + //m_data_copy.older = NULL; + m_data_copy.flags = PARSEC_DATA_FLAG_PARSEC_MANAGED; + m_data_copy.dtt = parsec_datatype_int8_t; + m_data_copy.version = 1; + m_data_copy.device_private = ptr; + m_data_copy.coherency_state = PARSEC_DATA_COHERENCY_SHARED; + + m_data->nb_elts = count * sizeof(element_type); + m_data->owner_device = 0; + parsec_data_copy_attach(m_data, &m_data_copy, 0); + } + + friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::devicescratch&); + +public: + + /* Constructing a devicescratch using application-managed memory. + * The memory pointed to by ptr must be accessible during + * the life-time of the devicescratch. 
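+ * + * Sketch (illustrative; see also ttg::make_scratch in ttg/devicescratch.h): + * + *   double norm = 0.0;  // task-local host variable + *   ttg_parsec::devicescratch<double> scratch(&norm, ttg::scope::SyncIn); + *   // scope::Allocate would request device-only scratch without the copy-in + *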
*/ + devicescratch(element_type* ptr, ttg::scope scope = ttg::scope::SyncIn, std::size_t count = 1) + : m_data(parsec_data_new()) + , m_scope(scope) { + create_host_copy(ptr, count); + } + + /* don't allow moving */ + devicescratch(devicescratch&&) = delete; + + /* don't allow copying */ + devicescratch(const devicescratch& db) = delete; + + /* don't allow moving */ + devicescratch& operator=(devicescratch&&) = delete; + + /* don't allow copying */ + devicescratch& operator=(const devicescratch& db) = delete; + + ~devicescratch() { + PARSEC_OBJ_DESTRUCT(&m_data_copy); + parsec_data_destroy(m_data); + m_data = nullptr; + } + + /* get the current device pointer */ + element_type* device_ptr() { + assert(is_valid()); + return static_cast(m_data->device_copies[m_data->owner_device]->device_private); + } + + /* get the current device pointer */ + const element_type* device_ptr() const { + assert(is_valid()); + return static_cast(m_data->device_copies[m_data->owner_device]->device_private); + } + + bool is_valid() const { + // TODO: how to get the current device + // return (m_data->owner_device == parsec_current_device); + return true; + } + + ttg::scope scope() const { + return m_scope; + } + + std::size_t size() const { + return (m_data->nb_elts / sizeof(element_type)); + } + +}; + +template +struct is_devicescratch : std::false_type +{ }; + +template +struct is_devicescratch> : std::true_type +{ }; + +template +constexpr static const bool is_devicescratch_v = is_devicescratch::value; + +namespace detail { + template + parsec_data_t* get_parsec_data(const ttg_parsec::devicescratch& scratch) { + return const_cast(scratch.m_data); + } +} // namespace detail + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_DEVICESCRATCH_H \ No newline at end of file diff --git a/ttg/ttg/parsec/fwd.h b/ttg/ttg/parsec/fwd.h index 959dbdaac0..53c637136a 100644 --- a/ttg/ttg/parsec/fwd.h +++ b/ttg/ttg/parsec/fwd.h @@ -13,6 +13,26 @@ namespace ttg_parsec { template > class TT; + template + struct ptr; + + template + struct buffer; + template + struct devicescratch; + + template + inline bool register_device_memory(std::tuple &views); + + template + inline void mark_device_out(std::tuple &b); + + template + inline void post_device_out(std::tuple &b); + + /* the query of the parsec backend only returns whether the data should be marked for pushout */ + using query_result_type = bool; + /// \internal the OG name template using Op [[deprecated("use TT instead")]] = TT>; @@ -26,6 +46,7 @@ namespace ttg_parsec { inline void ttg_finalize(); + [[noreturn]] static inline void ttg_abort(); inline ttg::World ttg_default_execution_context(); @@ -51,6 +72,21 @@ namespace ttg_parsec { template static void ttg_broadcast(ttg::World world, T &data, int source_rank); + namespace device { + class DeviceAllocator; + std::size_t nb_devices(); + } + +#if 0 + template + inline std::pair>...>> get_ptr(Args&&... args); +#endif + template + inline ptr> get_ptr(T&& obj); + + template + inline ptr make_ptr(Args&&... 
args); + } // namespace ttg_parsec + +#endif // TTG_PARSEC_FWD_H
diff --git a/ttg/ttg/parsec/ptr.h b/ttg/ttg/parsec/ptr.h new file mode 100644 index 0000000000..6499e050b7 --- /dev/null +++ b/ttg/ttg/parsec/ptr.h @@ -0,0 +1,282 @@ +#ifndef TTG_PARSEC_PTR_H +#define TTG_PARSEC_PTR_H + +#include <unordered_map> +#include <mutex> + +#include "ttg/parsec/ttg_data_copy.h" +#include "ttg/parsec/thread_local.h" +#include "ttg/parsec/task.h" + +namespace ttg_parsec { + + // fwd decl + template <typename T> + struct ptr; + + namespace detail { + /* fwd decl */ + template <typename Value> + inline ttg_data_copy_t *create_new_datacopy(Value &&value); + + struct ptr { + using copy_type = detail::ttg_data_copy_t; + + private: + static inline std::unordered_map<ptr*, bool> m_ptr_map; + static inline std::mutex m_ptr_map_mtx; + + copy_type *m_copy = nullptr; + + void drop_copy() { + if (nullptr != m_copy) { // guard the trace and the deref: drop_copy() may run after a move left m_copy empty + std::cout << "ptr drop_copy " << m_copy << " ref " << m_copy->num_ref() << std::endl; + if (1 == m_copy->drop_ref()) { + delete m_copy; + } + m_copy = nullptr; + } + } + + void register_self() { + /* insert ourselves into the list of ptr */ + std::lock_guard _{m_ptr_map_mtx}; // named guard: an unnamed temporary would unlock immediately + m_ptr_map.insert(std::pair{this, true}); + } + + void deregister_self() { + /* remove ourselves from the list of ptr */ + std::lock_guard _{m_ptr_map_mtx}; + if (m_ptr_map.contains(this)) { + m_ptr_map.erase(this); + } + } + + public: + ptr(copy_type *copy) + : m_copy(copy) + { + register_self(); + m_copy->add_ref(); + std::cout << "ptr copy_obj ref " << m_copy->num_ref() << std::endl; + } + + copy_type* get_copy() const { + return m_copy; + } + + ptr(const ptr& p) + : m_copy(p.m_copy) + { + register_self(); + m_copy->add_ref(); + std::cout << "ptr cpy " << m_copy << " ref " << m_copy->num_ref() << std::endl; + } + + ptr(ptr&& p) + : m_copy(p.m_copy) + { + register_self(); + p.m_copy = nullptr; + std::cout << "ptr mov " << m_copy << " ref " << m_copy->num_ref() << std::endl; + } + + ~ptr() { + deregister_self(); + drop_copy(); + } + + ptr& operator=(const ptr& p) + { + drop_copy(); + m_copy = p.m_copy; + m_copy->add_ref(); + std::cout << "ptr cpy " << m_copy << " ref " << m_copy->num_ref() << std::endl; + return *this; + } + + ptr& operator=(ptr&& p) { + drop_copy(); + m_copy = p.m_copy; + p.m_copy = nullptr; + std::cout << "ptr mov " << m_copy << " ref " << m_copy->num_ref() << std::endl; + return *this; + } + + bool is_valid() const { + return (nullptr != m_copy); + } + + void reset() { + drop_copy(); + } + + /* drop all currently registered ptr + * \note this function is not thread-safe + * and should only be called at the + * end of the execution, e.g., during finalize. + */ + static void drop_all_ptr() { + for(auto it : m_ptr_map) { + it.first->drop_copy(); + } + } + }; + + + template <typename T> + ttg_parsec::detail::ttg_data_copy_t* get_copy(ttg_parsec::ptr<T>& p); + } // namespace detail + + template <typename T, typename... Args> + ptr<T> make_ptr(Args&&... args); // declared unqualified: we are already inside namespace ttg_parsec + + template <typename T> + ptr<std::decay_t<T>> get_ptr(T&& obj); + + template <typename T> + struct ptr { + + using value_type = std::decay_t<T>; + + private: + using copy_type = detail::ttg_data_value_copy_t<value_type>; + + std::unique_ptr<detail::ptr> m_ptr; + + /* only PaRSEC backend functions are allowed to touch our private parts */ + template <typename S, typename... Args> + friend ptr<S> make_ptr(Args&&...
args); + template <typename S> + friend ptr<std::decay_t<S>> get_ptr(S&& obj); + template <typename S> + friend detail::ttg_data_copy_t* detail::get_copy(ptr<S>& p); + friend ttg::detail::value_copy_handler; + + /* only accessible by get_ptr and make_ptr */ + ptr(detail::ptr::copy_type *copy) + : m_ptr(new detail::ptr(copy)) + { } + + copy_type* get_copy() const { + return static_cast<copy_type*>(m_ptr->get_copy()); + } + + public: + + ptr() = default; + + ptr(const ptr& p) + : ptr(p.get_copy()) + { } + + ptr(ptr&& p) = default; + + ~ptr() = default; + + ptr& operator=(const ptr& p) { + m_ptr.reset(new detail::ptr(p.get_copy())); + return *this; + } + + ptr& operator=(ptr&& p) = default; + + value_type& operator*() const { + return **static_cast<copy_type*>(m_ptr->get_copy()); + } + + value_type* operator->() const { // operator-> must yield a pointer, not a reference + return &(**static_cast<copy_type*>(m_ptr->get_copy())); + } + + bool is_valid() const { + return m_ptr && m_ptr->is_valid(); + } + + void reset() { + m_ptr.reset(); + } + }; + +#if 0 + namespace detail { + template + inline auto get_ptr(Arg&& obj) { + + for (int i = 0; i < detail::parsec_ttg_caller->data_count; ++i) { + detail::ttg_data_copy_t *copy = detail::parsec_ttg_caller->copies[i]; + if (nullptr != copy) { + if (copy->get_ptr() == &obj) { + bool is_ready = true; + /* TODO: how can we force-sync host and device? Current data could be on either. */ +#if 0 + /* check all tracked device data for validity */ + for (auto it : copy) { + parsec_data_t *data = *it; + for (int i = 0; i < parsec_nb_devices; ++i) { + if (nullptr != data->device_copies[i]) { + + } else { + is_ready = false; + } + } + } +#endif // 0 + return std::make_pair(is_ready, std::tuple{ttg_parsec::ptr>(copy)}); + } + } + } + + throw std::runtime_error("ttg::get_ptr called on an unknown object!"); + } + } + + template + inline std::pair>...>> get_ptr(Args&&... args) { + if (nullptr == detail::parsec_ttg_caller) { + throw std::runtime_error("ttg::get_ptr called outside of a task!"); + } + + bool ready = true; + auto fn = [&](auto&& arg){ + auto pair = get_ptr(std::forward(arg)); + ready &= pair.first; + return std::move(pair.second); + }; + std::tuple>...> tpl = {(fn(std::forward(args)))...}; + return {ready, std::move(tpl)}; + } +#endif // 0 + + template <typename T> + inline ptr<std::decay_t<T>> get_ptr(T&& obj) { + using ptr_type = ptr<std::decay_t<T>>; + if (nullptr != detail::parsec_ttg_caller) { + for (int i = 0; i < detail::parsec_ttg_caller->data_count; ++i) { + detail::ttg_data_copy_t *copy = detail::parsec_ttg_caller->copies[i]; + if (nullptr != copy) { + if (copy->get_ptr() == &obj) { + return ptr_type(copy); + } + } + } + } + /* object not tracked, make a new ptr that is now tracked */ + detail::ttg_data_copy_t *copy = detail::create_new_datacopy(obj); + return ptr_type(copy); + } + + template <typename T, typename... Args> + inline ptr<T> make_ptr(Args&&...
args) { + detail::ttg_data_copy_t *copy = detail::create_new_datacopy(T(std::forward<Args>(args)...)); + return ptr<T>(copy); + } + + namespace detail { + template <typename T> + detail::ttg_data_copy_t* get_copy(ttg_parsec::ptr<T>& p) { + return p.get_copy(); + } + } // namespace detail + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_PTR_H \ No newline at end of file
diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h new file mode 100644 index 0000000000..3636e903a5 --- /dev/null +++ b/ttg/ttg/parsec/task.h @@ -0,0 +1,210 @@ +#ifndef TTG_PARSEC_TASK_H +#define TTG_PARSEC_TASK_H + +#include "ttg/parsec/ttg_data_copy.h" + +#include +#include + +namespace ttg_parsec { + + namespace detail { + + struct device_ptr_t { + parsec_gpu_task_t* gpu_task = nullptr; + parsec_flow_t* flows = nullptr; + parsec_gpu_exec_stream_t* stream = nullptr; + }; + + template + struct device_state_t + { + static constexpr bool support_device = false; + static constexpr size_t num_flows = 0; + device_state_t() + { } + static constexpr device_ptr_t* dev_ptr() { + return nullptr; + } + }; + + template<> + struct device_state_t { + static constexpr bool support_device = false; + static constexpr size_t num_flows = MAX_PARAM_COUNT; + parsec_flow_t m_flows[num_flows]; + device_ptr_t m_dev_ptr = {nullptr, &m_flows[0], nullptr}; // gpu_task will be allocated in each task + device_ptr_t* dev_ptr() { + return &m_dev_ptr; + } + }; + + typedef parsec_hook_return_t (*parsec_static_op_t)(void *); // static_op will be cast to this type + + struct parsec_ttg_task_base_t { + parsec_task_t parsec_task; + int32_t in_data_count = 0; ///< number of satisfied inputs + int32_t data_count = 0; ///< number of data elements in the copies array + ttg_data_copy_t **copies; ///< pointer to the fixed copies array of the derived task + parsec_hash_table_item_t tt_ht_item = {}; + parsec_static_op_t function_template_class_ptr[ttg::runtime_traits::num_execution_spaces] = + {nullptr}; + + typedef struct { + std::size_t goal; + std::size_t size; + } size_goal_t; + + typedef void (release_task_fn)(parsec_ttg_task_base_t*); + /* Poor-man's virtual function + * We cannot use virtual inheritance or private visibility because we + * need offsetof for the mempool and scheduling. + */ + release_task_fn* release_task_cb = nullptr; + device_ptr_t* dev_ptr; + bool remove_from_hash = true; + bool is_dummy = false; + bool defer_writer = TTG_PARSEC_DEFER_WRITER; // whether to defer writer instead of creating a new copy + + + /* + virtual void release_task() = 0; + */ + //public: + void release_task() { + release_task_cb(this); + } + + protected: + /** + * Protected constructors: this class should not be instantiated directly + * but always be used through parsec_ttg_task_t.
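+ * (Concretely, the parsec_ttg_task_t specializations below derive from this + * base and supply the key storage, per-stream size goals, and the copies array.)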
+ */ + + parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, + int data_count, ttg_data_copy_t **copies, device_ptr_t *dev_ptr, + bool defer_writer = TTG_PARSEC_DEFER_WRITER) + : data_count(data_count) + , copies(copies) + , dev_ptr(dev_ptr) + , defer_writer(defer_writer) { + PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super); + parsec_task.mempool_owner = mempool; + parsec_task.task_class = task_class; + parsec_task.priority = 0; + } + + parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, + parsec_taskpool_t *taskpool, int32_t priority, + int data_count, ttg_data_copy_t **copies, device_ptr_t *dev_ptr, + release_task_fn *release_fn, + bool defer_writer = TTG_PARSEC_DEFER_WRITER) + : data_count(data_count) + , copies(copies) + , release_task_cb(release_fn) + , dev_ptr(dev_ptr) + , defer_writer(defer_writer) { + PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super); + parsec_task.mempool_owner = mempool; + parsec_task.task_class = task_class; + parsec_task.status = PARSEC_TASK_STATUS_HOOK; + parsec_task.taskpool = taskpool; + parsec_task.priority = priority; + parsec_task.chore_mask = 1<<0; + } + + public: + void set_dummy(bool d) { is_dummy = d; } + bool dummy() { return is_dummy; } + }; + + template > + struct parsec_ttg_task_t : public parsec_ttg_task_base_t { + using key_type = typename TT::key_type; + static constexpr size_t num_streams = TT::numins; + /* device tasks may have to store more copies than it's inputs as their sends are aggregated */ + static constexpr size_t num_copies = TT::derived_has_cuda_op() ? static_cast(MAX_PARAM_COUNT) + : (num_streams+1); + TT* tt; + key_type key; + size_goal_t stream[num_streams] = {}; +#ifdef TTG_HAS_COROUTINE + void* suspended_task_address = nullptr; // if not null the function is suspended +#endif + ttg_data_copy_t *copies[num_copies] = { nullptr }; // the data copies tracked by this task + device_state_t dev_state; + + parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class) + : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies, dev_state.dev_ptr()) { + tt_ht_item.key = pkey(); + + // We store the hash of the key and the address where it can be found in locals considered as a scratchpad + *(uintptr_t*)&(parsec_task.locals[0]) = 0; //there is no key + *(uintptr_t*)&(parsec_task.locals[2]) = 0; //there is no key + } + + parsec_ttg_task_t(const key_type& key, parsec_thread_mempool_t *mempool, + parsec_task_class_t *task_class, parsec_taskpool_t *taskpool, + TT *tt_ptr, int32_t priority) + : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority, + num_streams, copies, dev_state.dev_ptr(), + &release_task, tt_ptr->m_defer_writer) + , tt(tt_ptr), key(key) { + tt_ht_item.key = pkey(); + + // We store the hash of the key and the address where it can be found in locals considered as a scratchpad + uint64_t hv = ttg::hash>{}(key); + *(uintptr_t*)&(parsec_task.locals[0]) = hv; + *(uintptr_t*)&(parsec_task.locals[2]) = reinterpret_cast(&this->key); + } + + static void release_task(parsec_ttg_task_base_t* task_base) { + parsec_ttg_task_t *task = static_cast(task_base); + TT *tt = task->tt; + tt->release_task(task); + } + + parsec_key_t pkey() { return reinterpret_cast(&key); } + }; + + template + struct parsec_ttg_task_t : public parsec_ttg_task_base_t { + static constexpr size_t num_streams = TT::numins; + TT* tt; + size_goal_t stream[num_streams] = {}; +#ifdef TTG_HAS_COROUTINE + void* suspended_task_address = nullptr; // if not null the 
+
+  template <typename TT>
+  struct parsec_ttg_task_t<TT, true> : public parsec_ttg_task_base_t {
+    static constexpr size_t num_streams = TT::numins;
+    TT* tt;
+    size_goal_t stream[num_streams] = {};
+#ifdef TTG_HAS_COROUTINE
+    void* suspended_task_address = nullptr;  // if not null the function is suspended
+#endif
+    ttg_data_copy_t *copies[num_streams+1] = { nullptr };  // the data copies tracked by this task
+                                                           // +1 for the copy needed during send/bcast
+    device_state_t dev_state;
+
+    parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
+        : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies, dev_state.dev_ptr()) {
+      tt_ht_item.key = pkey();
+    }
+
+    parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
+                      parsec_taskpool_t *taskpool, TT *tt_ptr, int32_t priority)
+        : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority,
+                                 num_streams, copies, dev_state.dev_ptr(),
+                                 &release_task, tt_ptr->m_defer_writer)
+        , tt(tt_ptr) {
+      tt_ht_item.key = pkey();
+    }
+
+    static void release_task(parsec_ttg_task_base_t* task_base) {
+      parsec_ttg_task_t *task = static_cast<parsec_ttg_task_t*>(task_base);
+      TT *tt = task->tt;
+      tt->release_task(task);
+    }
+
+    parsec_key_t pkey() { return 0; }
+  };
+
+
+  } // namespace detail
+
+} // namespace ttg_parsec
+
+#endif // TTG_PARSEC_TASK_H
\ No newline at end of file
diff --git a/ttg/ttg/parsec/thread_local.h b/ttg/ttg/parsec/thread_local.h
new file mode 100644
index 0000000000..54b98885ec
--- /dev/null
+++ b/ttg/ttg/parsec/thread_local.h
@@ -0,0 +1,22 @@
+#ifndef TTG_PARSEC_THREAD_LOCAL_H
+#define TTG_PARSEC_THREAD_LOCAL_H
+
+namespace ttg_parsec {
+
+namespace detail {
+
+  // fwd decls
+  struct parsec_ttg_task_base_t;
+  struct ttg_data_copy_t;
+
+  inline thread_local parsec_ttg_task_base_t *parsec_ttg_caller = nullptr;
+
+  inline ttg_data_copy_t*& ttg_data_copy_container() {
+    static thread_local ttg_data_copy_t *ptr = nullptr;
+    return ptr;
+  }
+
+} // namespace detail
+} // namespace ttg_parsec
+
+#endif // TTG_PARSEC_THREAD_LOCAL_H
\ No newline at end of file
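Call sites in ttg.h below save and restore detail::parsec_ttg_caller by hand around every user-op invocation. The discipline they follow is equivalent to this small RAII guard (a hypothetical helper, not part of the patch):

    // Hypothetical helper: scoped save/restore of the thread-local
    // current-task pointer consulted by find_copy_in_task() and friends.
    struct caller_guard {
      ttg_parsec::detail::parsec_ttg_task_base_t *saved;
      explicit caller_guard(ttg_parsec::detail::parsec_ttg_task_base_t *t)
          : saved(ttg_parsec::detail::parsec_ttg_caller) {
        ttg_parsec::detail::parsec_ttg_caller = t;
      }
      ~caller_guard() { ttg_parsec::detail::parsec_ttg_caller = saved; }
    };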
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index b0d8a91865..256341dba7 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -7,6 +7,11 @@
 #define TTG_USE_PARSEC 1
 #endif  // !defined(TTG_IMPL_NAME)
 
+/* Whether to defer a potential writer if there are readers.
+ * This may avoid extra copies in exchange for concurrency.
+ * This may cause deadlocks, so use with caution. */
+#define TTG_PARSEC_DEFER_WRITER false
+
 #include "ttg/impl_selector.h"
 
 /* include ttg header to make symbols available in case this header is included directly */
@@ -31,8 +36,15 @@
 #include "ttg/serialization/data_descriptor.h"
 
+#include "ttg/view.h"
+
 #include "ttg/parsec/fwd.h"
+#include "ttg/parsec/buffer.h"
+#include "ttg/parsec/devicescratch.h"
+#include "ttg/parsec/thread_local.h"
+#include "ttg/parsec/devicefunc.h"
 
 #include
 #include
 #include
@@ -50,15 +62,30 @@
 #include
 #include
 
+// needed for MPIX_CUDA_AWARE_SUPPORT
+#include
+#include
+
+
+/* TODO: remove once we use PaRSEC master */
+#ifndef PARSEC_HAVE_CUDA
+#define PARSEC_HAVE_CUDA 1
+#endif  // PARSEC_HAVE_CUDA
+
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
+/* TODO: once we use parsec master we need to switch this include */
+#include
+#include
 #if defined(PARSEC_PROF_TRACE)
 #include
 #undef PARSEC_TTG_PROFILE_BACKEND
@@ -70,6 +97,9 @@
 #include
 
 #include "ttg/parsec/ttg_data_copy.h"
+#include "ttg/parsec/thread_local.h"
+#include "ttg/parsec/ptr.h"
+#include "ttg/parsec/task.h"
 
 #undef TTG_PARSEC_DEBUG_TRACK_DATA_COPIES
 
@@ -77,17 +107,14 @@
 #include
 #endif
 
-/* Whether to defer a potential writer if there are readers.
- * This may avoid extra copies in exchange for concurrency.
- * This may cause deadlocks, so use with caution. */
-#define TTG_PARSEC_DEFER_WRITER false
-
 /* PaRSEC function declarations */
 extern "C" {
   void parsec_taskpool_termination_detected(parsec_taskpool_t *tp);
   int parsec_add_fetch_runtime_task(parsec_taskpool_t *tp, int tasks);
 }
 
+#include "ttg/view.h"
+
 namespace ttg_parsec {
 
   inline thread_local parsec_execution_stream_t *parsec_ttg_es;
@@ -99,16 +126,28 @@
   inline std::multimap delayed_unpack_actions;
 
   struct msg_header_t {
-    typedef enum {
+    typedef enum fn_id : std::int8_t {
       MSG_SET_ARG = 0,
       MSG_SET_ARGSTREAM_SIZE = 1,
       MSG_FINALIZE_ARGSTREAM_SIZE = 2,
-      MSG_GET_FROM_PULL =3 } fn_id_t;
+      MSG_GET_FROM_PULL = 3 } fn_id_t;
     uint32_t taskpool_id;
     uint64_t op_id;
+    std::size_t key_offset = 0;
     fn_id_t fn_id;
+    std::int8_t num_iovecs = 0;
    int32_t param_id;
-    int num_keys;
+    int num_keys = 0;
+    int sender;
+
+    msg_header_t(fn_id_t fid, uint32_t tid, uint64_t oid, int32_t pid, int sender, int nk)
+      : fn_id(fid)
+      , taskpool_id(tid)
+      , op_id(oid)
+      , param_id(pid)
+      , num_keys(nk)
+      , sender(sender)
+    { }
   };
 
   static void unregister_parsec_tags(void *_);
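One caveat in the new msg_header_t constructor: C++ initializes non-static members in declaration order, not initializer-list order, so fn_id(fid) listed first still runs after taskpool_id(tid). That is harmless here because no initializer reads another member, though compilers flag it under -Wreorder. A minimal demonstration:

    #include <iostream>

    struct demo {
      int a;
      int b;
      // Initializer-list order (b before a) does not change construction order:
      // 'a' is still initialized first because it is declared first.
      demo() : b(2), a(1) {}  // -Wreorder warns; behavior is still well defined
    };

    int main() {
      demo d;
      std::cout << d.a << ' ' << d.b << '\n';  // prints: 1 2
    }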
@@ -167,8 +206,7 @@ namespace ttg_parsec {
   }  // namespace detail
 
   class WorldImpl : public ttg::base::WorldImplBase {
-    static constexpr const int _PARSEC_TTG_TAG = 10;      // This TAG should be 'allocated' at the PaRSEC level
-    static constexpr const int _PARSEC_TTG_RMA_TAG = 11;  // This TAG should be 'allocated' at the PaRSEC level
+    int32_t parsec_comm_engine_cb_idx;
 
     ttg::Edge<> m_ctl_edge;
     bool _dag_profiling;
@@ -186,6 +224,18 @@
       return comm_rank;
     }
 
+    static void ttg_parsec_ce_up(parsec_comm_engine_t *comm_engine, void *user_data)
+    {
+      parsec_ce.tag_register(WorldImpl::parsec_ttg_tag(), &detail::static_unpack_msg, user_data, PARSEC_TTG_MAX_AM_SIZE);
+      parsec_ce.tag_register(WorldImpl::parsec_ttg_rma_tag(), &detail::get_remote_complete_cb, user_data, 128);
+    }
+
+    static void ttg_parsec_ce_down(parsec_comm_engine_t *comm_engine, void *user_data)
+    {
+      parsec_ce.tag_unregister(WorldImpl::parsec_ttg_tag());
+      parsec_ce.tag_unregister(WorldImpl::parsec_ttg_rma_tag());
+    }
+
    public:
 #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
     int parsec_ttg_profile_backend_set_arg_start, parsec_ttg_profile_backend_set_arg_end;
@@ -193,7 +243,7 @@
     int parsec_ttg_profile_backend_allocate_datacopy, parsec_ttg_profile_backend_free_datacopy;
 #endif
 
-    static constexpr const int PARSEC_TTG_MAX_AM_SIZE = 1024 * 1024;
+    static constexpr const int PARSEC_TTG_MAX_AM_SIZE = 4 * 1024;
     WorldImpl(int *argc, char **argv[], int ncores, parsec_context_t *c = nullptr)
         : WorldImplBase(query_comm_size(), query_comm_rank())
         , ctx(c)
@@ -228,21 +278,27 @@
 
       es = ctx->virtual_processes[0]->execution_streams[0];
 
-      parsec_ce.tag_register(_PARSEC_TTG_TAG, &detail::static_unpack_msg, this, PARSEC_TTG_MAX_AM_SIZE);
-      parsec_ce.tag_register(_PARSEC_TTG_RMA_TAG, &detail::get_remote_complete_cb, this, 128);
+      parsec_comm_engine_cb_idx = parsec_comm_engine_register_callback(ttg_parsec_ce_up, this, ttg_parsec_ce_down, this);
 
       create_tpool();
     }
 
     void create_tpool() {
       assert(nullptr == tpool);
-      tpool = (parsec_taskpool_t *)calloc(1, sizeof(parsec_taskpool_t));
+      tpool = PARSEC_OBJ_NEW(parsec_taskpool_t);
       tpool->taskpool_id = -1;
       tpool->update_nb_runtime_task = parsec_add_fetch_runtime_task;
       tpool->taskpool_type = PARSEC_TASKPOOL_TYPE_TTG;
-      tpool->taskpool_name = (char*)"TTG Taskpool";
+      tpool->taskpool_name = strdup("TTG Taskpool");
       parsec_taskpool_reserve_id(tpool);
 
+      tpool->devices_index_mask = 0;
+      for(int i = 0; i < (int)parsec_nb_devices; i++) {
+        parsec_device_module_t *device = parsec_mca_device_get(i);
+        if( NULL == device ) continue;
+        tpool->devices_index_mask |= (1 << device->device_index);
+      }
+
 #ifdef TTG_USE_USER_TERMDET
       parsec_termdet_open_module(tpool, "user_trigger");
 #else  // TTG_USE_USER_TERMDET
@@ -254,7 +310,7 @@
       // be added by the main thread. It should then be initialized
       // to 0, execute will set it to 1 and mark the tpool as ready,
       // and the fence() will decrease it back to 0.
-      tpool->tdm.module->taskpool_set_nb_pa(tpool, 0);
+      tpool->tdm.module->taskpool_set_runtime_actions(tpool, 0);
       parsec_taskpool_enable(tpool, NULL, NULL, es, size() > 1);
 
 #if defined(PARSEC_PROF_TRACE)
@@ -284,15 +340,15 @@
 
     ~WorldImpl() { destroy(); }
 
-    static constexpr int parsec_ttg_tag() { return _PARSEC_TTG_TAG; }
-    static constexpr int parsec_ttg_rma_tag() { return _PARSEC_TTG_RMA_TAG; }
+    static constexpr int parsec_ttg_tag() { return PARSEC_DSL_TTG_TAG; }
+    static constexpr int parsec_ttg_rma_tag() { return PARSEC_DSL_TTG_RMA_TAG; }
 
     MPI_Comm comm() const { return MPI_COMM_WORLD; }
 
     virtual void execute() override {
       if (!parsec_taskpool_started) {
         parsec_enqueue(ctx, tpool);
-        tpool->tdm.module->taskpool_addto_nb_pa(tpool, 1);
+        tpool->tdm.module->taskpool_addto_runtime_actions(tpool, 1);
         tpool->tdm.module->taskpool_ready(tpool);
         [[maybe_unused]] auto ret = parsec_context_start(ctx);
         // ignore ret since all of its nonzero values are OK (e.g. -1 due to ctx already being active)
@@ -317,7 +373,7 @@
       if (is_valid()) {
         if (parsec_taskpool_started) {
           // We are locally ready (i.e. we won't add new tasks)
-          tpool->tdm.module->taskpool_addto_nb_pa(tpool, -1);
+          tpool->tdm.module->taskpool_addto_runtime_actions(tpool, -1);
           ttg::trace("ttg_parsec(", this->rank(), "): final waiting for completion");
           if (own_ctx)
             parsec_context_wait(ctx);
@@ -328,9 +384,9 @@
         ttg::detail::deregister_world(*this);
         destroy_tpool();
         if (own_ctx) {
-          unregister_parsec_tags(nullptr);
+          unregister_parsec_tags(&parsec_comm_engine_cb_idx);
         } else {
-          parsec_context_at_fini(unregister_parsec_tags, nullptr);
+          parsec_context_at_fini(unregister_parsec_tags, &parsec_comm_engine_cb_idx);
         }
 #if defined(PARSEC_PROF_TRACE)
         if(nullptr != profiling_array) {
@@ -354,8 +410,8 @@
 
     void increment_created() { taskpool()->tdm.module->taskpool_addto_nb_tasks(taskpool(), 1); }
 
-    void increment_inflight_msg() { taskpool()->tdm.module->taskpool_addto_nb_pa(taskpool(), 1); }
-    void decrement_inflight_msg() { taskpool()->tdm.module->taskpool_addto_nb_pa(taskpool(), -1); }
+    void increment_inflight_msg() { taskpool()->tdm.module->taskpool_addto_runtime_actions(taskpool(), 1); }
+    void decrement_inflight_msg() { taskpool()->tdm.module->taskpool_addto_runtime_actions(taskpool(), -1); }
 
     bool dag_profiling() override { return _dag_profiling; }
@@ -438,7 +494,10 @@
       assert(0 == tpool->profiling_array[2*position]);
       assert(0 == tpool->profiling_array[2*position+1]);
 
-      parsec_profiling_add_dictionary_keyword(name, "fill:000000", 0, NULL,
+      // TODO PROFILING: 0 and NULL should be replaced with something that depends on the key human-readable serialization...
+      // Typically, we would put something like 3*sizeof(int32_t), "m{int32_t};n{int32_t};k{int32_t}" to say
+      // there are three fields, named m, n and k, stored in this order, and each of size int32_t
+      parsec_profiling_add_dictionary_keyword(name, "fill:000000", 64, "key{char[64]}",
                                               (int*)&tpool->profiling_array[2*position],
                                               (int*)&tpool->profiling_array[2*position+1]);
     }
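The third and fourth arguments of parsec_profiling_add_dictionary_keyword describe the event payload so trace viewers can decode it; the hunk above switches from no payload (0, NULL) to a fixed 64-byte key{char[64]} blob. Following the convention sketched in the TODO, a task class with an (m, n, k) integer key could instead register like this (variable names here are hypothetical):

    /* Hypothetical example, not in this patch: describe a 3-int key so that
     * profiling tools render it as named fields instead of a raw blob. */
    int prof_key_start, prof_key_end;
    parsec_profiling_add_dictionary_keyword(
        "GEMM", "fill:00FF00",
        3 * sizeof(int32_t), "m{int32_t};n{int32_t};k{int32_t}",
        &prof_key_start, &prof_key_end);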
@@ -453,7 +512,7 @@
       }
       ttg::trace("ttg_parsec::(", rank, "): parsec taskpool is ready for completion");
       // We are locally ready (i.e. we won't add new tasks)
-      tpool->tdm.module->taskpool_addto_nb_pa(tpool, -1);
+      tpool->tdm.module->taskpool_addto_runtime_actions(tpool, -1);
       ttg::trace("ttg_parsec(", rank, "): waiting for completion");
       parsec_taskpool_wait(tpool);
@@ -479,16 +538,15 @@
 #endif
   };
 
-  static void unregister_parsec_tags(void *_)
+  static void unregister_parsec_tags(void *_pidx)
   {
-    parsec_ce.tag_unregister(WorldImpl::parsec_ttg_tag());
-    parsec_ce.tag_unregister(WorldImpl::parsec_ttg_rma_tag());
+    int32_t *pidx = static_cast<int32_t*>(_pidx);
+    parsec_comm_engine_unregister_callback(*pidx);
+    *pidx = 0;
   }
 
   namespace detail {
 
-    typedef void (*parsec_static_op_t)(void *);  // static_op will be cast to this type
-
     const parsec_symbol_t parsec_taskclass_param0 = {
       .flags = PARSEC_SYMBOL_IS_STANDALONE|PARSEC_SYMBOL_IS_GLOBAL,
       .name = "HASH0",
@@ -522,167 +580,14 @@
       .expr_inc = nullptr,
       .cst_inc = 0 };
 
-    struct parsec_ttg_task_base_t {
-      parsec_task_t parsec_task;
-      int32_t in_data_count = 0;  //< number of satisfied inputs
-      int32_t data_count = 0;     //< number of data elements in parsec_task.data
-      parsec_hash_table_item_t tt_ht_item = {};
-      parsec_static_op_t function_template_class_ptr[ttg::runtime_traits<ttg::Runtime::PaRSEC>::num_execution_spaces] =
-          {nullptr};
-      bool is_dummy = false;
-      bool defer_writer = TTG_PARSEC_DEFER_WRITER;  // whether to defer writer instead of creating a new copy
-
-      typedef void (release_task_fn)(parsec_ttg_task_base_t*);
-
-      typedef struct {
-        std::size_t goal;
-        std::size_t size;
-      } size_goal_t;
-
-      /* Poor-mans virtual function
-       * We cannot use virtual inheritance or private visibility because we
-       * need offsetof for the mempool and scheduling.
-       */
-      release_task_fn* release_task_cb = nullptr;
-      bool remove_from_hash = true;
-
-      /*
-      virtual void release_task() = 0;
-      */
-      //public:
-      void release_task() {
-        release_task_cb(this);
-      }
-
-     protected:
-      /**
-       * Protected constructors: this class should not be instantiated directly
-       * but always be used through parsec_ttg_task_t.
-       */
-
-      parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, int data_count,
-                             bool defer_writer = TTG_PARSEC_DEFER_WRITER)
-          : data_count(data_count), defer_writer(defer_writer) {
-        PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super);
-        parsec_task.mempool_owner = mempool;
-        parsec_task.task_class = task_class;
-        parsec_task.priority = 0;
-      }
-
-      parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
-                             parsec_taskpool_t *taskpool, int32_t priority, int data_count,
-                             release_task_fn *release_fn,
-                             bool defer_writer = TTG_PARSEC_DEFER_WRITER)
-          : data_count(data_count)
-          , defer_writer(defer_writer)
-          , release_task_cb(release_fn) {
-        int32_t p = priority;
-        PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super);
-        parsec_task.mempool_owner = mempool;
-        parsec_task.task_class = task_class;
-        parsec_task.status = PARSEC_TASK_STATUS_HOOK;
-        parsec_task.taskpool = taskpool;
-        parsec_task.priority = priority;
-        parsec_task.chore_id = 0;
-      }
-
-     public:
-      void set_dummy(bool d) { is_dummy = d; }
-      bool dummy() { return is_dummy; }
-    };
-
-    template <typename TT, bool KeyIsVoid = ttg::meta::is_void_v<typename TT::key_type>>
-    struct parsec_ttg_task_t : public parsec_ttg_task_base_t {
-      using key_type = typename TT::key_type;
-      static constexpr size_t num_streams = TT::numins;
-      TT* tt;
-      key_type key;
-      size_goal_t stream[num_streams] = {};
-
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
-          : parsec_ttg_task_base_t(mempool, task_class, num_streams) {
-        tt_ht_item.key = pkey();
-
-        for (int i = 0; i < num_streams; ++i) {
-          parsec_task.data[i].data_in = nullptr;
-        }
-
-        // We store the hash of the key and the address where it can be found in locals considered as a scratchpad
-        *(uintptr_t*)&(parsec_task.locals[0]) = 0;  //there is no key
-        *(uintptr_t*)&(parsec_task.locals[2]) = 0;  //there is no key
-      }
-
-      parsec_ttg_task_t(const key_type& key, parsec_thread_mempool_t *mempool,
-                        parsec_task_class_t *task_class, parsec_taskpool_t *taskpool,
-                        TT *tt_ptr, int32_t priority)
-          : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority,
-                                   num_streams, &release_task, tt_ptr->m_defer_writer)
-          , tt(tt_ptr), key(key) {
-        tt_ht_item.key = pkey();
-
-        for (int i = 0; i < num_streams; ++i) {
-          parsec_task.data[i].data_in = nullptr;
-        }
-
-        // We store the hash of the key and the address where it can be found in locals considered as a scratchpad
-        uint64_t hv = ttg::hash<std::decay_t<decltype(key)>>{}(key);
-        *(uintptr_t*)&(parsec_task.locals[0]) = hv;
-        *(uintptr_t*)&(parsec_task.locals[2]) = reinterpret_cast<uintptr_t>(&this->key);
-      }
-
-      static void release_task(parsec_ttg_task_base_t* task_base) {
-        parsec_ttg_task_t *task = static_cast<parsec_ttg_task_t*>(task_base);
-        TT *tt = task->tt;
-        tt->release_task(task);
-      }
-
-      parsec_key_t pkey() { return reinterpret_cast<parsec_key_t>(&key); }
-    };
-
-    template <typename TT>
-    struct parsec_ttg_task_t<TT, true> : public parsec_ttg_task_base_t {
-      static constexpr size_t num_streams = TT::numins;
-      TT* tt;
-      size_goal_t stream[num_streams] = {};
-
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
-          : parsec_ttg_task_base_t(mempool, task_class, num_streams) {
-        tt_ht_item.key = pkey();
-
-        for (int i = 0; i < num_streams; ++i) {
-          parsec_task.data[i].data_in = nullptr;
-        }
-      }
-
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
-                        parsec_taskpool_t *taskpool, TT *tt_ptr, int32_t priority)
-          : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority,
-                                   num_streams, &release_task, tt_ptr->m_defer_writer)
-          , tt(tt_ptr) {
-        tt_ht_item.key = pkey();
-
-        for (int i = 0; i < num_streams; ++i) {
-          parsec_task.data[i].data_in = nullptr;
-        }
-      }
-
-      static void release_task(parsec_ttg_task_base_t* task_base) {
-        parsec_ttg_task_t *task = static_cast<parsec_ttg_task_t*>(task_base);
-        TT *tt = task->tt;
-        tt->release_task(task);
-      }
-
-      parsec_key_t pkey() { return 0; }
-    };
-
     inline ttg_data_copy_t *find_copy_in_task(parsec_ttg_task_base_t *task, const void *ptr) {
       ttg_data_copy_t *res = nullptr;
       if (task == nullptr || ptr == nullptr) {
         return res;
       }
       for (int i = 0; i < task->data_count; ++i) {
-        auto copy = static_cast<ttg_data_copy_t *>(task->parsec_task.data[i].data_in);
-        if (NULL != copy && copy->device_private == ptr) {
+        auto copy = static_cast<ttg_data_copy_t *>(task->copies[i]);
+        if (NULL != copy && copy->get_ptr() == ptr) {
           res = copy;
           break;
         }
@@ -696,8 +601,8 @@
         return i;
       }
       for (i = 0; i < task->data_count; ++i) {
-        auto copy = static_cast<ttg_data_copy_t *>(task->parsec_task.data[i].data_in);
-        if (NULL != copy && copy->device_private == ptr) {
+        auto copy = static_cast<ttg_data_copy_t *>(task->copies[i]);
+        if (NULL != copy && copy->get_ptr() == ptr) {
           return i;
         }
       }
@@ -713,7 +618,7 @@
         throw std::logic_error("Too many data copies, check MAX_PARAM_COUNT!");
       }
 
-      task->parsec_task.data[task->data_count].data_in = copy;
+      task->copies[task->data_count] = copy;
       task->data_count++;
       return true;
     }
@@ -722,17 +627,17 @@
       int i;
       /* find and remove entry; copies are usually appended and removed, so start from back */
       for (i = task->data_count-1; i >= 0; --i) {
-        if (copy == task->parsec_task.data[i].data_in) {
+        if (copy == task->copies[i]) {
           break;
         }
       }
       if (i < 0) return;
       /* move all following elements one up */
       for (; i < task->data_count - 1; ++i) {
-        task->parsec_task.data[i].data_in = task->parsec_task.data[i + 1].data_in;
+        task->copies[i] = task->copies[i + 1];
       }
       /* null last element */
-      task->parsec_task.data[i].data_in = nullptr;
+      task->copies[i] = nullptr;
       task->data_count--;
     }
@@ -748,7 +653,13 @@
     template <typename Value>
     inline ttg_data_copy_t *create_new_datacopy(Value &&value) {
       using value_type = std::decay_t<Value>;
-      ttg_data_copy_t *copy = new ttg_data_value_copy_t<value_type>(std::forward<Value>(value));
+      ttg_data_copy_t *copy;
+      if constexpr (std::is_rvalue_reference_v<decltype(value)> ||
+                    std::is_copy_constructible_v<std::decay_t<Value>>) {
+        copy = new ttg_data_value_copy_t<value_type>(std::forward<Value>(value));
+      } else {
+        throw std::logic_error("Trying to copy-construct data that is not copy-constructible!");
+      }
 #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
       // Keep track of additional memory usage
       if(ttg::default_execution_context().impl().profiling()) {
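Note what the new guard in create_new_datacopy buys: a move-only value can enter the runtime only as an rvalue, while passing an lvalue of a non-copyable type still compiles (the copy branch is discarded by if constexpr) but throws at run time. A small sketch under that reading:

    #include <memory>
    #include <utility>

    // Sketch of how the is_rvalue/is_copy_constructible guard plays out
    // for a move-only payload (illustrative only).
    struct payload {
      std::unique_ptr<int> p;  // move-only: not copy-constructible
    };

    void demo() {
      payload v{std::make_unique<int>(42)};
      auto *c1 = ttg_parsec::detail::create_new_datacopy(std::move(v));  // OK: move-constructed
      // auto *c2 = ttg_parsec::detail::create_new_datacopy(v);  // would throw std::logic_error
      (void)c1;
    }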
@@ -782,10 +693,11 @@
     inline parsec_hook_return_t hook_cuda(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) {
       parsec_execution_stream_t *safe_es = parsec_ttg_es;
       parsec_ttg_es = es;
+      std::cout << "hook_cuda task " << parsec_task << std::endl;
       parsec_ttg_task_base_t *me = (parsec_ttg_task_base_t *)parsec_task;
-      me->function_template_class_ptr[static_cast<int>(ttg::ExecutionSpace::CUDA)](parsec_task);
+      auto ret = me->function_template_class_ptr[static_cast<int>(ttg::ExecutionSpace::CUDA)](parsec_task);
       parsec_ttg_es = safe_es;
-      return PARSEC_HOOK_RETURN_DONE;
+      return ret;
     }
 
     static parsec_key_fn_t parsec_tasks_hash_fcts = {.key_equal = parsec_hash_table_generic_64bits_key_equal,
@@ -872,7 +784,7 @@
     }
 
     inline void release_data_copy(ttg_data_copy_t *copy) {
-      if (copy->is_mutable()) {
+      if (copy->is_mutable() && nullptr == copy->get_next_task()) {
         /* current task mutated the data but there are no consumers so prepare
          * the copy to be freed below */
         copy->reset_readers();
@@ -882,19 +794,24 @@
       if (readers > 1) {
         /* potentially more than one reader, decrement atomically */
         readers = copy->decrement_readers();
-      }
-      /* if there was only one reader (the current task) we release the copy */
-      if (1 == readers) {
-        if (nullptr != copy->push_task) {
+      } else if (readers == 1) {
+        /* make sure readers drop to zero */
+        readers = copy->decrement_readers();
+      }
+      /* if there was only one reader (the current task) or
+       * a mutable copy and a successor, we release the copy */
+      if (1 == readers || copy->is_mutable()) {
+        if (nullptr != copy->get_next_task()) {
           /* Release the deferred task.
-           * The copy was mutable and will be mutated by the released task,
-           * so simply transfer ownership.
-           */
-          parsec_task_t *push_task = copy->push_task;
-          copy->push_task = nullptr;
-          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)push_task;
+           * The copy was mutable and will be mutated by the released task,
+           * so simply transfer ownership.
+           */
+          parsec_task_t *next_task = copy->get_next_task();
+          copy->set_next_task(nullptr);
+          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)next_task;
          deferred_op->release_task();
-        } else {
+        } else if ((1 == copy->num_ref()) || (1 == copy->drop_ref())) {
+          /* we are the last reference, delete the copy */
 #if defined(TTG_PARSEC_DEBUG_TRACK_DATA_COPIES)
           {
            const std::lock_guard<std::mutex> lock(pending_copies_mutex);
@@ -930,10 +847,10 @@
       }
 
       if (readers == copy_in->mutable_tag) {
-        if (copy_res->push_task != nullptr) {
+        if (copy_res->get_next_task() != nullptr) {
           if (readonly) {
-            parsec_ttg_task_base_t *push_task = reinterpret_cast<parsec_ttg_task_base_t *>(copy_res->push_task);
-            if (push_task->defer_writer) {
+            parsec_ttg_task_base_t *next_task = reinterpret_cast<parsec_ttg_task_base_t *>(copy_res->get_next_task());
+            if (next_task->defer_writer) {
               /* there is a writer but it signalled that it wants to wait for readers to complete */
               return copy_res;
             }
@@ -968,13 +885,12 @@
            * of the task */
           copy_in->mark_mutable();
-          assert(nullptr == copy_in->push_task);
-          assert(nullptr != task);
-          copy_in->push_task = &task->parsec_task;
+          assert(nullptr == copy_in->get_next_task());
+          copy_in->set_next_task(&task->parsec_task);
         } else {
-          if (task->defer_writer && nullptr == copy_in->push_task) {
+          if (task->defer_writer && nullptr == copy_in->get_next_task()) {
             /* we're the first writer and want to wait for all readers to complete */
-            copy_res->push_task = &task->parsec_task;
+            copy_res->set_next_task(&task->parsec_task);
           } else {
             /* there are already writers and/or waiters on this copy, make a copy that we can mutate */
             copy_res = NULL;
           }
         }
       }
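The reader/writer handshake spread across release_data_copy and register_data_copy is easiest to see end to end. The toy model below compresses it to the two calls shown above; it is illustrative only and elides the atomics and the mutable_tag encoding of the real ttg_data_copy_t:

    #include <cassert>

    // Toy model (not the real ttg_data_copy_t) of the defer-writer handshake:
    // the last reader to finish hands the copy to the parked writer.
    struct toy_copy {
      int readers = 0;
      void *next_task = nullptr;  // parked writer, set via set_next_task()
    };

    void *reader_done(toy_copy &c) {       // models release_data_copy()
      assert(c.readers > 0);
      if (--c.readers == 0 && c.next_task) {
        void *writer = c.next_task;        // transfer ownership to the writer
        c.next_task = nullptr;
        return writer;                     // caller would release_task(writer)
      }
      return nullptr;
    }

    bool writer_arrives(toy_copy &c, void *task, bool defer_writer) {
      if (c.readers == 0) return true;     // run immediately, copy is mutable
      if (defer_writer && !c.next_task) {  // park and wait for readers
        c.next_task = task;
        return false;
      }
      return true;  // would clone via create_new_datacopy() and mutate the clone
    }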
@@ -983,19 +899,19 @@
 
       if (NULL == copy_res) {
-        ttg_data_copy_t *new_copy = detail::create_new_datacopy(*static_cast<Value *>(copy_in->device_private));
-        if (replace && nullptr != copy_in->push_task) {
+        ttg_data_copy_t *new_copy = detail::create_new_datacopy(*static_cast<Value *>(copy_in->get_ptr()));
+        if (replace && nullptr != copy_in->get_next_task()) {
           /* replace the task that was deferred */
-          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)copy_in->push_task;
+          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)copy_in->get_next_task();
           new_copy->mark_mutable();
           /* replace the copy in the deferred task */
           for (int i = 0; i < deferred_op->data_count; ++i) {
-            if (deferred_op->parsec_task.data[i].data_in == copy_in) {
-              deferred_op->parsec_task.data[i].data_in = new_copy;
+            if (deferred_op->copies[i] == copy_in) {
+              deferred_op->copies[i] = new_copy;
               break;
             }
           }
-          copy_in->push_task = nullptr;
+          copy_in->set_next_task(nullptr);
           deferred_op->release_task();
           copy_in->reset_readers();      // set the copy back to being read-only
           copy_in->increment_readers();  // register as reader
@@ -1012,8 +928,6 @@
 
   }  // namespace detail
 
-  inline thread_local detail::parsec_ttg_task_base_t *parsec_ttg_caller;
-
   inline void ttg_initialize(int argc, char **argv, int num_threads, parsec_context_t *ctx) {
     if (detail::initialized_mpi()) throw std::runtime_error("ttg_parsec::ttg_initialize: can only be called once");
@@ -1041,11 +955,13 @@
     if(0 == ttg::default_execution_context().rank()) ttg::default_execution_context().impl().final_task();
     ttg::detail::set_default_world(ttg::World{});  // reset the default world
+    detail::ptr::drop_all_ptr();
     ttg::detail::destroy_worlds();
     if (detail::initialized_mpi()) MPI_Finalize();
   }
 
   inline ttg::World ttg_default_execution_context() { return ttg::get_default_world(); }
-  inline void ttg_abort() { MPI_Abort(ttg_default_execution_context().impl().comm(), 1); }
+  [[noreturn]]
+  inline void ttg_abort() { MPI_Abort(ttg_default_execution_context().impl().comm(), 1); std::abort(); }
   inline void ttg_execute(ttg::World world) { world.impl().execute(); }
   inline void ttg_fence(ttg::World world) { world.impl().fence(); }
@@ -1111,8 +1027,14 @@
       unsigned char bytes[WorldImpl::PARSEC_TTG_MAX_AM_SIZE - sizeof(msg_header_t)];
 
       msg_t() = default;
-      msg_t(uint64_t tt_id, uint32_t taskpool_id, msg_header_t::fn_id_t fn_id, int32_t param_id, int num_keys = 1)
-          : tt_id{taskpool_id, tt_id, fn_id, param_id, num_keys} {}
+      msg_t(uint64_t tt_id,
+            uint32_t taskpool_id,
+            msg_header_t::fn_id_t fn_id,
+            int32_t param_id,
+            int sender,
+            int num_keys = 1)
+          : tt_id(fn_id, taskpool_id, tt_id, param_id, sender, num_keys)
+      {}
     };
   }  // namespace detail
@@ -1135,7 +1057,7 @@
 
     // check for a non-type member named have_cuda_op
     template <typename T>
-    using have_cuda_op_non_type_t = decltype(&T::have_cuda_op);
+    using have_cuda_op_non_type_t = decltype(T::have_cuda_op);
 
     bool alive = true;
 
@@ -1144,6 +1066,7 @@
     static constexpr int numouts = std::tuple_size_v<output_terminals_type>;  // number of outputs
     static constexpr int numflows = std::max(numins, numouts);                // max number of flows
 
+   public:
     /// @return true if derivedT::have_cuda_op exists and is defined to true
     static constexpr bool derived_has_cuda_op() {
       if constexpr (ttg::meta::is_detected_v<have_cuda_op_non_type_t, derivedT>) {
         return derivedT::have_cuda_op;
       } else {
         return false;
       }
     }
 
-   public:
     using ttT = TT;
     using key_type = keyT;
     using input_terminals_type = ttg::detail::input_terminals_tuple_t<keyT, input_valueTs...>;
@@ -1230,6 +1152,13 @@
     constexpr static std::array get_from_pull_msg_fcts =
         make_get_from_pull_fcts(std::make_index_sequence<numins>{});
 
+    template <std::size_t... IS>
+    constexpr static auto make_input_is_const(std::index_sequence<IS...>) {
+      using resultT = decltype(input_is_const);
+      return resultT{{std::is_const_v<std::tuple_element_t<IS, input_args_type>>...}};
+    }
+    constexpr static std::array<bool, numins> input_is_const = make_input_is_const(std::make_index_sequence<numins>{});
+
     ttg::World world;
     ttg::meta::detail::keymap_t<keyT> keymap;
     ttg::meta::detail::keymap_t<keyT> priomap;
@@ -1246,15 +1175,36 @@
    private:
     /// dispatches a call to derivedT::op if Space == Host, otherwise to derivedT::op_cuda if Space == CUDA
+    /// @return void if called a synchronous function, or ttg::coroutine_handle<> if called a coroutine (if non-null,
+    /// points to the suspended coroutine)
     template <ttg::ExecutionSpace Space, typename... Args>
-    void op(Args &&...args) {
+    auto op(Args &&...args) {
       derivedT *derived = static_cast<derivedT *>(this);
-      if constexpr (Space == ttg::ExecutionSpace::Host)
-        derived->op(std::forward<Args>(args)...);
-      else if constexpr (Space == ttg::ExecutionSpace::CUDA)
-        derived->op_cuda(std::forward<Args>(args)...);
+      // TODO: do we still distinguish op and op_cuda? How do we handle support for multiple devices?
+      //if constexpr (Space == ttg::ExecutionSpace::Host) {
+        using return_type = decltype(derived->op(std::forward<Args>(args)...));
+        if constexpr (std::is_same_v<return_type, void>) {
+          derived->op(std::forward<Args>(args)...);
+          return;
+        }
+        else {
+          return derived->op(std::forward<Args>(args)...);
+        }
+#if 0
+      }
+      else if constexpr (Space == ttg::ExecutionSpace::CUDA) {
+        using return_type = decltype(derived->op_cuda(std::forward<Args>(args)...));
+        if constexpr (std::is_same_v<return_type, void>) {
+          derived->op_cuda(std::forward<Args>(args)...);
+          return;
+        }
+        else {
+          return derived->op_cuda(std::forward<Args>(args)...);
+        }
+      }
       else
-        abort();
+        ttg::abort();
+#endif // 0
     }
 
     template
@@ -1277,7 +1227,8 @@
         auto &world_impl = world.impl();
         parsec_taskpool_t *tp = world_impl.taskpool();
         std::unique_ptr<msg_t> msg = std::make_unique<msg_t>(get_instance_id(), tp->taskpool_id,
-                                                             msg_header_t::MSG_GET_FROM_PULL, i, 1);
+                                                             msg_header_t::MSG_GET_FROM_PULL, i,
+                                                             world.rank(), 1);
         /* pack the key */
         size_t pos = 0;
         pos = pack(key, msg->bytes, pos);
@@ -1299,60 +1250,364 @@
     template <std::size_t... IS>
     static input_refs_tuple_type make_tuple_of_ref_from_array(task_t *task, std::index_sequence<IS...>) {
       return input_refs_tuple_type{static_cast<std::tuple_element_t<IS, input_refs_tuple_type>>(
           *reinterpret_cast<std::remove_reference_t<std::tuple_element_t<IS, input_refs_tuple_type>> *>(
-              task->parsec_task.data[IS].data_in->device_private))...};
+              task->copies[IS]->get_ptr()))...};
     }
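The decltype probe in op() is the usual pattern for wrapping a callee that may return either void or a value (here, possibly a coroutine handle): void cannot be bound to a variable, so the wrapper must branch at compile time. Reduced to a standalone utility:

    #include <type_traits>
    #include <utility>

    // Standalone sketch of the void-vs-value forwarding used by op() above.
    template <typename F, typename... Args>
    auto forward_result(F &&f, Args &&...args) {
      using return_type = decltype(f(std::forward<Args>(args)...));
      if constexpr (std::is_same_v<return_type, void>) {
        f(std::forward<Args>(args)...);  // void: nothing to return
        return;
      } else {
        return f(std::forward<Args>(args)...);  // e.g. a coroutine handle
      }
    }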
+    /**
+     * Submit callback called by PaRSEC once all input transfers have completed.
+     */
     template <ttg::ExecutionSpace Space>
-    static void static_op(parsec_task_t *parsec_task) {
+    static int device_static_submit(parsec_device_gpu_module_t *gpu_device,
+                                    parsec_gpu_task_t *gpu_task,
+                                    parsec_gpu_exec_stream_t *gpu_stream) {
+
+      task_t *task = (task_t*)gpu_task->ec;
+      // get the device task from the coroutine handle
+      ttg::device_task dev_task = ttg::device_task_handle_type::from_address(task->suspended_task_address);
+
+      task->dev_ptr->stream = gpu_stream;
+
+      std::cout << "device_static_submit task " << task << std::endl;
+
+      // get the promise which contains the views
+      auto dev_data = dev_task.promise();
+
+      /* we should still be waiting for the transfer to complete */
+      assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_TRANSFER);
+
+      /* Here we call back into the coroutine again after the transfers have completed */
+      static_op<Space>(&task->parsec_task);
+
+      /* get a new handle for the promise */
+      dev_task = ttg::device_task_handle_type::from_address(task->suspended_task_address);
+      dev_data = dev_task.promise();
+
+      assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_KERNEL ||
+             dev_data.state() == ttg::TTG_DEVICE_CORO_COMPLETE);
+
+      /* we will come back into this function once the kernel and transfers are complete */
+      int rc = PARSEC_HOOK_RETURN_AGAIN;
+      if (ttg::TTG_DEVICE_CORO_COMPLETE == dev_data.state()) {
+        /* the task started sending so we won't come back here */
+        rc = PARSEC_HOOK_RETURN_DONE;
+      }
+      return rc;
+    }
+
+    template <ttg::ExecutionSpace Space>
+    static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) {
+      static_assert(derived_has_cuda_op());
+
+      int dev_index;
+      double ratio = 1.0;
 
       task_t *task = (task_t*)parsec_task;
-      ttT *baseobj = task->tt;
-      derivedT *obj = static_cast<derivedT *>(baseobj);
-      assert(parsec_ttg_caller == NULL);
-      parsec_ttg_caller = static_cast<detail::parsec_ttg_task_base_t*>(task);
-      if (obj->tracing()) {
-        if constexpr (!ttg::meta::is_void_v<keyT>)
-          ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": executing");
-        else
-          ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : executing");
-      }
-
-      if constexpr (!ttg::meta::is_void_v<keyT> && !ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
-        auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence<numins>{});
-        baseobj->template op<Space>(task->key, std::move(input), obj->output_terminals);
-      } else if constexpr (!ttg::meta::is_void_v<keyT> && ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
-        baseobj->template op<Space>(task->key, obj->output_terminals);
-      } else if constexpr (ttg::meta::is_void_v<keyT> && !ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
-        auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence<numins>{});
-        baseobj->template op<Space>(std::move(input), obj->output_terminals);
-      } else if constexpr (ttg::meta::is_void_v<keyT> && ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
-        baseobj->template op<Space>(obj->output_terminals);
-      } else {
-        abort();
+      parsec_execution_stream_s *es = task->tt->world.impl().execution_stream();
+
+      std::cout << "device_static_op: task " << parsec_task << std::endl;
+
+
+      /* set up a device task */
+      parsec_gpu_task_t *gpu_task;
+      /* PaRSEC takes ownership of the gpu_task and frees it itself, so it must come from calloc */
+      gpu_task = static_cast<parsec_gpu_task_t*>(std::calloc(1, sizeof(*gpu_task)));
+      PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
+      gpu_task->ec = parsec_task;
+      gpu_task->task_type = 0;  // user task
+      gpu_task->load = 1.0;     // TODO: can we do better?
+      gpu_task->last_data_check_epoch = -1;  // used internally
+      gpu_task->pushout = 0;
+      gpu_task->submit = &TT::device_static_submit<Space>;
+
+      /* set the gpu_task so it's available in register_device_memory */
+      task->dev_ptr->gpu_task = gpu_task;
+
+      // first invocation of the coroutine to get the coroutine handle
+      static_op<Space>(parsec_task);
+
+      /* when we come back here, the flows in gpu_task are set (see register_device_memory) */
+
+      // get the device task from the coroutine handle
+      auto dev_task = ttg::device_task_handle_type::from_address(task->suspended_task_address);
+
+      // get the promise which contains the views
+      ttg::device_task_promise_type& dev_data = dev_task.promise();
+
+      /* for now make sure we're waiting for transfers and the coro hasn't skipped this step */
+      assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_TRANSFER);
+
+      /* TODO: is this the right place to set the mask? */
+      task->parsec_task.chore_mask = PARSEC_DEV_ALL;
+      /* get a device and come back if we need another one */
+      dev_index = parsec_get_best_device(parsec_task, ratio);
+      assert(dev_index >= 0);
+      if (dev_index < 2) {
+        return PARSEC_HOOK_RETURN_NEXT;  /* Fall back */
+      }
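The dev_index < 2 early-out relies on PaRSEC's device numbering, where the first two slots are not accelerators (the host CPU and the recursive device, if memory serves; worth confirming against the PaRSEC version in use). Returning PARSEC_HOOK_RETURN_NEXT makes the scheduler try the task's next incarnation, which is how this hook falls back to the host path. A simplified model of that dispatch loop (not PaRSEC's actual code):

    // Illustrative chore-dispatch loop: a hook returning PARSEC_HOOK_RETURN_NEXT
    // makes the scheduler try the task's next incarnation, so the device path
    // can fall back to the host implementation.
    parsec_hook_return_t try_chores(parsec_execution_stream_t *es, parsec_task_t *task,
                                    parsec_hook_t *const *hooks, int nhooks) {
      for (int i = 0; i < nhooks; ++i) {
        parsec_hook_return_t rc = hooks[i](es, task);
        if (rc != PARSEC_HOOK_RETURN_NEXT) return rc;  // DONE, AGAIN, ERROR, ...
      }
      return PARSEC_HOOK_RETURN_ERROR;  // no incarnation could run the task
    }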
+
+#if 0
+      // manage the gpu_task flows
+
+      // set the input flows
+      uint8_t i = 0;
+      /* TODO: need to free flows at the end of the task lifetime */
+      parsec_flow_t* flows = new parsec_flow_t[MAX_PARAM_COUNT];
+      for (auto& view : dev_data) {
+        void *host_obj = view.host_obj();
+
+        /* iterate over all viewspans of this view (i.e., all memory ranges in this view) */
+        for (auto& view_span : view) {
+          gpu_task->flow_nb_elts[i] = view_span.size();  // size in bytes
+          gpu_task->flow[i] = &flows[i];
+          parsec_flow_t flow = {.name = nullptr,
+                                .sym_type = PARSEC_SYM_INOUT,
+                                .flow_flags = view_span.is_sync_out() ? PARSEC_FLOW_ACCESS_RW : PARSEC_FLOW_ACCESS_READ,
+                                .flow_index = i,
+                                .flow_datatype_mask = ~0 };
+          std::cout << "view_span.is_sync_out() " << view_span.is_sync_out() << std::endl;
+          *(parsec_flow_t*)gpu_task->flow[i] = flow;  // why are flows constant?!
+
+          parsec_data_copy_t* copy = nullptr;
+          ttg_parsec::detail::ttg_data_copy_t* obj_copy = nullptr;
+          int input_obj_idx = -1;
+          /* try to find the view in the task and allocate a new copy if needed */
+          for (int i = 0; nullptr == copy && i < numins; ++i) {
+            ttg_parsec::detail::ttg_data_copy_t* obj_copy = task->copies[i];
+            if (obj_copy->get_ptr() == host_obj) {
+              for (auto& dev_copy : *obj_copy) {
+                if (view_span.data() == dev_copy->device_private) {
+                  copy = dev_copy;
+                  input_obj_idx = i;
+                  break;
+                }
+              }
+            }
+          }
+
+          /* push all data back out, EXCEPT if the host object is a const input for the task
+           * TODO [JS]: if PaRSEC lets us send from the device we can avoid pushing out here
+           */
+          if (input_obj_idx == -1 || !input_is_const[input_obj_idx]) {
+            gpu_task->pushout |= 1<<i;
+          }
+
+          if (nullptr == copy) {
+            copy->device_private = view_span.data();
+            copy->version = 1;  // this version is valid
+            data->nb_elts = view_span.size();
+            data->owner_device = 0;
+            copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
+            std::cout << "copy " << copy << " device_private " << copy->device_private << std::endl;
+          }
+
+          if (obj_copy != nullptr) {
+            /* add the device copy to the object copy */
+            obj_copy->add_device_copy(copy);
+          }
+          /* register the copy with the task */
+          task->parsec_task.data[i].data_in = copy;
+          task->parsec_task.data[i].source_repo_entry = NULL;
+
+          ++i;
+        }
       }
-      parsec_ttg_caller = NULL;
-      if (obj->tracing()) {
-        if constexpr (!ttg::meta::is_void_v<keyT>)
-          ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": done executing");
-        else
-          ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : done executing");
+      /* mark all other flows as ignored */
+      for (; i < MAX_PARAM_COUNT; ++i) {
+        gpu_task->flow_nb_elts[i] = 0;
+        flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
+        flows[i].flow_index = i;
+        gpu_task->flow[i] = &flows[i];
+        task->parsec_task.data[i].data_in = nullptr;
+        task->parsec_task.data[i].source_repo_entry = NULL;
       }
+#endif // 0
+
+      parsec_device_module_t *device = parsec_mca_device_get(dev_index);
+      assert(NULL != device);
+      switch(device->type) {
+
+#if defined(PARSEC_HAVE_CUDA)
+      case PARSEC_DEV_CUDA:
+        if constexpr (Space == ttg::ExecutionSpace::CUDA) {
+          /* TODO: we need custom staging functions because PaRSEC looks at the
+           * task-class to determine the number of flows. */
+          gpu_task->stage_in = parsec_default_cuda_stage_in;
+          gpu_task->stage_out = parsec_default_cuda_stage_out;
+          return parsec_cuda_kernel_scheduler(es, gpu_task, dev_index);
+        }
+        break;
+#endif
+      default:
+        break;
+      }
+      ttg::print_error(task->tt->get_name(), " : received mismatching device type ", device->type, " from PaRSEC");
+      ttg::abort();
+      return PARSEC_HOOK_RETURN_DONE;  // will not be reached
+    }
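Between device_static_op and device_static_submit, a device task walks a three-state protocol; the state names below come from the asserts above, and the step function is a reading of how submit maps states to hook return codes (a sketch; the real states live in the ttg device_task promise):

    // Sketch of the three-state device-task protocol implied by the asserts above.
    enum class dev_state { wait_transfer, wait_kernel, complete };

    // Models device_static_submit(): it is only invoked once input transfers
    // are done, resumes the coroutine, then keeps returning AGAIN until the
    // coroutine reports completion.
    int submit_step(dev_state s) {
      switch (s) {
        case dev_state::wait_kernel:   return PARSEC_HOOK_RETURN_AGAIN;  // kernel in flight, come back
        case dev_state::complete:      return PARSEC_HOOK_RETURN_DONE;   // sends started, done here
        case dev_state::wait_transfer: break;  // submit should not see this state
      }
      return PARSEC_HOOK_RETURN_ERROR;
    }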
+
+    template <ttg::ExecutionSpace Space>
+    static parsec_hook_return_t static_op(parsec_task_t *parsec_task) {
+
+      task_t *task = (task_t*)parsec_task;
+      void* suspended_task_address =
+#ifdef TTG_HAS_COROUTINE
+        task->suspended_task_address;  // non-null = need to resume the task
+#else
+        nullptr;
+#endif
+      //std::cout << "static_op: suspended_task_address " << suspended_task_address << std::endl;
+      if (suspended_task_address == nullptr) {  // task is a coroutine that has not started or an ordinary function
+
+        ttT *baseobj = task->tt;
+        derivedT *obj = static_cast<derivedT *>(baseobj);
+        assert(detail::parsec_ttg_caller == nullptr);
+        detail::parsec_ttg_caller = static_cast<detail::parsec_ttg_task_base_t*>(task);
+        if (obj->tracing()) {
+          if constexpr (!ttg::meta::is_void_v<keyT>)
+            ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": executing");
+          else
+            ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : executing");
+        }
+
+        if constexpr (!ttg::meta::is_void_v<keyT> && !ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
+          auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence<numins>{});
+          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(task->key, std::move(input), obj->output_terminals));
+        } else if constexpr (!ttg::meta::is_void_v<keyT> && ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
+          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(task->key, obj->output_terminals));
+        } else if constexpr (ttg::meta::is_void_v<keyT> && !ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
+          auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence<numins>{});
+          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(std::move(input), obj->output_terminals));
+        } else if constexpr (ttg::meta::is_void_v<keyT> && ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
+          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(obj->output_terminals));
+        } else {
+          ttg::abort();
+        }
+        detail::parsec_ttg_caller = nullptr;
+      }
+      else {  // resume the suspended coroutine
+        auto coro = static_cast<ttg::device_task>(ttg::device_task_handle_type::from_address(suspended_task_address));
+        assert(detail::parsec_ttg_caller == nullptr);
+        detail::parsec_ttg_caller = static_cast<detail::parsec_ttg_task_base_t*>(task);
+        // TODO: unify the outputs tls handling
+        auto old_output_tls_ptr = task->tt->outputs_tls_ptr_accessor();
+        task->tt->set_outputs_tls_ptr();
+        coro.resume();
+        if (coro.completed()) {
+          coro.destroy();
+          suspended_task_address = nullptr;
+        }
+        task->tt->set_outputs_tls_ptr(old_output_tls_ptr);
+        detail::parsec_ttg_caller = nullptr;
+#if 0
+#ifdef TTG_HAS_COROUTINE
+        auto ret = static_cast<ttg::resumable_task>(ttg::coroutine_handle<>::from_address(suspended_task_address));
+        assert(ret.ready());
+        ret.resume();
+        if (ret.completed()) {
+          ret.destroy();
+          suspended_task_address = nullptr;
+        }
+        else {  // not yet completed
+          // leave suspended_task_address as is
+        }
+        task->suspended_task_address = suspended_task_address;
+#else
+#endif // 0
+        ttg::abort();  // should not happen
+#endif
+      }
+      task->suspended_task_address = suspended_task_address;
+
+      if (suspended_task_address == nullptr) {
+        ttT *baseobj = task->tt;
+        derivedT *obj = static_cast<derivedT *>(baseobj);
+        if (obj->tracing()) {
+          if constexpr (!ttg::meta::is_void_v<keyT>)
+            ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": done executing");
+          else
+            ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : done executing");
+        }
+      }
+
+// XXX the below code is not needed, should be removed once the fib test has been changed
+#if 0
+#ifdef TTG_HAS_COROUTINE
+      if (suspended_task_address) {
+        // right now events are not properly implemented, we are only testing the workflow with dummy events
+        // so mark the events finished manually; parsec will rerun this task and it should complete the second time
+        auto events = static_cast<ttg::resumable_task>(ttg::coroutine_handle<>::from_address(suspended_task_address)).events();
+        for (auto &event_ptr : events) {
+          event_ptr->finish();
+        }
+        assert(ttg::coroutine_handle<>::from_address(suspended_task_address).promise().ready());
+
+        // TODO: shove {ptr to parsec_task, ptr to this function} to the list of tasks suspended by this thread (hence stored in TLS)
+        // thread will loop over its list (after running every task? periodically? need a dedicated queue of ready tasks?)
+        // and resume the suspended tasks whose events are ready (N.B. ptr to parsec_task is enough to get the list of pending events)
+        // event clearance will, in the device case, be handled by host callbacks run by the dedicated device runtime thread
+
+        // TODO PARSEC_HOOK_RETURN_AGAIN -> PARSEC_HOOK_RETURN_ASYNC when event tracking and task resumption (by this thread) is ready
+        return PARSEC_HOOK_RETURN_AGAIN;
      }
+      else
+#endif // TTG_HAS_COROUTINE
+#endif // 0
+      return PARSEC_HOOK_RETURN_DONE;
     }
 
     template <ttg::ExecutionSpace Space>
-    static void static_op_noarg(parsec_task_t *parsec_task) {
+    static parsec_hook_return_t static_op_noarg(parsec_task_t *parsec_task) {
       task_t *task = static_cast<task_t*>(parsec_task);
-      ttT *baseobj = (ttT *)task->object_ptr;
-      derivedT *obj = (derivedT *)task->object_ptr;
-      assert(parsec_ttg_caller == NULL);
-      parsec_ttg_caller = task;
-      if constexpr (!ttg::meta::is_void_v<keyT>) {
-        baseobj->template op<Space>(task->key, obj->output_terminals);
-      } else if constexpr (ttg::meta::is_void_v<keyT>) {
-        baseobj->template op<Space>(obj->output_terminals);
-      } else
-        abort();
-      parsec_ttg_caller = NULL;
+
+      void* suspended_task_address =
+#ifdef TTG_HAS_COROUTINE
+        task->suspended_task_address;  // non-null = need to resume the task
+#else
+        nullptr;
+#endif
+      if (suspended_task_address == nullptr) {  // task is a coroutine that has not started or an ordinary function
+        ttT *baseobj = (ttT *)task->object_ptr;
+        derivedT *obj = (derivedT *)task->object_ptr;
+        assert(detail::parsec_ttg_caller == NULL);
+        detail::parsec_ttg_caller = task;
+        if constexpr (!ttg::meta::is_void_v<keyT>) {
+          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(task->key, obj->output_terminals));
+        } else if constexpr (ttg::meta::is_void_v<keyT>) {
+          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(obj->output_terminals));
+        } else  // unreachable
+          ttg::abort();
+        detail::parsec_ttg_caller = NULL;
+      }
+      else {
+#ifdef TTG_HAS_COROUTINE
+        auto ret = static_cast<ttg::resumable_task>(ttg::coroutine_handle<>::from_address(suspended_task_address));
+        assert(ret.ready());
+        ret.resume();
+        if (ret.completed()) {
+          ret.destroy();
+          suspended_task_address = nullptr;
+        }
+        else {  // not yet completed
+          // leave suspended_task_address as is
+        }
+#else
+        ttg::abort();  // should not happen
+#endif
+      }
+      task->suspended_task_address = suspended_task_address;
+
+      if (suspended_task_address) {
+        ttg::abort();  // not yet implemented
+        // see comments in static_op()
+        return PARSEC_HOOK_RETURN_AGAIN;
+      }
+      else
+        return PARSEC_HOOK_RETURN_DONE;
+    }
 
    protected:
@@ -1372,9 +1627,14 @@
     }
 
     template <typename T>
-    uint64_t pack(T &obj, void *bytes, uint64_t pos) {
+    uint64_t pack(T &obj, void *bytes, uint64_t pos, detail::ttg_data_copy_t *copy = nullptr) {
       const ttg_data_descriptor *dObj = ttg::get_data_descriptor<std::decay_t<T>>();
       uint64_t payload_size = dObj->payload_size(&obj);
+      if (copy) {
+        /* reset any tracked data; we only care about the iovecs discovered during
+         * the actual pack below, not those gathered while computing the payload size */
+        copy->iovec_reset();
+      }
+
       if constexpr (!ttg::default_data_descriptor<std::decay_t<T>>::serialize_size_is_const) {
         const ttg_data_descriptor *dSiz = ttg::get_data_descriptor<uint64_t>();
         dSiz->pack_payload(&payload_size, sizeof(uint64_t), pos, bytes);
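With the new copy parameter, pack() does double duty: while the serializer walks the object, the associated copy can record memory regions (iovecs) that should travel via RMA rather than inline, and callers such as set_arg_impl below count them after packing. A schematic caller using only the calls visible in this patch (type names are placeholders, and pack() is a member of the TT, so a real caller would live inside it):

    // Sketch: serialize obj into a staging buffer while letting the associated
    // ttg_data_copy_t record the RMA-able regions (iovecs) it discovers.
    template <typename TT, typename Value>
    uint64_t pack_with_iovecs(TT &tt, Value &obj, unsigned char *bytes,
                              ttg_parsec::detail::ttg_data_copy_t *copy) {
      uint64_t pos = tt.pack(obj, bytes, 0, copy);          // fills copy's iovec list
      auto num_iovecs = std::distance(copy->iovec_begin(),  // how many RMA chunks
                                      copy->iovec_end());
      // ... register each iovec and append its wire record to the message ...
      (void)num_iovecs;
      copy->iovec_reset();                                  // done with the tracked list
      return pos;
    }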
@@ -1398,7 +1658,7 @@
             (obj->*member)(data, size);
           } else {
             // there is no good reason to have negative param ids
-            abort();
+            ttg::abort();
           }
           break;
         }
@@ -1424,7 +1684,7 @@
           break;
         }
         default:
-          abort();
+          ttg::abort();
       }
     }
@@ -1447,19 +1707,19 @@
       // TODO: do we need to copy static_stream_goal in dummy?
 
       /* set the received value as the dummy's only data */
-      dummy->parsec_task.data[0].data_in = copy;
+      dummy->copies[0] = copy;
 
       /* We received the task on this world, so it's using the same taskpool */
       dummy->parsec_task.taskpool = world.impl().taskpool();
 
       /* save the current task and set the dummy task */
-      auto parsec_ttg_caller_save = parsec_ttg_caller;
-      parsec_ttg_caller = dummy;
+      auto parsec_ttg_caller_save = detail::parsec_ttg_caller;
+      detail::parsec_ttg_caller = dummy;
 
       /* iterate over the keys and have them use the copy we made */
       parsec_task_t *task_ring = nullptr;
       for (auto &&key : keylist) {
-        set_arg_local_impl(key, *reinterpret_cast<std::decay_t<valueT> *>(copy->device_private), copy, &task_ring);
+        set_arg_local_impl(key, *reinterpret_cast<std::decay_t<valueT> *>(copy->get_ptr()), copy, &task_ring);
       }
 
       if (nullptr != task_ring) {
@@ -1468,7 +1728,7 @@
       }
 
       /* restore the previous task */
-      parsec_ttg_caller = parsec_ttg_caller_save;
+      detail::parsec_ttg_caller = parsec_ttg_caller_save;
 
       /* release the dummy task */
       complete_task_and_release(es, &dummy->parsec_task);
@@ -1491,7 +1751,9 @@
       msg_t *msg = static_cast<msg_t *>(data);
       if constexpr (!ttg::meta::is_void_v<keyT>) {
         /* unpack the keys */
-        uint64_t pos = 0;
+        /* TODO: can we avoid copying all the keys? */
+        uint64_t pos = msg->tt_id.key_offset;
+        uint64_t key_end_pos;
         std::vector<keyT> keylist;
         int num_keys = msg->tt_id.num_keys;
         keylist.reserve(num_keys);
         for (int k = 0; k < num_keys; ++k) {
           keyT key;
           pos = unpack(key, msg->bytes, pos);
           assert(keymap(key) == rank);
           keylist.push_back(std::move(key));
         }
+        key_end_pos = pos;
+        /* jump back to the beginning of the message to get the value */
+        pos = 0;
         // case 1
         if constexpr (!ttg::meta::is_void_v<valueT>) {
           using decvalueT = std::decay_t<valueT>;
-          if constexpr (!ttg::has_split_metadata<decvalueT>::value) {
-            detail::ttg_data_copy_t *copy = detail::create_new_datacopy(decvalueT{});
-            unpack(*static_cast<decvalueT *>(copy->device_private), msg->bytes, pos);
-
-            set_arg_from_msg_keylist(ttg::span(&keylist[0], num_keys), copy);
-          } else {
-            /* unpack the header and start the RMA transfers */
+          int32_t num_iovecs = msg->tt_id.num_iovecs;
+          detail::ttg_data_copy_t *copy;
+          if constexpr (ttg::has_split_metadata<decvalueT>::value) {
             ttg::SplitMetadataDescriptor<decvalueT> descr;
             using metadata_t = decltype(descr.get_metadata(std::declval<decvalueT>()));
-            size_t metadata_size = sizeof(metadata_t);
 
             /* unpack the metadata */
             metadata_t metadata;
-            std::memcpy(&metadata, msg->bytes + pos, metadata_size);
-            pos += metadata_size;
+            pos = unpack(metadata, msg->bytes, pos);
+
+            copy = detail::create_new_datacopy(descr.create_from_metadata(metadata));
+          } else if constexpr (!ttg::has_split_metadata<decvalueT>::value) {
+            copy = detail::create_new_datacopy(decvalueT{});
+            /* unpack the object, potentially discovering iovecs */
+            pos = unpack(*static_cast<decvalueT *>(copy->get_ptr()), msg->bytes, pos);
+            std::cout << "num_iovecs " << num_iovecs << " distance " << std::distance(copy->iovec_begin(), copy->iovec_end()) << std::endl;
+            assert(std::distance(copy->iovec_begin(), copy->iovec_end()) == num_iovecs);
+          }
 
-            /* unpack the remote rank */
-            int remote;
-            std::memcpy(&remote, msg->bytes + pos, sizeof(remote));
-            pos += sizeof(remote);
+          if (num_iovecs == 0) {
+            set_arg_from_msg_keylist(ttg::span(&keylist[0], num_keys), copy);
+          } else {
+            /* unpack the header and start the RMA transfers */
 
+            /* get the remote rank */
+            int remote = msg->tt_id.sender;
             assert(remote < world.size());
 
-            /* extract the number of chunks */
-            int32_t num_iovecs;
-            std::memcpy(&num_iovecs, msg->bytes + pos, sizeof(num_iovecs));
-            pos += sizeof(num_iovecs);
-
-            detail::ttg_data_copy_t *copy = detail::create_new_datacopy(descr.create_from_metadata(metadata));
             /* nothing else to do if the object is empty */
-            if (0 == num_iovecs) {
-              set_arg_from_msg_keylist(keylist, copy);
-            } else {
-              /* extract the callback tag */
-              parsec_ce_tag_t cbtag;
-              std::memcpy(&cbtag, msg->bytes + pos, sizeof(cbtag));
-              pos += sizeof(cbtag);
-
-              /* create the value from the metadata */
-              auto activation = new detail::rma_delayed_activate(
-                  std::move(keylist), copy, num_iovecs, [this](std::vector<keyT> &&keylist, detail::ttg_data_copy_t *copy) {
-                    set_arg_from_msg_keylist(keylist, copy);
-                    this->world.impl().decrement_inflight_msg();
-                  });
-              auto &val = *static_cast<decvalueT *>(copy->device_private);
-
-              using ActivationT = std::decay_t<decltype(*activation)>;
-
-              int nv = 0;
-              /* process payload iovecs */
-              auto iovecs = descr.get_data(val);
-              /* start the RMA transfers */
-              for (auto &&iov : iovecs) {
-                ++nv;
-                parsec_ce_mem_reg_handle_t rreg;
-                int32_t rreg_size_i;
-                std::memcpy(&rreg_size_i, msg->bytes + pos, sizeof(rreg_size_i));
-                pos += sizeof(rreg_size_i);
-                rreg = static_cast<parsec_ce_mem_reg_handle_t>(msg->bytes + pos);
-                pos += rreg_size_i;
-                // std::intptr_t *fn_ptr = reinterpret_cast<std::intptr_t *>(msg->bytes + pos);
-                // pos += sizeof(*fn_ptr);
-                std::intptr_t fn_ptr;
-                std::memcpy(&fn_ptr, msg->bytes + pos, sizeof(fn_ptr));
-                pos += sizeof(fn_ptr);
-
-                /* register the local memory */
-                parsec_ce_mem_reg_handle_t lreg;
-                size_t lreg_size;
-                parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t,
-                                       iov.num_bytes, &lreg, &lreg_size);
-                world.impl().increment_inflight_msg();
-                /* TODO: PaRSEC should treat the remote callback as a tag, not a function pointer! */
-                parsec_ce.get(&parsec_ce, lreg, 0, rreg, 0, iov.num_bytes, remote,
-                              &detail::get_complete_cb, activation,
-                              /*world.impl().parsec_ttg_rma_tag()*/
-                              cbtag, &fn_ptr, sizeof(std::intptr_t));
-              }
-
-              assert(num_iovecs == nv);
-              assert(size == (pos + sizeof(msg_header_t)));
+            /* extract the callback tag */
+            parsec_ce_tag_t cbtag;
+            std::memcpy(&cbtag, msg->bytes + pos, sizeof(cbtag));
+            pos += sizeof(cbtag);
+
+            /* create the value from the metadata */
+            auto activation = new detail::rma_delayed_activate(
+                std::move(keylist), copy, num_iovecs, [this](std::vector<keyT> &&keylist, detail::ttg_data_copy_t *copy) {
+                  set_arg_from_msg_keylist(keylist, copy);
+                  this->world.impl().decrement_inflight_msg();
+                });
+            auto &val = *static_cast<decvalueT *>(copy->get_ptr());
+
+            using ActivationT = std::decay_t<decltype(*activation)>;
+
+            int nv = 0;
+            /* start the RMA transfers */
+            auto handle_iovecs_fn =
+              [&](auto&& iovecs) {
+                for (auto &&iov : iovecs) {
+                  ++nv;
+                  parsec_ce_mem_reg_handle_t rreg;
+                  int32_t rreg_size_i;
+                  std::memcpy(&rreg_size_i, msg->bytes + pos, sizeof(rreg_size_i));
+                  pos += sizeof(rreg_size_i);
+                  rreg = static_cast<parsec_ce_mem_reg_handle_t>(msg->bytes + pos);
+                  pos += rreg_size_i;
+                  // std::intptr_t *fn_ptr = reinterpret_cast<std::intptr_t *>(msg->bytes + pos);
+                  // pos += sizeof(*fn_ptr);
+                  std::intptr_t fn_ptr;
+                  std::memcpy(&fn_ptr, msg->bytes + pos, sizeof(fn_ptr));
+                  pos += sizeof(fn_ptr);
+
+                  /* register the local memory */
+                  parsec_ce_mem_reg_handle_t lreg;
+                  size_t lreg_size;
+                  parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t,
+                                         iov.num_bytes, &lreg, &lreg_size);
+                  world.impl().increment_inflight_msg();
+                  /* TODO: PaRSEC should treat the remote callback as a tag, not a function pointer! */
+                  std::cout << "set_arg_from_msg: get rreg " << rreg << " remote " << remote << std::endl;
+                  parsec_ce.get(&parsec_ce, lreg, 0, rreg, 0, iov.num_bytes, remote,
+                                &detail::get_complete_cb, activation,
+                                /*world.impl().parsec_ttg_rma_tag()*/
+                                cbtag, &fn_ptr, sizeof(std::intptr_t));
+                }
+              };
+            if constexpr (ttg::has_split_metadata<decvalueT>::value) {
+              ttg::SplitMetadataDescriptor<decvalueT> descr;
+              handle_iovecs_fn(descr.get_data(val));
+            } else if constexpr (!ttg::has_split_metadata<decvalueT>::value) {
+              handle_iovecs_fn(copy->iovec_span());
+              copy->iovec_reset();
             }
+
+            assert(num_iovecs == nv);
+            assert(size == (key_end_pos + sizeof(msg_header_t)));
           }
         // case 2 and 3
         } else if constexpr (!ttg::meta::is_void_v<keyT> && std::is_void_v<valueT>) {
@@ -1604,8 +1873,8 @@
       // case 5 and 6
       } else if constexpr (ttg::meta::is_void_v<keyT> && std::is_void_v<valueT>) {
         set_arg(ttg::Void{});
-      } else {
-        abort();
+      } else {  // unreachable
+        ttg::abort();
       }
     }
@@ -1717,7 +1986,7 @@
           reinterpret_cast<detail::parsec_static_op_t>(&TT::static_op);
       if constexpr (derived_has_cuda_op())
         newtask->function_template_class_ptr[static_cast<int>(ttg::ExecutionSpace::CUDA)] =
-            reinterpret_cast<detail::parsec_static_op_t>(&TT::static_op);
+            reinterpret_cast<detail::parsec_static_op_t>(&TT::device_static_op);
 
       for (int i = 0; i < static_stream_goal.size(); ++i) {
         newtask->stream[i].goal = static_stream_goal[i];
@@ -1790,8 +2059,8 @@
 
       if( world_impl.dag_profiling() ) {
 #if defined(PARSEC_PROF_GRAPHER)
-        if(NULL != parsec_ttg_caller && !parsec_ttg_caller->dummy()) {
-          int orig_index = detail::find_index_of_copy_in_task(parsec_ttg_caller, &value);
+        if(NULL != detail::parsec_ttg_caller && !detail::parsec_ttg_caller->dummy()) {
+          int orig_index = detail::find_index_of_copy_in_task(detail::parsec_ttg_caller, &value);
           char orig_str[32];
           char dest_str[32];
           if(orig_index >= 0) {
@@ -1804,7 +2073,7 @@
                             .flow_index = 0, .flow_datatype_mask = ~0 };
           parsec_flow_t dest{ .name = dest_str, .sym_type = PARSEC_SYM_INOUT, .flow_flags = PARSEC_FLOW_ACCESS_RW,
                               .flow_index = 0, .flow_datatype_mask = ~0 };
-          parsec_prof_grapher_dep(&parsec_ttg_caller->parsec_task, &task->parsec_task, discover_task ? 1 : 0, &orig, &dest);
+          parsec_prof_grapher_dep(&detail::parsec_ttg_caller->parsec_task, &task->parsec_task, discover_task ? 1 : 0, &orig, &dest);
         }
 #endif
       }
@@ -1817,14 +2086,14 @@
       if constexpr (!ttg::meta::is_void_v<valueT>) {  // for data values
         // have a value already? if not, set, otherwise reduce
         detail::ttg_data_copy_t *copy = nullptr;
-        if (nullptr == (copy = static_cast<detail::ttg_data_copy_t *>(task->parsec_task.data[i].data_in))) {
+        if (nullptr == (copy = task->copies[i])) {
           using decay_valueT = std::decay_t<valueT>;
           /* For now, we always create a copy because we cannot rely on the task_release
           * mechanism (it would release the task, not the reduction value). */
           copy = detail::create_new_datacopy(std::forward<Value>(value));
-          task->parsec_task.data[i].data_in = copy;
+          task->copies[i] = copy;
         } else {
-          reducer(*reinterpret_cast<decay_valueT *>(copy->device_private), value);
+          reducer(*reinterpret_cast<decay_valueT *>(copy->get_ptr()), value);
         }
       } else {
         reducer();  // even if this was a control input, must execute the reducer for possible side effects
@@ -1839,14 +2108,14 @@
       } else {
         /* whether the task needs to be deferred or not */
         if constexpr (!valueT_is_Void) {
-          if (nullptr != task->parsec_task.data[i].data_in) {
+          if (nullptr != task->copies[i]) {
             ttg::print_error(get_name(), " : ", key, ": error argument is already set : ", i);
             throw std::logic_error("bad set arg");
           }
 
           detail::ttg_data_copy_t *copy = copy_in;
-          if (nullptr == copy_in && nullptr != parsec_ttg_caller) {
-            copy = detail::find_copy_in_task(parsec_ttg_caller, &value);
+          if (nullptr == copy_in && nullptr != detail::parsec_ttg_caller) {
+            copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
           }
 
           if (nullptr != copy) {
@@ -1855,11 +2124,12 @@
           } else {
             copy = detail::create_new_datacopy(std::forward<Value>(value));
           }
+
           /* if we registered as a writer and were the first to register with this copy
            * we need to defer the release of this task to give other tasks a chance to
            * make a copy of the original data */
-          release = (copy->push_task != &task->parsec_task);
-          task->parsec_task.data[i].data_in = copy;
+          release = (copy->get_next_task() != &task->parsec_task);
+          task->copies[i] = copy;
         }
       }
       task->remove_from_hash = remove_from_hash;
@@ -1947,7 +2217,7 @@
 
     // Used to set the i'th argument
     template <std::size_t i, typename Key, typename Value>
-    void set_arg_impl(const Key &key, Value &&value) {
+    void set_arg_impl(const Key &key, Value &&value, detail::ttg_data_copy_t *copy_in = nullptr) {
       int owner;
 
 #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
@@ -1962,9 +2232,9 @@
        owner = keymap();
       if (owner == world.rank()) {
         if constexpr (!ttg::meta::is_void_v<Key>)
-          set_arg_local(key, std::forward<Value>(value));
+          set_arg_local_impl(key, std::forward<Value>(value), copy_in);
        else
-          set_arg_local(std::forward<Value>(value));
+          set_arg_local_impl(ttg::Void{}, std::forward<Value>(value), copy_in);
 #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
         if(world.impl().profiling()) {
           parsec_profiling_ts_trace(world.impl().parsec_ttg_profile_backend_set_arg_end, 0, 0, NULL);
@@ -1978,44 +2248,24 @@
       using msg_t = detail::msg_t;
       auto &world_impl = world.impl();
       uint64_t pos = 0;
+      int num_iovecs = 0;
       std::unique_ptr<msg_t> msg = std::make_unique<msg_t>(get_instance_id(), world_impl.taskpool()->taskpool_id,
-                                                           msg_header_t::MSG_SET_ARG, i, 1);
+                                                           msg_header_t::MSG_SET_ARG, i, world_impl.rank(), 1);
       using decvalueT = std::decay_t<Value>;
-      /* pack the key */
-      msg->tt_id.num_keys = 0;
-      if constexpr (!ttg::meta::is_void_v<Key>) {
-        pos = pack(key, msg->bytes, pos);
-        msg->tt_id.num_keys = 1;
-      }
       if constexpr (!ttg::meta::is_void_v<decvalueT>) {
-        if constexpr (!ttg::has_split_metadata<decvalueT>::value) {
-          pos = pack(value, msg->bytes, pos);
-        } else {
-          detail::ttg_data_copy_t *copy;
-          copy = detail::find_copy_in_task(parsec_ttg_caller, &value);
+
+        detail::ttg_data_copy_t *copy = copy_in;
+        /* make sure we have a data copy to register with */
+        if (nullptr == copy) {
+          copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
           if (nullptr == copy) {
             // We need to create a copy for this data, as it does not exist yet.
copy = detail::create_new_datacopy(std::forward(value)); } - copy = detail::register_data_copy(copy, nullptr, true); - - ttg::SplitMetadataDescriptor descr; - auto metadata = descr.get_metadata(value); - size_t metadata_size = sizeof(metadata); - /* pack the metadata */ - std::memcpy(msg->bytes + pos, &metadata, metadata_size); - pos += metadata_size; - /* pack the local rank */ - int rank = world.rank(); - std::memcpy(msg->bytes + pos, &rank, sizeof(rank)); - pos += sizeof(rank); - - auto iovecs = descr.get_data(*static_cast(copy->device_private)); + } - int32_t num_iovs = std::distance(std::begin(iovecs), std::end(iovecs)); - std::memcpy(msg->bytes + pos, &num_iovs, sizeof(num_iovs)); - pos += sizeof(num_iovs); + auto handle_iovec_fn = [&](auto&& iovecs){ /* TODO: at the moment, the tag argument to parsec_ce.get() is treated as a * raw function pointer instead of a preregistered AM tag, so play that game. @@ -2029,6 +2279,7 @@ namespace ttg_parsec { * memory layout: [, ...] */ for (auto &&iov : iovecs) { + copy = detail::register_data_copy(copy, nullptr, true); parsec_ce_mem_reg_handle_t lreg; size_t lreg_size; /* TODO: only register once when we can broadcast the data! */ @@ -2041,8 +2292,9 @@ namespace ttg_parsec { int32_t lreg_size_i = lreg_size; std::memcpy(msg->bytes + pos, &lreg_size_i, sizeof(lreg_size_i)); pos += sizeof(lreg_size_i); - std::memcpy(msg->bytes + pos, lreg, lreg_size_i); - pos += lreg_size_i; + std::memcpy(msg->bytes + pos, lreg, lreg_size); + pos += lreg_size; + std::cout << "set_arg_impl lreg " << lreg << std::endl; /* TODO: can we avoid the extra indirection of going through std::function? */ std::function *fn = new std::function([=]() mutable { /* shared_ptr of value and registration captured by value so resetting @@ -2054,12 +2306,44 @@ namespace ttg_parsec { std::memcpy(msg->bytes + pos, &fn_ptr, sizeof(fn_ptr)); pos += sizeof(fn_ptr); } + }; + + if constexpr (ttg::has_split_metadata>::value) { + ttg::SplitMetadataDescriptor descr; + auto iovs = descr.get_data(*const_cast(&value)); + num_iovecs = std::distance(std::begin(iovs), std::end(iovs)); + /* pack the metadata */ + auto metadata = descr.get_metadata(value); + size_t metadata_size = sizeof(metadata); + pos = pack(metadata, msg->bytes, pos); + handle_iovec_fn(iovs); + } else if constexpr (!ttg::has_split_metadata>::value) { + /* serialize the object */ + std::cout << "PRE pack num_iovecs " << std::distance(copy->iovec_begin(), copy->iovec_end()) << std::endl; + pos = pack(value, msg->bytes, pos, copy); + num_iovecs = std::distance(copy->iovec_begin(), copy->iovec_end()); + std::cout << "POST pack num_iovecs " << num_iovecs << std::endl; + /* handle any iovecs contained in it */ + handle_iovec_fn(copy->iovec_span()); + copy->iovec_reset(); } + + msg->tt_id.num_iovecs = num_iovecs; } + + /* pack the key */ + msg->tt_id.num_keys = 0; + msg->tt_id.key_offset = pos; + if constexpr (!ttg::meta::is_void_v) { + size_t tmppos = pack(key, msg->bytes, pos); + pos = tmppos; + msg->tt_id.num_keys = 1; + } + parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); - // std::cout << "Sending AM with " << msg->op_id.num_keys << " keys " << std::endl; + std::cout << "set_arg_impl send_am owner " << owner << " sender " << msg->tt_id.sender << std::endl; parsec_ce.send_am(&parsec_ce, world_impl.parsec_ttg_tag(), owner, static_cast(msg.get()), sizeof(msg_header_t) + pos); #if defined(PARSEC_PROF_TRACE) && 
defined(PARSEC_TTG_PROFILE_BACKEND) @@ -2068,8 +2352,8 @@ namespace ttg_parsec { } #endif #if defined(PARSEC_PROF_GRAPHER) - if(NULL != parsec_ttg_caller && !parsec_ttg_caller->dummy()) { - int orig_index = detail::find_index_of_copy_in_task(parsec_ttg_caller, &value); + if(NULL != detail::parsec_ttg_caller && !detail::parsec_ttg_caller->dummy()) { + int orig_index = detail::find_index_of_copy_in_task(detail::parsec_ttg_caller, &value); char orig_str[32]; char dest_str[32]; if(orig_index >= 0) { @@ -2083,7 +2367,7 @@ namespace ttg_parsec { parsec_flow_t dest{ .name = dest_str, .sym_type = PARSEC_SYM_INOUT, .flow_flags = PARSEC_FLOW_ACCESS_RW, .flow_index = 0, .flow_datatype_mask = ~0 }; task_t *task = create_new_task(key); - parsec_prof_grapher_dep(&parsec_ttg_caller->parsec_task, &task->parsec_task, 0, &orig, &dest); + parsec_prof_grapher_dep(&detail::parsec_ttg_caller->parsec_task, &task->parsec_task, 0, &orig, &dest); delete task; } #endif @@ -2098,8 +2382,8 @@ namespace ttg_parsec { #endif parsec_task_t *task_ring = nullptr; detail::ttg_data_copy_t *copy = nullptr; - if (nullptr != parsec_ttg_caller) { - copy = detail::find_copy_in_task(parsec_ttg_caller, &value); + if (nullptr != detail::parsec_ttg_caller) { + copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value); } for (auto it = begin; it != end; ++it) { @@ -2117,84 +2401,13 @@ namespace ttg_parsec { } template - std::enable_if_t && !std::is_void_v> && - !ttg::has_split_metadata>::value, + std::enable_if_t && !std::is_void_v>, void> broadcast_arg(const ttg::span &keylist, const Value &value) { - auto world = ttg_default_execution_context(); - int rank = world.rank(); - - bool have_remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(), - [&](const Key &key) { return keymap(key) != rank; }); - - if (have_remote) { - std::vector keylist_sorted(keylist.begin(), keylist.end()); - - /* Assuming there are no local keys, will be updated while processing remote keys */ - auto local_begin = keylist_sorted.end(); - auto local_end = keylist_sorted.end(); - - /* sort the input key list by owner and check whether there are remote keys */ - std::sort(keylist_sorted.begin(), keylist_sorted.end(), [&](const Key &a, const Key &b) mutable { - int rank_a = keymap(a); - int rank_b = keymap(b); - return rank_a < rank_b; - }); - - using msg_t = detail::msg_t; - local_begin = keylist_sorted.end(); - auto &world_impl = world.impl(); - std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARG, i); - - parsec_taskpool_t *tp = world_impl.taskpool(); - - for (auto it = keylist_sorted.begin(); it < keylist_sorted.end(); /* increment inline */) { - auto owner = keymap(*it); - if (owner == rank) { - /* make sure we don't lose local keys */ - local_begin = it; - local_end = - std::find_if_not(++it, keylist_sorted.end(), [&](const Key &key) { return keymap(key) == rank; }); - it = local_end; - continue; - } - - /* pack all keys for this owner */ - int num_keys = 0; - uint64_t pos = 0; - do { - ++num_keys; - pos = pack(*it, msg->bytes, pos); - ++it; - } while (it < keylist_sorted.end() && keymap(*it) == owner); - msg->tt_id.num_keys = num_keys; - - /* TODO: use RMA to transfer the value */ - pos = pack(value, msg->bytes, pos); - - /* Send the message */ - tp->tdm.module->outgoing_message_start(tp, owner, NULL); - tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); - parsec_ce.send_am(&parsec_ce, world_impl.parsec_ttg_tag(), owner, static_cast(msg.get()), - 
sizeof(msg_header_t) + pos); - } - /* handle local keys */ - broadcast_arg_local(local_begin, local_end, value); - } else { - /* only local keys */ - broadcast_arg_local(keylist.begin(), keylist.end(), value); - } - } - - template - std::enable_if_t && !std::is_void_v> && - ttg::has_split_metadata>::value, - void> - splitmd_broadcast_arg(const ttg::span &keylist, const Value &value) { using valueT = std::tuple_element_t; auto world = ttg_default_execution_context(); int rank = world.rank(); + uint64_t pos = 0; bool have_remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(), [&](const Key &key) { return keymap(key) != rank; }); @@ -2213,41 +2426,69 @@ namespace ttg_parsec { auto local_begin = keylist_sorted.end(); auto local_end = keylist_sorted.end(); - ttg::SplitMetadataDescriptor descr; - auto iovs = descr.get_data(*const_cast(&value)); - int32_t num_iovs = std::distance(std::begin(iovs), std::end(iovs)); + int32_t num_iovs = 0; + + detail::ttg_data_copy_t *copy; + copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value); + assert(nullptr != copy); + std::vector>> memregs; - memregs.reserve(num_iovs); - - /* register all iovs so the registration can be reused */ - for (auto &&iov : iovs) { - parsec_ce_mem_reg_handle_t lreg; - size_t lreg_size; - parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, - iov.num_bytes, &lreg, &lreg_size); - /* TODO: use a static function for deregistration here? */ - memregs.push_back(std::make_pair(static_cast(lreg_size), - /* TODO: this assumes that parsec_ce_mem_reg_handle_t is void* */ - std::shared_ptr{lreg, [](void *ptr) { - parsec_ce_mem_reg_handle_t memreg = - (parsec_ce_mem_reg_handle_t)ptr; - parsec_ce.mem_unregister(&memreg); - }})); - } + auto register_iovs_fn = [&memregs](auto&& iovs){ + for (auto &&iov : iovs) { + parsec_ce_mem_reg_handle_t lreg; + size_t lreg_size; + parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, + iov.num_bytes, &lreg, &lreg_size); + /* TODO: use a static function for deregistration here? 
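+             * For now each handle is wrapped in a shared_ptr whose deleter calls
+             * mem_unregister, so the registration stays alive until the last
+             * completion callback drops its reference.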
*/ + memregs.push_back(std::make_pair(static_cast(lreg_size), + /* TODO: this assumes that parsec_ce_mem_reg_handle_t is void* */ + std::shared_ptr{lreg, [](void *ptr) { + parsec_ce_mem_reg_handle_t memreg = + (parsec_ce_mem_reg_handle_t)ptr; + std::cout << "broadcast_arg memunreg lreg " << memreg << std::endl; + parsec_ce.mem_unregister(&memreg); + }})); + std::cout << "broadcast_arg memreg lreg " << lreg << std::endl; + } + }; using msg_t = detail::msg_t; auto &world_impl = world.impl(); std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARG, i); - auto metadata = descr.get_metadata(value); - size_t metadata_size = sizeof(metadata); + msg_header_t::MSG_SET_ARG, i, world_impl.rank()); - detail::ttg_data_copy_t *copy; - copy = detail::find_copy_in_task(parsec_ttg_caller, &value); - assert(nullptr != copy); + if constexpr (ttg::has_split_metadata>::value) { + ttg::SplitMetadataDescriptor descr; + auto iovs = descr.get_data(*const_cast(&value)); + num_iovs = std::distance(std::begin(iovs), std::end(iovs)); + memregs.reserve(num_iovs); + register_iovs_fn(iovs); + /* pack the metadata */ + auto metadata = descr.get_metadata(value); + size_t metadata_size = sizeof(metadata); + pos = pack(metadata, msg->bytes, pos); + } else if constexpr (!ttg::has_split_metadata>::value) { + /* serialize the object once */ + pos = pack(value, msg->bytes, pos, copy); + num_iovs = std::distance(copy->iovec_begin(), copy->iovec_end()); + register_iovs_fn(copy->iovec_span()); + copy->iovec_reset(); + } + + /* TODO: at the moment, the tag argument to parsec_ce.get() is treated as a + * raw function pointer instead of a preregistered AM tag, so play that game. + * Once this is fixed in PaRSEC we need to use parsec_ttg_rma_tag instead! */ + parsec_ce_tag_t cbtag = reinterpret_cast(&detail::get_remote_complete_cb); + std::memcpy(msg->bytes + pos, &cbtag, sizeof(cbtag)); + pos += sizeof(cbtag); + + msg->tt_id.num_iovecs = num_iovs; + + std::size_t save_pos = pos; parsec_taskpool_t *tp = world_impl.taskpool(); for (auto it = keylist_sorted.begin(); it < keylist_sorted.end(); /* increment done inline */) { + auto owner = keymap(*it); if (owner == rank) { local_begin = it; @@ -2258,41 +2499,14 @@ namespace ttg_parsec { continue; } - /* count keys and set it afterwards */ - uint64_t pos = 0; - /* pack all keys for this owner */ - int num_keys = 0; - do { - ++num_keys; - pos = pack(*it, msg->bytes, pos); - ++it; - } while (it < keylist_sorted.end() && keymap(*it) == owner); - msg->tt_id.num_keys = num_keys; - - /* pack the metadata */ - std::memcpy(msg->bytes + pos, &metadata, metadata_size); - pos += metadata_size; - /* pack the local rank */ - int rank = world.rank(); - std::memcpy(msg->bytes + pos, &rank, sizeof(rank)); - pos += sizeof(rank); - /* pack the number of iovecs */ - std::memcpy(msg->bytes + pos, &num_iovs, sizeof(num_iovs)); - pos += sizeof(num_iovs); - - /* TODO: at the moment, the tag argument to parsec_ce.get() is treated as a - * raw function pointer instead of a preregistered AM tag, so play that game. - * Once this is fixed in PaRSEC we need to use parsec_ttg_rma_tag instead! */ - parsec_ce_tag_t cbtag = reinterpret_cast(&detail::get_remote_complete_cb); - std::memcpy(msg->bytes + pos, &cbtag, sizeof(cbtag)); - pos += sizeof(cbtag); - + /* rewind the buffer and start packing a new set of memregs and keys */ + pos = save_pos; /** * pack the registration handles * memory layout: [, ...] 
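         *   where each record is packed back to back without padding:
         *     int32_t       lreg_size;        // size of the opaque registration handle
         *     char          lreg[lreg_size];  // the parsec_ce_mem_reg_handle_t bytes
         *     std::intptr_t fn_ptr;           // std::function<void()>* invoked once the RMA transfer completed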
+ * NOTE: we need to pack these for every receiver to ensure correct ref-counting of the registration */ - int idx = 0; - for (auto &&iov : iovs) { + for (int idx = 0; idx < num_iovs; ++idx) { // auto [lreg_size, lreg_ptr] = memregs[idx]; int32_t lreg_size; std::shared_ptr lreg_ptr; @@ -2301,23 +2515,36 @@ namespace ttg_parsec { pos += sizeof(lreg_size); std::memcpy(msg->bytes + pos, lreg_ptr.get(), lreg_size); pos += lreg_size; - /* create a function that will be invoked upon RMA completion at the target */ - std::shared_ptr lreg_ptr_v = lreg_ptr; + std::cout << "broadcast_arg lreg_ptr " << lreg_ptr.get() << std::endl; /* mark another reader on the copy */ copy = detail::register_data_copy(copy, nullptr, true); + /* create a function that will be invoked upon RMA completion at the target */ std::function *fn = new std::function([=]() mutable { /* shared_ptr of value and registration captured by value so resetting - * them here will eventually release the memory/registration */ + * them here will eventually release the memory/registration */ detail::release_data_copy(copy); - lreg_ptr_v.reset(); + lreg_ptr.reset(); }); std::intptr_t fn_ptr{reinterpret_cast(fn)}; std::memcpy(msg->bytes + pos, &fn_ptr, sizeof(fn_ptr)); pos += sizeof(fn_ptr); - ++idx; } + + /* mark the beginning of the keys */ + msg->tt_id.key_offset = pos; + + /* pack all keys for this owner */ + int num_keys = 0; + do { + ++num_keys; + pos = pack(*it, msg->bytes, pos); + ++it; + } while (it < keylist_sorted.end() && keymap(*it) == owner); + msg->tt_id.num_keys = num_keys; + tp->tdm.module->outgoing_message_start(tp, owner, NULL); tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); + std::cout << "broadcast_arg send_am owner " << owner << std::endl; parsec_ce.send_am(&parsec_ce, world_impl.parsec_ttg_tag(), owner, static_cast(msg.get()), sizeof(msg_header_t) + pos); } @@ -2406,10 +2633,10 @@ namespace ttg_parsec { auto &world_impl = world.impl(); uint64_t pos = 0; std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARGSTREAM_SIZE, i, 1); + msg_header_t::MSG_SET_ARGSTREAM_SIZE, i, + world_impl.rank(), 1); /* pack the key */ pos = pack(key, msg->bytes, pos); - msg->tt_id.num_keys = 1; pos = pack(size, msg->bytes, pos); parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); @@ -2461,9 +2688,8 @@ namespace ttg_parsec { auto &world_impl = world.impl(); uint64_t pos = 0; std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARGSTREAM_SIZE, i, 1); - /* pack the key */ - msg->tt_id.num_keys = 0; + msg_header_t::MSG_SET_ARGSTREAM_SIZE, i, + world_impl.rank(), 0); pos = pack(size, msg->bytes, pos); parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); @@ -2513,10 +2739,10 @@ namespace ttg_parsec { auto &world_impl = world.impl(); uint64_t pos = 0; std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE, i, 1); + msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE, i, + world_impl.rank(), 1); /* pack the key */ pos = pack(key, msg->bytes, pos); - msg->tt_id.num_keys = 1; parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); @@ -2559,8 +2785,8 @@ namespace ttg_parsec { auto &world_impl = world.impl(); uint64_t pos = 
0;
        std::unique_ptr<msg_t> msg = std::make_unique<msg_t>(get_instance_id(), world_impl.taskpool()->taskpool_id,
-                                                             msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE, i, 1);
-        msg->tt_id.num_keys = 0;
+                                                             msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE, i,
+                                                             world_impl.rank(), 0);
        parsec_taskpool_t *tp = world_impl.taskpool();
        tp->tdm.module->outgoing_message_start(tp, owner, NULL);
        tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0);
@@ -2589,6 +2815,84 @@ namespace ttg_parsec {
      }
    }

+    void copy_mark_pushout(detail::ttg_data_copy_t *copy) {
+      assert(detail::parsec_ttg_caller->dev_ptr && detail::parsec_ttg_caller->dev_ptr->gpu_task);
+      parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task;
+      for (auto data : *copy) {
+        if (data->owner_device != 0 &&
+            data->device_copies[0]->version < data->device_copies[data->owner_device]->version) {
+          /* find the flow */
+          int flowidx = 0;
+          while (flowidx < MAX_PARAM_COUNT &&
+                 gpu_task->flow[flowidx]->flow_flags != PARSEC_FLOW_ACCESS_NONE) {
+            if (detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in->original == data) {
+              /* found the right data, set the corresponding flow as pushout */
+              break;
+            }
+            ++flowidx;
+          }
+          if (flowidx == MAX_PARAM_COUNT) {
+            throw std::runtime_error("Cannot add more than MAX_PARAM_COUNT flows to a task!");
+          }
+          if (gpu_task->flow[flowidx]->flow_flags == PARSEC_FLOW_ACCESS_NONE) {
+            /* no flow found, add one and mark it pushout */
+            detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in = data->device_copies[0];
+            ((parsec_flow_t *)gpu_task->flow[flowidx])->flow_flags |= PARSEC_FLOW_ACCESS_RW;
+            gpu_task->flow_nb_elts[flowidx] = data->nb_elts;
+          }
+          gpu_task->pushout |= 1 << flowidx;
+        }
+      }
+    }
+
+    template <std::size_t i, typename Key, typename Value>
+    std::enable_if_t<!ttg::meta::is_void_v<Key> && !std::is_void_v<std::decay_t<Value>>,
+                     void>
+    prepare_send(const ttg::span<const Key> &keylist, const Value &value) {
+      using valueT = std::tuple_element_t<i, input_values_full_tuple_type>;
+
+      /* get the copy */
+      detail::ttg_data_copy_t *copy;
+      copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
+      assert(nullptr != copy);
+
+      /* a device task needs a pushout only if it has non-local successors */
+      bool have_remote = false;
+      if constexpr (derived_has_cuda_op()) {
+        auto world = ttg_default_execution_context();
+        int rank = world.rank();
+        have_remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(),
+                                                    [&](const Key &key) { return keymap(key) != rank; });
+      }
+      if (!derived_has_cuda_op() || have_remote) {
+        /* host task, or device task with remote successors: make sure the
+         * up-to-date data is pushed back out to the host copy */
+        copy_mark_pushout(copy);
+      }
+    }
+
+    template <std::size_t i, typename Key, typename Value>
+    std::enable_if_t<ttg::meta::is_void_v<Key> && !std::is_void_v<std::decay_t<Value>>,
+                     void>
+    prepare_send(const Value &value) {
+      using valueT = std::tuple_element_t<i, input_values_full_tuple_type>;
+
+      /* get the copy */
+      detail::ttg_data_copy_t *copy;
+      copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
+      assert(nullptr != copy);
+
+      /* a device task needs a pushout only if it has a non-local successor */
+      auto world = ttg_default_execution_context();
+      int rank = world.rank();
+      bool have_remote = (keymap() != rank);
+      if (!derived_has_cuda_op() || have_remote) {
+        /* host task, or device task with a remote successor: make sure the
+         * up-to-date data is pushed back out to the host copy */
+        copy_mark_pushout(copy);
+      }
+    }
+
   private:
    // Copy/assign/move forbidden ... we could make it work using
    // PIMPL for this base class.  However, this instance of the base
@@ -2622,15 +2926,15 @@ namespace ttg_parsec {
          set_arg(key, value); };
        auto broadcast_callback = [this](const ttg::span<const keyT> &keylist, const valueT &value) {
-          if constexpr (ttg::has_split_metadata<std::decay_t<valueT>>::value) {
-            splitmd_broadcast_arg(keylist, value);
-          } else {
            broadcast_arg(keylist, value);
-          }
+        };
+        auto prepare_send_callback = [this](const ttg::span<const keyT> &keylist, const valueT &value) {
+          prepare_send<i>(keylist, value);
        };
        auto setsize_callback = [this](const keyT &key, std::size_t size) { set_argstream_size(key, size); };
        auto finalize_callback = [this](const keyT &key) { finalize_argstream(key); };
-        input.set_callback(send_callback, move_callback, broadcast_callback, setsize_callback, finalize_callback);
+        input.set_callback(send_callback, move_callback, broadcast_callback,
+                           setsize_callback, finalize_callback, prepare_send_callback);
      }
      //////////////////////////////////////////////////////////////////
      // case 2: nonvoid key, void value, mixed inputs
@@ -2653,7 +2957,10 @@ namespace ttg_parsec {
        auto send_callback = [this](const valueT &value) { set_arg(value); };
        auto setsize_callback = [this](std::size_t size) { set_argstream_size(size); };
        auto finalize_callback = [this]() { finalize_argstream(); };
-        input.set_callback(send_callback, move_callback, {}, setsize_callback, finalize_callback);
+        auto prepare_send_callback = [this](const valueT &value) {
+          prepare_send<i>(value);
+        };
+        input.set_callback(send_callback, move_callback, {}, setsize_callback, finalize_callback, prepare_send_callback);
      }
      //////////////////////////////////////////////////////////////////
      // case 5: void key, void value, mixed inputs
@@ -2669,7 +2976,7 @@ namespace ttg_parsec {
      // NOTE: subsumed in case 5 above, kept for historical reasons
      //////////////////////////////////////////////////////////////////
      else
-        abort();
+        ttg::abort();
    }

    template
@@ -2775,17 +3082,62 @@ namespace ttg_parsec {
      return buffer;
    }

+#if defined(PARSEC_PROF_TRACE)
+    static void *parsec_ttg_task_info(void *dst, const void *data, size_t size)
+    {
+      const parsec_task_t *t = reinterpret_cast<const parsec_task_t *>(data);
+
+      if constexpr (ttg::meta::is_void_v<keyT>) {
+        snprintf(reinterpret_cast<char *>(dst), size, "()");
+      } else {
+        // we use the locals array as a scratchpad to store the hash of the key and its actual address
+        // locals[0] and locals[1] hold the hash, while locals[2] and locals[3] hold the key pointer
+        keyT *key = *(keyT**)&(t->locals[2]);
+        std::stringstream ss;
+        ss << *key;
+
+        std::string keystr = ss.str();
+        snprintf(reinterpret_cast<char *>(dst), size, "%s", keystr.c_str());
+      }
+      return dst;
+    }
+#endif
+
    parsec_key_fn_t tasks_hash_fcts = {key_equal, key_print, key_hash};

-    static parsec_hook_return_t complete_task_and_release(parsec_execution_stream_t *es, parsec_task_t *t) {
+    static parsec_hook_return_t complete_task_and_release(parsec_execution_stream_t *es, parsec_task_t *parsec_task) {
      parsec_execution_stream_t *safe_es = parsec_ttg_es;
      parsec_ttg_es = es;
-      auto *task = (detail::parsec_ttg_task_base_t *)t;
+
+      //std::cout << "complete_task_and_release: task " << parsec_task << std::endl;
+
+      task_t *task = (task_t*)parsec_task;
+
+      /* if we still have a coroutine handle we invoke it one more time to get the sends/broadcasts */
+      if (task->suspended_task_address) {
+        // get the device task from the coroutine handle
+        auto dev_task = ttg::device_task_handle_type::from_address(task->suspended_task_address);
+
+        // get the promise which contains the views
+        auto dev_data = dev_task.promise();
+
+        /* for now make sure we're
waiting for the kernel to complete and the coro hasn't skipped this step */ + assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_KERNEL); + + /* the kernel has completed, resume the coroutine once again to get to the send stage */ + /* TODO: how can we get the execution space here? */ + static_op(parsec_task); + + /* the coroutine should have completed and we cannot access the promise anymore */ + task->suspended_task_address = nullptr; + } + + /* release our data copies */ for (int i = 0; i < task->data_count; i++) { - detail::ttg_data_copy_t *copy = static_cast(task->parsec_task.data[i].data_in); + detail::ttg_data_copy_t *copy = task->copies[i]; if (nullptr == copy) continue; detail::release_data_copy(copy); - task->parsec_task.data[i].data_in = nullptr; + task->copies[i] = nullptr; } parsec_ttg_es = safe_es; return PARSEC_HOOK_RETURN_DONE; @@ -2828,7 +3180,9 @@ namespace ttg_parsec { self.task_class_id = get_instance_id(); self.nb_parameters = 0; self.nb_locals = 0; - self.nb_flows = numflows; + //self.nb_flows = numflows; + self.nb_flows = MAX_PARAM_COUNT; // we're not using all flows but have to + // trick the device handler into looking at all of them if( world_impl.profiling() ) { // first two ints are used to store the hash of the key. @@ -2849,9 +3203,14 @@ namespace ttg_parsec { self.key_functions = &tasks_hash_fcts; self.task_snprintf = parsec_ttg_task_snprintf; +#if defined(PARSEC_PROF_TRACE) + self.profile_info = &parsec_ttg_task_info; +#endif + world_impl.taskpool()->nb_task_classes = std::max(world_impl.taskpool()->nb_task_classes, static_castnb_task_classes)>(self.task_class_id+1)); // function_id_to_instance[self.task_class_id] = this; - + //self.incarnations = incarnations_array.data(); +//#if 0 if constexpr (derived_has_cuda_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CUDA; @@ -2872,11 +3231,12 @@ namespace ttg_parsec { ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; ((__parsec_chore_t *)self.incarnations)[1].hook = NULL; } +//#endif // 0 self.release_task = &parsec_release_task_to_mempool_update_nbtasks; self.complete_execution = complete_task_and_release; - for (i = 0; i < numins; i++) { + for (i = 0; i < MAX_PARAM_COUNT; i++) { parsec_flow_t *flow = new parsec_flow_t; flow->name = strdup((std::string("flow in") + std::to_string(i)).c_str()); flow->sym_type = PARSEC_SYM_INOUT; @@ -2885,13 +3245,13 @@ namespace ttg_parsec { flow->dep_in[0] = NULL; flow->dep_out[0] = NULL; flow->flow_index = i; - flow->flow_datatype_mask = (1 << i); + flow->flow_datatype_mask = ~0; *((parsec_flow_t **)&(self.in[i])) = flow; } - *((parsec_flow_t **)&(self.in[i])) = NULL; - initialize_flows(self.in); + //*((parsec_flow_t **)&(self.in[i])) = NULL; + //initialize_flows(self.in); - for (i = 0; i < numouts; i++) { + for (i = 0; i < MAX_PARAM_COUNT; i++) { parsec_flow_t *flow = new parsec_flow_t; flow->name = strdup((std::string("flow out") + std::to_string(i)).c_str()); flow->sym_type = PARSEC_SYM_INOUT; @@ -2902,7 +3262,7 @@ namespace ttg_parsec { flow->flow_datatype_mask = (1 << i); *((parsec_flow_t **)&(self.out[i])) = flow; } - *((parsec_flow_t **)&(self.out[i])) = NULL; + //*((parsec_flow_t **)&(self.out[i])) = NULL; self.flags = 0; self.dependencies_goal = numins; /* (~(uint32_t)0) >> (32 - numins); */ @@ -3085,6 +3445,49 @@ namespace ttg_parsec { TTBase::invoke(); } + private: + template + void invoke_arglist(std::index_sequence, const Key& key, Arg&& arg, Args&&... 
args) {
+      using arg_type = std::decay_t<Arg>;
+      if constexpr (ttg::detail::is_ptr_v<arg_type>) {
+        /* add a reference to the object */
+        auto copy = ttg_parsec::detail::get_copy(arg);
+        copy->add_ref();
+        /* reset readers so that the value can flow without copying */
+        copy->reset_readers();
+        auto& val = *arg;
+        set_arg_impl<I>(key, val, copy);
+        ttg_parsec::detail::release_data_copy(copy);
+        if constexpr (std::is_rvalue_reference_v<Arg&&>) {
+          /* if the ptr was moved in we reset it */
+          arg.reset();
+        }
+      } else if constexpr (!ttg::detail::is_ptr_v<arg_type>) {
+        set_arg<I>(key, std::forward<Arg>(arg));
+      }
+      if constexpr (sizeof...(Is) > 0) {
+        /* recurse to the next argument */
+        invoke_arglist(std::index_sequence<Is...>{}, key, std::forward<Args>(args)...);
+      }
+    }
+
+   public:
+    // Manual injection of a task with all input arguments specified as variadic arguments
+    template <typename Key = keyT, typename Arg, typename... Args>
+    std::enable_if_t<!ttg::meta::is_void_v<Key> && !ttg::meta::is_empty_tuple_v<input_values_tuple_type>, void> invoke(
+        const Key &key, Arg&& arg, Args&&... args) {
+      static_assert(sizeof...(Args)+1 == std::tuple_size_v<actual_input_tuple_type>,
+                    "Number of arguments to invoke must match the number of task inputs.");
+      TTG_OP_ASSERT_EXECUTABLE();
+      /* trigger non-void inputs */
+      invoke_arglist(ttg::meta::nonvoid_index_seq{}, key,
+                     std::forward<Arg>(arg), std::forward<Args>(args)...);
+      //set_args(ttg::meta::nonvoid_index_seq{}, key, args);
+      /* trigger void inputs */
+      using void_index_seq = ttg::meta::void_index_seq;
+      set_args(void_index_seq{}, key, ttg::detail::make_void_tuple());
+    }
+
    void set_defer_writer(bool value) {
      m_defer_writer = value;
    }
@@ -3164,6 +3567,71 @@ namespace ttg_parsec {

#include "ttg/make_tt.h"

+  namespace device {
+
+    class DeviceAllocator {
+    private:
+      int ttg_did, parsec_did;
+      struct ::zone_malloc_s *zone;
+      ::ttg::ExecutionSpace exec_space;
+    public:
+      DeviceAllocator(int did);
+      void *allocate(std::size_t size);
+      void free(void *ptr);
+      ::ttg::ExecutionSpace executionSpace();
+    };
+
+    DeviceAllocator::DeviceAllocator(int did) : ttg_did(-1), parsec_did(-1), zone(nullptr), exec_space(::ttg::ExecutionSpace::Invalid) {
+      for(int i = 0; i < parsec_nb_devices; i++) {
+        parsec_device_module_t *m = parsec_mca_device_get(i);
+        if(m->type == PARSEC_DEV_CPU || m->type == PARSEC_DEV_CUDA) {
+          if(did == 0) {
+            parsec_did = i;
+            ttg_did = did;
+            if(m->type == PARSEC_DEV_CUDA) {
+              parsec_device_gpu_module_t *gm = reinterpret_cast<parsec_device_gpu_module_t *>(m);
+              zone = gm->memory;
+              exec_space = ::ttg::ExecutionSpace::CUDA;
+            } else {
+              exec_space = ::ttg::ExecutionSpace::Host;
+            }
+            return;
+          }
+          did--;
+        }
+      }
+      throw std::out_of_range("Device identifier is out of range");
+    }
+
+    void *DeviceAllocator::allocate(std::size_t size) {
+      if(nullptr == zone) return malloc(size);
+      return zone_malloc(zone, size);
+    }
+
+    void DeviceAllocator::free(void *ptr) {
+      if(nullptr == zone) {
+        /* no zone: the memory came from plain malloc(); call the global
+         * ::free(), the unqualified name would recurse into this member */
+        ::free(ptr);
+        return;
+      }
+      zone_free(zone, ptr);
+    }
+
+    ::ttg::ExecutionSpace DeviceAllocator::executionSpace() {
+      return exec_space;
+    }
+
+    std::size_t nb_devices() {
+      std::size_t nb = 0;
+      for(int i = 0; i < parsec_nb_devices; i++) {
+        parsec_device_module_t *m = parsec_mca_device_get(i);
+        if(m->type == PARSEC_DEV_CPU || m->type == PARSEC_DEV_CUDA) {
+          nb++;
+        }
+      }
+      return nb;
+    }
+  } // namespace ttg_parsec::device
+
} // namespace ttg_parsec

/**
@@ -3177,30 +3645,49 @@ struct ttg::detail::value_copy_handler {
  ttg_parsec::detail::ttg_data_copy_t *copy_to_remove = nullptr;

 public:
+  value_copy_handler() = default;
+  value_copy_handler(const value_copy_handler& h) = delete;
+  value_copy_handler(value_copy_handler&& h)
+  : copy_to_remove(h.copy_to_remove)
+  {
+    h.copy_to_remove =
nullptr; + } + + value_copy_handler& operator=(const value_copy_handler& h) = delete; + value_copy_handler& operator=(value_copy_handler&& h) + { + std::swap(copy_to_remove, h.copy_to_remove); + return *this; + } + ~value_copy_handler() { if (nullptr != copy_to_remove) { - ttg_parsec::detail::remove_data_copy(copy_to_remove, parsec_ttg_caller); + ttg_parsec::detail::remove_data_copy(copy_to_remove, ttg_parsec::detail::parsec_ttg_caller); ttg_parsec::detail::release_data_copy(copy_to_remove); } } template inline Value &&operator()(Value &&value) { - if (nullptr == parsec_ttg_caller) { + static_assert(std::is_rvalue_reference_v || + std::is_copy_constructible_v>, + "Data sent without being moved must be copy-constructible!"); + if (nullptr == ttg_parsec::detail::parsec_ttg_caller) { ttg::print("ERROR: ttg_send or ttg_broadcast called outside of a task!\n"); } + using value_type = std::remove_reference_t; ttg_parsec::detail::ttg_data_copy_t *copy; - copy = ttg_parsec::detail::find_copy_in_task(parsec_ttg_caller, &value); - Value *value_ptr = &value; + copy = ttg_parsec::detail::find_copy_in_task(ttg_parsec::detail::parsec_ttg_caller, &value); + value_type *value_ptr = &value; if (nullptr == copy) { /** * the value is not known, create a copy that we can track * depending on Value, this uses either the copy or move constructor */ copy = ttg_parsec::detail::create_new_datacopy(std::forward(value)); - bool inserted = ttg_parsec::detail::add_copy_to_task(copy, parsec_ttg_caller); + bool inserted = ttg_parsec::detail::add_copy_to_task(copy, ttg_parsec::detail::parsec_ttg_caller); assert(inserted); - value_ptr = reinterpret_cast(copy->device_private); + value_ptr = reinterpret_cast(copy->get_ptr()); copy_to_remove = copy; } else { /* this copy won't be modified anymore so mark it as read-only */ @@ -3211,11 +3698,13 @@ struct ttg::detail::value_copy_handler { template inline const Value &operator()(const Value &value) { - if (nullptr == parsec_ttg_caller) { + static_assert(std::is_copy_constructible_v>, + "Data sent without being moved must be copy-constructible!"); + if (nullptr == ttg_parsec::detail::parsec_ttg_caller) { ttg::print("ERROR: ttg_send or ttg_broadcast called outside of a task!\n"); } ttg_parsec::detail::ttg_data_copy_t *copy; - copy = ttg_parsec::detail::find_copy_in_task(parsec_ttg_caller, &value); + copy = ttg_parsec::detail::find_copy_in_task(ttg_parsec::detail::parsec_ttg_caller, &value); const Value *value_ptr = &value; if (nullptr == copy) { /** @@ -3223,30 +3712,14 @@ struct ttg::detail::value_copy_handler { * depending on Value, this uses either the copy or move constructor */ copy = ttg_parsec::detail::create_new_datacopy(value); - bool inserted = ttg_parsec::detail::add_copy_to_task(copy, parsec_ttg_caller); + bool inserted = ttg_parsec::detail::add_copy_to_task(copy, ttg_parsec::detail::parsec_ttg_caller); assert(inserted); - value_ptr = reinterpret_cast(copy->device_private); + value_ptr = reinterpret_cast(copy->get_ptr()); copy_to_remove = copy; } return *value_ptr; } - /* we have to make a copy of non-const data as the user may modify it after - * send/broadcast */ - template >> - inline Value &operator()(Value &value) { - if (nullptr == parsec_ttg_caller) { - ttg::print("ERROR: ttg_send or ttg_broadcast called outside of a task!\n"); - } - /* the value is not known, create a copy that we can track */ - ttg_parsec::detail::ttg_data_copy_t *copy; - copy = ttg_parsec::detail::create_new_datacopy(value); - bool inserted = ttg_parsec::detail::add_copy_to_task(copy, 
parsec_ttg_caller); - assert(inserted); - Value *value_ptr = reinterpret_cast(copy->device_private); - copy_to_remove = copy; - return *value_ptr; - } }; #endif // PARSEC_TTG_H_INCLUDED diff --git a/ttg/ttg/parsec/ttg_data_copy.h b/ttg/ttg/parsec/ttg_data_copy.h index 461984e3d2..e97184b287 100644 --- a/ttg/ttg/parsec/ttg_data_copy.h +++ b/ttg/ttg/parsec/ttg_data_copy.h @@ -3,36 +3,108 @@ #include #include +#include +#include +#include +#include #include +#include "ttg/parsec/thread_local.h" +#include "ttg/util/span.h" + namespace ttg_parsec { namespace detail { - /* Extension of PaRSEC's data copy. Note that we use the readers field - * to facilitate the ref-counting of the data copy. - * TODO: create abstractions for all fields in parsec_data_copy_t that we access. - */ - struct ttg_data_copy_t : public parsec_data_copy_t { -#if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) - int64_t size; - int64_t uid; -#endif + template + struct ttg_data_copy_container_setter { + ttg_data_copy_container_setter(T* ptr) { + /* set the container ptr here, will be reset in the the ttg_data_value_copy_t ctor */ + ttg_data_copy_container() = ptr; + } + }; + + /* Non-owning copy-tracking wrapper, accounting for N readers or 1 writer. + * Also counts external references, which are not treated as + * readers or writers but merely prevent the object from being + * destroyed once no readers/writers exist. + */ + struct ttg_data_copy_t : private ttg_data_copy_container_setter { /* special value assigned to parsec_data_copy_t::readers to mark the copy as * mutable, i.e., a task will modify it */ static constexpr int mutable_tag = std::numeric_limits::min(); + ttg_data_copy_t() + : ttg_data_copy_container_setter(this) + { } + + ttg_data_copy_t(const ttg_data_copy_t& c) + : ttg_data_copy_container_setter(this) + { + /* we allow copying but do not copy any data over from the original + * device copies will have to be allocated again + * and it's a new object to reference */ + } + + ttg_data_copy_t(ttg_data_copy_t&& c) + : ttg_data_copy_container_setter(this) + , m_ptr(c.m_ptr) + , m_next_task(c.m_next_task) + , m_readers(c.m_readers) + , m_refs(c.m_refs.load(std::memory_order_relaxed)) + , m_dev_data(std::move(c.m_dev_data)) + , m_single_dev_data(c.m_single_dev_data) + , m_num_dev_data(c.m_num_dev_data) + { + c.m_num_dev_data = 0; + c.m_readers = 0; + c.m_single_dev_data = nullptr; + } + + ttg_data_copy_t& operator=(ttg_data_copy_t&& c) + { + m_ptr = c.m_ptr; + c.m_ptr = nullptr; + m_next_task = c.m_next_task; + c.m_next_task = nullptr; + m_readers = c.m_readers; + c.m_readers = 0; + m_refs.store(c.m_refs.load(std::memory_order_relaxed), std::memory_order_relaxed); + c.m_refs.store(0, std::memory_order_relaxed); + m_dev_data = std::move(c.m_dev_data); + m_single_dev_data = c.m_single_dev_data; + c.m_single_dev_data = nullptr; + m_num_dev_data = c.m_num_dev_data; + c.m_num_dev_data = 0; + /* set the container ptr here, will be reset in the the ttg_data_value_copy_t ctor */ + ttg_data_copy_container() = this; + return *this; + } + + ttg_data_copy_t& operator=(const ttg_data_copy_t& c) { + /* we allow copying but do not copy any data over from the original + * device copies will have to be allocated again + * and it's a new object to reference */ + + /* set the container ptr here, will be reset in the the ttg_data_value_copy_t ctor */ + ttg_data_copy_container() = this; + return *this; + } + + /* mark destructor as virtual */ + virtual ~ttg_data_copy_t() = default; + /* Returns true if the copy is 
mutable */
    bool is_mutable() const {
-      return this->readers == mutable_tag;
+      return m_readers == mutable_tag;
    }

    /* Mark the copy as mutable */
    void mark_mutable() {
-      this->readers = mutable_tag;
+      m_readers = mutable_tag;
    }

    /* Increment the reader counter and return previous value
@@ -41,9 +113,11 @@
    template <bool Atomic = true>
    int increment_readers() {
      if constexpr(Atomic) {
-        return parsec_atomic_fetch_inc_int32(&this->readers);
+        //return parsec_atomic_fetch_inc_int32(&m_readers);
+        std::atomic_ref<int32_t> a{m_readers};
+        return a.fetch_add(1, std::memory_order_relaxed);
      } else {
-        return this->readers++;
+        return m_readers++;
      }
    }

@@ -51,7 +125,7 @@
     * Reset the number of readers to read-only with a single reader.
     */
    void reset_readers() {
-      this->readers = 1;
+      m_readers = 1;
    }

    /* Decrement the reader counter and return previous value.
@@ -60,28 +134,150 @@
    template <bool Atomic = true>
    int decrement_readers() {
      if constexpr(Atomic) {
-        return parsec_atomic_fetch_dec_int32(&this->readers);
+        //return parsec_atomic_fetch_dec_int32(&m_readers);
+        std::atomic_ref<int32_t> a{m_readers};
+        return a.fetch_sub(1, std::memory_order_relaxed);
      } else {
-        return this->readers--;
+        return m_readers--;
      }
    }

    /* Returns the number of readers if the copy is immutable, or \c mutable_tag
     * if the copy is mutable */
    int num_readers() const {
-      return this->readers;
+      return m_readers;
    }

-    ttg_data_copy_t()
-    {
-      /* TODO: do we need this construction? */
-      PARSEC_OBJ_CONSTRUCT(this, parsec_data_copy_t);
-      this->readers = 1;
-      this->push_task = nullptr;
+    void *get_ptr() const {
+      return m_ptr;
    }

-    /* mark destructor as virtual */
-    virtual ~ttg_data_copy_t() = default;
+    parsec_task_t* get_next_task() const {
+      return m_next_task;
+    }
+
+    void set_next_task(parsec_task_t* task) {
+      m_next_task = task;
+    }
+
+    int32_t add_ref() {
+      return m_refs.fetch_add(1, std::memory_order_relaxed);
+    }
+
+    int32_t drop_ref() {
+      return m_refs.fetch_sub(1, std::memory_order_relaxed);
+    }
+
+    bool has_ref() {
+      return (m_refs.load(std::memory_order_relaxed) != 0);
+    }
+
+    int32_t num_ref() const {
+      return m_refs.load(std::memory_order_relaxed);
+    }
+
+    /* manage device copies owned by this object
+     * we only touch the vector if we have more than one copy to track
+     * and otherwise use the single-element member.
+     */
+    using iterator = parsec_data_t**;
+
+    void add_device_data(parsec_data_t* data) {
+      // TODO: properly release again!
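+      /* retain the parsec_data_t so PaRSEC cannot destroy it while this copy
+       * still tracks it; the matching PARSEC_OBJ_RELEASE is what the TODO
+       * above refers to */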
+      PARSEC_OBJ_RETAIN(data);
+      switch (m_num_dev_data) {
+      case 0:
+        m_single_dev_data = data;
+        break;
+      case 1:
+        /* move single copy into vector and add new copy below */
+        m_dev_data.push_back(m_single_dev_data);
+        /* fall-through */
+      default:
+        /* store in multi-copy vector */
+        m_dev_data.push_back(data);
+        break;
+      }
+      m_num_dev_data++;
+    }
+
+    void remove_device_data(parsec_data_t* data) {
+      if (m_num_dev_data == 1) {
+        m_single_dev_data = nullptr;
+        --m_num_dev_data;
+      } else if (m_num_dev_data > 1) {
+        auto it = std::find(m_dev_data.begin(), m_dev_data.end(), data);
+        if (it != m_dev_data.end()) {
+          m_dev_data.erase(it);
+          --m_num_dev_data;
+        }
+        /* fall back to the single-element member once only one copy remains,
+         * since begin()/end() stop looking at the vector in that case */
+        if (m_num_dev_data == 1) {
+          m_single_dev_data = m_dev_data.front();
+          m_dev_data.clear();
+        }
+      }
+    }
+
+    int num_dev_data() const {
+      return m_num_dev_data;
+    }
+
+    iterator begin() {
+      switch(m_num_dev_data) {
+      // no device copies
+      case 0: return end();
+      case 1: return &m_single_dev_data;
+      default: return m_dev_data.data();
+      }
+    }
+
+    iterator end() {
+      switch(m_num_dev_data) {
+      case 0:
+      case 1:
+        return &(m_single_dev_data) + 1;
+      default:
+        return m_dev_data.data() + m_dev_data.size();
+      }
+    }
+
+    using iovec_iterator = typename std::vector<ttg::iovec>::iterator;
+
+    iovec_iterator iovec_begin() {
+      return m_iovecs.begin();
+    }
+
+    iovec_iterator iovec_end() {
+      return m_iovecs.end();
+    }
+
+    void iovec_reset() {
+      m_iovecs.clear();
+    }
+
+    void iovec_add(const ttg::iovec& iov) {
+      m_iovecs.push_back(iov);
+    }
+
+    ttg::span<ttg::iovec> iovec_span() {
+      return ttg::span(m_iovecs.data(), m_iovecs.size());
+    }
+
+    std::size_t iovec_count() const {
+      return m_iovecs.size();
+    }
+
+#if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
+    int64_t size;
+    int64_t uid;
+#endif
+  protected:
+    void *m_ptr;
+    parsec_task_t *m_next_task = nullptr;
+    int32_t m_readers = 1;
+    std::atomic<int32_t> m_refs = 1;            //< number of entities referencing this copy (TTGs, external)
+
+    std::vector<ttg::iovec> m_iovecs;
+
+    std::vector<parsec_data_t*> m_dev_data;     //< used if there are multiple device copies
+                                                //  that belong to this object
+    parsec_data_t *m_single_dev_data = nullptr; //< used if there is a single device copy
+    int m_num_dev_data = 0;                     //< number of device copies
  };

@@ -92,14 +288,57 @@
   */
  template <typename ValueT>
  struct ttg_data_value_copy_t final : public ttg_data_copy_t {
-    using value_type = std::decay_t<ValueT>;
+    using value_type = ValueT;
    value_type m_value;

    template <typename T>
    ttg_data_value_copy_t(T&& value)
-    : ttg_data_copy_t(), m_value(std::forward<T>(value))
+    : ttg_data_copy_t()
+    , m_value(std::forward<T>(value))
+    {
+      this->m_ptr = const_cast<std::decay_t<value_type> *>(&m_value);
+      /* reset the container tracker */
+      ttg_data_copy_container() = nullptr;
+    }
+
+    ttg_data_value_copy_t(ttg_data_value_copy_t&& c)
+      noexcept(std::is_nothrow_move_constructible_v<value_type>)
+    : ttg_data_copy_t(std::move(c))
+    , m_value(std::move(c.m_value))
    {
-      this->device_private = const_cast<std::decay_t<value_type> *>(&m_value);
+      /* reset the container tracker */
+      ttg_data_copy_container() = nullptr;
+    }
+
+    ttg_data_value_copy_t(const ttg_data_value_copy_t& c)
+      noexcept(std::is_nothrow_copy_constructible_v<value_type>)
+    : ttg_data_copy_t(c)
+    , m_value(c.m_value)
+    {
+      /* reset the container tracker */
+      ttg_data_copy_container() = nullptr;
+    }
+
+    ttg_data_value_copy_t& operator=(ttg_data_value_copy_t&& c)
+      noexcept(std::is_nothrow_move_assignable_v<value_type>)
+    {
+      ttg_data_copy_t::operator=(std::move(c));
+      m_value = std::move(c.m_value);
+      /* reset the container tracker */
+      ttg_data_copy_container() = nullptr;
+      return *this;
+    }
+
+    ttg_data_value_copy_t& operator=(const ttg_data_value_copy_t& c)
+      noexcept(std::is_nothrow_copy_assignable_v<value_type>)
+    {
+      ttg_data_copy_t::operator=(c);
+      m_value = c.m_value;
+      /* reset the container tracker */
+      ttg_data_copy_container() = nullptr;
+      return *this;
+    }
+
+    value_type& operator*() {
+      return m_value;
    }

    /* will destruct the value */
@@ -110,4 +349,4 @@

} // namespace ttg_parsec

-#endif // TTG_DATA_COPY_H
+#endif // TTG_DATA_COPY_H
\ No newline at end of file
diff --git a/ttg/ttg/ptr.h b/ttg/ttg/ptr.h
new file mode 100644
index 0000000000..3cc699f606
--- /dev/null
+++ b/ttg/ttg/ptr.h
@@ -0,0 +1,118 @@
+#ifndef TTG_PTR_H
+#define TTG_PTR_H
+
+#include "ttg/impl_selector.h"
+
+namespace ttg {
+
+template <typename T>
+using Ptr = TTG_IMPL_NS::ptr<T>;
+
+template <typename T, typename... Args>
+Ptr<T> make_ptr(Args&&... args) {
+  return TTG_IMPL_NS::make_ptr(std::forward<Args>(args)...);
+}
+
+template <typename T>
+auto get_ptr(T&& obj) {
+  return TTG_IMPL_NS::get_ptr(std::forward<T>(obj));
+}
+
+namespace detail {
+  template <typename T>
+  struct is_ptr : std::false_type
+  { };
+
+  template <typename T>
+  struct is_ptr<Ptr<T>> : std::true_type
+  { };
+
+  template <typename T>
+  constexpr bool is_ptr_v = is_ptr<T>::value;
+
+} // namespace detail
+
+#if 0
+namespace detail {
+
+  /* awaiter for ttg::get_ptr with multiple arguments
+   * operator co_await will return the tuple of ttg::Ptr
+   */
+  template <typename... Ts>
+  struct get_ptr_tpl_t {
+  private:
+    std::tuple<ttg::Ptr<Ts>...> m_ptr_tuple;
+    bool m_is_ready = false;
+  public:
+    get_ptr_tpl_t(bool is_ready, std::tuple<ttg::Ptr<Ts>...>&& ptrs)
+    : m_ptr_tuple(std::forward<std::tuple<ttg::Ptr<Ts>...>>(ptrs))
+    , m_is_ready(is_ready)
+    { }
+
+    bool await_ready() const noexcept {
+      return m_is_ready;
+    }
+
+    constexpr void await_suspend( std::coroutine_handle<> ) const noexcept {
+      /* TODO: anything to be done here? */
+    }
+
+    auto await_resume() const noexcept {
+      return std::move(m_ptr_tuple);
+    }
+  };
+
+
+  /* awaiter for ttg::get_ptr for a single argument */
+  template <typename T>
+  struct get_ptr_t {
+  private:
+    ttg::Ptr<T> m_ptr;
+    bool m_is_ready = false;
+  public:
+    get_ptr_t(bool is_ready, ttg::Ptr<T>&& ptr)
+    : m_ptr(std::forward<ttg::Ptr<T>>(ptr))
+    , m_is_ready(is_ready)
+    { }
+
+    bool await_ready() const noexcept {
+      return m_is_ready;
+    }
+
+    constexpr void await_suspend( std::coroutine_handle<> ) const noexcept {
+      /* TODO: anything to be done here? */
+    }
+
+    auto await_resume() const noexcept {
+      return std::move(m_ptr);
+    }
+  };
+} // namespace detail
+
+  /**
+   * Get an awaiter that results in a ttg::Ptr to a task argument.
+   * Must only be called inside a task on a value that was passed
+   * to the task and has not yet been moved on.
+   * Should be used in conjunction with co_await, e.g.,
+   *    ttg::Ptr<T> ptr = co_await ttg::get_ptr(val);
+   *
+   * Multiple values can be passed, which results in a tuple of Ptr:
+   *    ttg::Ptr<T> ptr1, ptr2;
+   *    std::tie(ptr1, ptr2) = co_await ttg::get_ptr(val1, val2);
+   */
+  template <typename Arg, typename... Args>
+  auto get_ptr(Arg&& arg, Args&&...
args) { + bool is_ready; + using tpl_type = std::tuple, std::decay...>>; + using result_type = std::pair; + result_type p = TTG_IMPL_NS::get_ptr(std::forward(arg), std::forward(args)...); + if constexpr (sizeof...(Args) > 0) { + return detail::get_ptr_tpl_t, std::decay_t...>(p.first, std::move(p.second)); + } else if constexpr (sizeof...(Args) == 0) { + return detail::get_ptr_t>(p.first, std::move(std::get<0>(p.second))); + } + } +#endif // 0 +} // namespace ttg + +#endif // TTG_PTR_H \ No newline at end of file diff --git a/ttg/ttg/run.h b/ttg/ttg/run.h index 21ec337e87..b4dc593de1 100644 --- a/ttg/ttg/run.h +++ b/ttg/ttg/run.h @@ -57,6 +57,7 @@ namespace ttg { inline void finalize() { TTG_IMPL_NS::ttg_finalize(); } /// Aborts the TTG program using the default backend's `ttg_abort` method + [[noreturn]] inline void abort() { TTG_IMPL_NS::ttg_abort(); } /// Accesses the default backend's default execution context diff --git a/ttg/ttg/serialization/splitmd_data_descriptor.h b/ttg/ttg/serialization/splitmd_data_descriptor.h index 8edfe20d6b..46bdb7b764 100644 --- a/ttg/ttg/serialization/splitmd_data_descriptor.h +++ b/ttg/ttg/serialization/splitmd_data_descriptor.h @@ -3,20 +3,10 @@ #include #include "ttg/util/meta.h" +#include "ttg/util/iovec.h" namespace ttg { - /** - * Used to describe transfer payload in types using the \sa SplitMetadataDescriptor. - * @c data Pointer to the data to be read from / written to. - * @c num_bytes The number of bytes to read from / write to the memory location - * \sa data. - */ - struct iovec { - size_t num_bytes; - void* data; - }; - /** * SplitMetadataDescriptor is a serialization descriptor provided by the user * for a user-specified type. It should contain the following public member diff --git a/ttg/ttg/terminal.h b/ttg/ttg/terminal.h index e1e3e0c739..89b26741d4 100644 --- a/ttg/ttg/terminal.h +++ b/ttg/ttg/terminal.h @@ -155,6 +155,7 @@ namespace ttg { using broadcast_callback_type = meta::detail::broadcast_callback_t>; using setsize_callback_type = typename base_type::setsize_callback_type; using finalize_callback_type = typename base_type::finalize_callback_type; + using prepare_send_callback_type = meta::detail::prepare_send_callback_t>; static constexpr bool is_an_input_terminal = true; ttg::detail::ContainerWrapper container; @@ -162,6 +163,7 @@ namespace ttg { send_callback_type send_callback; move_callback_type move_callback; broadcast_callback_type broadcast_callback; + prepare_send_callback_type prepare_send_callback; // No moving, copying, assigning permitted In(In &&other) = delete; @@ -189,10 +191,12 @@ namespace ttg { void set_callback(const send_callback_type &send_callback, const move_callback_type &move_callback, const broadcast_callback_type &bcast_callback = broadcast_callback_type{}, const setsize_callback_type &setsize_callback = setsize_callback_type{}, - const finalize_callback_type &finalize_callback = finalize_callback_type{}) { + const finalize_callback_type &finalize_callback = finalize_callback_type{}, + const prepare_send_callback_type &prepare_send_callback = prepare_send_callback_type{}) { this->send_callback = send_callback; this->move_callback = move_callback; this->broadcast_callback = bcast_callback; + this->prepare_send_callback = prepare_send_callback; base_type::set_callback(setsize_callback, finalize_callback); } @@ -271,7 +275,7 @@ namespace ttg { for (auto &&key : keylist) send(key, v); } else { /* got something we cannot iterate over (single element?) 
so put one element in the span */ - broadcast_callback(ttg::span(&keylist, 1), v); + send(ttg::span(&keylist, 1), v); } } } @@ -291,7 +295,36 @@ namespace ttg { for (auto &&key : keylist) sendk(key); } else { /* got something we cannot iterate over (single element?) so put one element in the span */ - broadcast_callback(ttg::span(&keylist, 1)); + sendk(ttg::span(&keylist, 1)); + } + } + } + + + template + void prepare_send(const rangeT &keylist, Value &&value) { + const Value &v = value; + if (prepare_send_callback) { + if constexpr (ttg::meta::is_iterable_v) { + prepare_send_callback(ttg::span(&(*std::begin(keylist)), + std::distance(std::begin(keylist), std::end(keylist))), + v); + } else { + /* got something we cannot iterate over (single element?) so put one element in the span */ + prepare_send_callback(ttg::span(&keylist, 1), v); + } + } + } + + template + void prepare_send(Value &&value) { + const Value &v = value; + if (prepare_send_callback) { + if constexpr (ttg::meta::is_iterable_v) { + prepare_send_callback(v); + } else { + /* got something we cannot iterate over (single element?) so put one element in the span */ + prepare_send_callback(v); } } } @@ -531,6 +564,32 @@ namespace ttg { } } } + + template + std::enable_if_t && !meta::is_void_v, void> + prepare_send(const rangeT &keylist, const Value &value) { + for (auto &&successor : this->successors()) { + assert(successor->get_type() != TerminalBase::Type::Write); + if (successor->get_type() == TerminalBase::Type::Read) { + return static_cast> *>(successor)->prepare_send(keylist, value); + } else if (successor->get_type() == TerminalBase::Type::Consume) { + return static_cast *>(successor)->prepare_send(keylist, value); + } + } + } + + template + std::enable_if_t && !meta::is_void_v, void> + prepare_send(const Value &value) { + for (auto &&successor : this->successors()) { + assert(successor->get_type() != TerminalBase::Type::Write); + if (successor->get_type() == TerminalBase::Type::Read) { + return static_cast> *>(successor)->prepare_send(value); + } else if (successor->get_type() == TerminalBase::Type::Consume) { + return static_cast *>(successor)->prepare_send(value); + } + } + } }; namespace meta { diff --git a/ttg/ttg/tt.h b/ttg/ttg/tt.h index 66048906a0..a6f3ef9d4d 100644 --- a/ttg/ttg/tt.h +++ b/ttg/ttg/tt.h @@ -1,16 +1,24 @@ #ifndef TTG_TT_H #define TTG_TT_H -#include #include +#include #include "ttg/fwd.h" #include "ttg/base/tt.h" #include "ttg/edge.h" +#ifdef TTG_HAS_COROUTINE +#include "ttg/util/coroutine.h" +#endif + namespace ttg { + // TODO describe TT concept (preferably as a C++20 concept) + // N.B. TT::op returns void or ttg::coroutine_handle<> + // see TTG_PROCESS_TT_OP_RETURN below + /// @brief a template task graph implementation /// It contains (owns) one or more TT objects. 
Since it can also be viewed as a TT object itself, @@ -35,12 +43,16 @@ namespace ttg { TTG(const TTG &) = delete; TTG &operator=(const TTG &) = delete; // movable - TTG(TTG && other) : TTBase(static_cast(other)), tts(other.tts), ins(std::move(other.ins)), outs(std::move(other.outs)) { - is_ttg_ = true; - own_my_tts(); + TTG(TTG &&other) + : TTBase(static_cast(other)) + , tts(other.tts) + , ins(std::move(other.ins)) + , outs(std::move(other.outs)) { + is_ttg_ = true; + own_my_tts(); } - TTG& operator=(TTG &&other) { - static_cast(*this) = static_cast(other); + TTG &operator=(TTG &&other) { + static_cast(*this) = static_cast(other); is_ttg_ = true; tts = std::move(other.tts); ins = std::move(other.ins); @@ -91,17 +103,14 @@ namespace ttg { void own_my_tts() const { for (auto &op : tts) op->owning_ttg = this; } - }; template - auto make_ttg(ttseqT &&tts, const input_terminalsT &ins, - const output_terminalsT &outs, + auto make_ttg(ttseqT &&tts, const input_terminalsT &ins, const output_terminalsT &outs, const std::string &name = "ttg") { return std::make_unique>(std::forward(tts), ins, outs, name); } - /// A data sink for one input template class SinkTT : public TTBase { @@ -126,20 +135,20 @@ namespace ttg { using valueT = std::decay_t; auto move_callback = [](const keyT &key, valueT &&value) {}; auto send_callback = [](const keyT &key, const valueT &value) {}; - auto broadcast_callback = [](const ttg::span& key, const valueT &value) {}; + auto broadcast_callback = [](const ttg::span &key, const valueT &value) {}; auto setsize_callback = [](const keyT &key, std::size_t size) {}; auto finalize_callback = [](const keyT &key) {}; input.set_callback(send_callback, move_callback, broadcast_callback, setsize_callback, finalize_callback); } - public: - SinkTT(const std::string& inname="junk") : TTBase("sink", numins, numouts) { + public: + SinkTT(const std::string &inname = "junk") : TTBase("sink", numins, numouts) { register_input_terminals(input_terminals, std::vector{inname}); register_input_callback(std::get<0>(input_terminals)); } - SinkTT(const input_edges_type &inedges, const std::string& inname="junk") : TTBase("sink", numins, numouts) { + SinkTT(const input_edges_type &inedges, const std::string &inname = "junk") : TTBase("sink", numins, numouts) { register_input_terminals(input_terminals, std::vector{inname}); register_input_callback(std::get<0>(input_terminals)); std::get<0>(inedges).set_out(&std::get<0>(input_terminals)); @@ -154,12 +163,28 @@ namespace ttg { /// Returns pointer to input terminal i to facilitate connection --- terminal cannot be copied, moved or assigned template std::tuple_element_t *in() { - static_assert(i==0); + static_assert(i == 0); return &std::get(input_terminals); } }; +} // namespace ttg + +#ifndef TTG_PROCESS_TT_OP_RETURN +#ifdef TTG_HAS_COROUTINE +#define TTG_PROCESS_TT_OP_RETURN(result, invoke) \ + { \ + using return_type = decltype(invoke); \ + if constexpr (std::is_same_v) { \ + invoke; \ + } else { \ + auto coro_return = invoke; \ + result = coro_return.address(); \ + } \ + } +#else +#define TTG_PROCESS_TT_OP_RETURN(result, invoke) invoke +#endif +#endif // !defined(TTG_PROCESS_TT_OP_RETURN) -} // namespace ttg - -#endif // TTG_TT_H +#endif // TTG_TT_H diff --git a/ttg/ttg/util/coroutine.h b/ttg/ttg/util/coroutine.h new file mode 100644 index 0000000000..3577032f42 --- /dev/null +++ b/ttg/ttg/util/coroutine.h @@ -0,0 +1,194 @@ +// +// Created by Eduard Valeyev on 10/31/22. 
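+//
+// A minimal usage sketch (hypothetical task body; start_async_work is a
+// made-up name for whatever eventually calls done.finish()):
+//
+//   ttg::resumable_task my_op(/* task args */) {
+//     ttg::event done;
+//     start_async_work(&done);                    // arranges for done.finish() to be called later
+//     co_await ttg::resumable_task_events{done};  // suspend until the event has finished
+//     /* the runtime resumes the coroutine here once ready() is true */
+//   }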
+// + +#ifndef TTG_UTIL_COROUTINE_H +#define TTG_UTIL_COROUTINE_H + +#include "ttg/config.h" +#include TTG_CXX_COROUTINE_HEADER + +#include +#include + +namespace ttg { + + struct resumable_task_state; + + // import coroutine_handle, with default promise redefined to resumable_task_state + template + using coroutine_handle = TTG_CXX_COROUTINE_NAMESPACE::coroutine_handle; + + template + struct resumable_task_events; + + /// represents a one-time event + struct event { + void finish() { finished_ = true; } + + /// @return true if the event has occurred + bool finished() const { return finished_; } + + private: + std::atomic finished_ = false; + }; + + /// task that can be resumed after some events occur + struct resumable_task : public ttg::coroutine_handle<> { + using base_type = ttg::coroutine_handle<>; + + /// these are members mandated by the promise_type concept + ///@{ + + using promise_type = struct resumable_task_state; + + ///@} + + resumable_task(base_type base) : base_type(std::move(base)) {} + + base_type handle() { return *this; } + + /// @return true if ready to resume + inline bool ready() const; + + /// @return true if task completed and can be destroyed + inline bool completed() const; + + /// @return ttg::span of events that this task depends on + inline ttg::span events(); + }; + + /// encapsulates the state of the coroutine object visible to the outside world + /// @note this is the `promise_type` for resumable_task coroutine + struct resumable_task_state { + resumable_task_state() noexcept = default; + // these only live on coroutine frames so make noncopyable and nonmovable + resumable_task_state(const resumable_task_state&) = delete; + resumable_task_state& operator=(const resumable_task_state&) = delete; + resumable_task_state(resumable_task_state&&) = delete; + resumable_task_state& operator=(resumable_task_state&&) = delete; + + constexpr static inline std::size_t MaxNumEvents = 20; + using handle_type = coroutine_handle<>; + + /// these are members mandated by the promise_type concept + ///@{ + + resumable_task get_return_object() { return resumable_task{handle_type::from_promise(*this)}; } + + /// @note start task eagerly + TTG_CXX_COROUTINE_NAMESPACE::suspend_never initial_suspend() noexcept { return {}; } + + /// @note suspend task before destroying it so the runtime can know that the task is completed + TTG_CXX_COROUTINE_NAMESPACE::suspend_always final_suspend() noexcept { + completed_ = true; + return {}; + } + void return_void() {} + void unhandled_exception() {} + + ///@} + + /// these are optional members of the promise_type concept + ///@{ + + // these can be used to use optional storage provided by the runtime (e.g. part of the runtime's task data struct) + // N.B. the existing buffer must be passed to operator new via TLS + // void* operator new(std::size_t size) + // { + // return ::operator new(size); + // } + + // N.B. 
+    // void operator delete(void* ptr, std::size_t size)
+    // {
+    //   ::operator delete(ptr, size);
+    // }
+
+    ///@}
+
+    /// @return true if ready to resume
+    constexpr bool ready() const {
+      for (std::size_t e = 0; e != nevents_; ++e)
+        if (!events_storage_[e]->finished()) return false;
+      return true;
+    }
+
+    /// @return true if the task is completed
+    constexpr bool completed() const { return completed_; }
+
+    ttg::span<event *> events() { return ttg::span(events_storage_.data(), nevents_); }
+
+   private:
+    std::array<event *, MaxNumEvents> events_storage_;
+    std::size_t nevents_;
+    bool completed_ = false;
+
+    template <std::size_t N>
+    friend struct resumable_task_events;
+
+    void reset_events() {
+      std::fill(events_storage_.begin(), events_storage_.begin() + nevents_, nullptr);
+      nevents_ = 0;
+    }
+
+    template <std::size_t N>
+    void set_events(const std::array<event *, N> events) {
+      static_assert(N <= MaxNumEvents);
+      std::copy(events.begin(), events.end(), events_storage_.begin());
+      nevents_ = N;
+    }
+  };
+
+  bool resumable_task::ready() const { return base_type::promise().ready(); }
+  bool resumable_task::completed() const { return base_type::promise().completed(); }
+  ttg::span<event *> resumable_task::events() { return base_type::promise().events(); }
+
+  /// statically-sized sequence of events on whose completion the progress of a given task depends
+  /// @note this is the `Awaiter` for resumable_task coroutine
+  /// (the concept is not defined in the standard, see
+  /// https://lewissbaker.github.io/2017/11/17/understanding-operator-co-await instead )
+  template <std::size_t N>
+  struct resumable_task_events {
+   private:
+    template <std::size_t... I>
+    constexpr bool await_ready(std::index_sequence<I...>) const {
+      return (std::get<I>(events_)->finished() && ...);
+    }
+
+   public:
+    template <typename... Events>
+    constexpr resumable_task_events(Events&&... events) : events_{(&events)...} {}
+
+    /// these are members mandated by the Awaiter concept
+    ///@{
+
+    constexpr bool await_ready() const { return await_ready(std::make_index_sequence<N>{}); }
+
+    void await_suspend(coroutine_handle<> pending_task) {
+      pending_task_ = pending_task;
+      pending_task_.promise().set_events(events_);
+    }
+
+    void await_resume() {
+      if (pending_task_) {
+        pending_task_.promise().reset_events();
+        pending_task_ = {};
+      }
+    }
+
+    ///@}
+
+   private:
+    std::array<event *, N> events_;
+    coroutine_handle<> pending_task_;
+  };  // resumable_task_events
+
+  // deduce the number of events properly
+  template <typename... Events>
+  resumable_task_events(Events&&...) -> resumable_task_events<sizeof...(Events)>;
+
+  static_assert(resumable_task_events<0>{}.await_ready() == true);
+}  // namespace ttg
+
+#endif  // TTG_UTIL_COROUTINE_H
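
To show how the pieces of coroutine.h are meant to compose, here is a hedged usage sketch (not part of the patch): a task body written as a coroutine returning ttg::resumable_task co_awaits a resumable_task_events pack built from ttg::event objects, suspends until every event has finished, and is then resumed by a driver. The globals and the hand-rolled driver in main are illustrative assumptions; in TTG the runtime backend owns this resume/destroy logic:

#include "ttg/util/coroutine.h"

#include <cstdio>

ttg::event a_ready, b_ready;  // e.g. "input buffer arrived" notifications

ttg::resumable_task consume() {
  // suspends unless both events have already finished (see await_ready above)
  co_await ttg::resumable_task_events{a_ready, b_ready};
  std::puts("both inputs ready; task continues");
}

int main() {
  auto task = consume();  // starts eagerly: initial_suspend is suspend_never
  a_ready.finish();
  b_ready.finish();
  if (!task.completed() && task.ready()) task.resume();  // all events finished -> resume
  if (task.completed()) task.destroy();  // frame survives final_suspend until destroyed
}
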
diff --git a/ttg/ttg/util/iovec.h b/ttg/ttg/util/iovec.h
new file mode 100644
index 0000000000..2a8d672b32
--- /dev/null
+++ b/ttg/ttg/util/iovec.h
@@ -0,0 +1,21 @@
+#ifndef TTG_UTIL_IOVEC_H_
+#define TTG_UTIL_IOVEC_H_
+
+#include <cstddef>
+
+namespace ttg {
+
+  /**
+   * Used to describe transfer payload in types using the \sa SplitMetadataDescriptor.
+   * @member data Pointer to the data to be read from / written to.
+   * @member num_bytes The number of bytes to read from / write to the memory location
+   * \sa data.
+   */
+  struct iovec {
+    std::size_t num_bytes;
+    void* data;
+  };
+
+}  // ttg
+
+#endif  // TTG_UTIL_IOVEC_H_
diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h
index c197761182..389a9c8d5a 100644
--- a/ttg/ttg/util/meta.h
+++ b/ttg/ttg/util/meta.h
@@ -549,6 +549,15 @@ namespace ttg {
   template <typename T>
   constexpr bool is_tuple_v = is_tuple<T>::value;

+  template <typename T>
+  struct is_span : std::false_type {};
+
+  template <typename T, std::size_t N>
+  struct is_span<ttg::span<T, N>> : std::true_type {};
+
+  template <typename T>
+  constexpr bool is_span_v = is_span<T>::value;
+
   template
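
The new is_span trait follows the same false_type/true_type specialization pattern as is_tuple above it, so generic code can branch on whether a parameter is a span of keys (e.g. for the broadcast callbacks in tt.h) or a single key. A minimal sketch of that intended use follows; it assumes is_span_v lives in the same namespace as is_tuple_v (ttg::meta in the current tree), and maybe_broadcast and send_one are made-up names:

#include "ttg/util/meta.h"

#include <cstdio>

template <typename Key>
void send_one(const Key&) { std::puts("sending to one key"); }

// branch on the new trait: a span of keys means "broadcast", anything else is a single key
template <typename T>
void maybe_broadcast(const T& key_or_keys) {
  if constexpr (ttg::meta::is_span_v<T>) {
    for (auto&& key : key_or_keys) send_one(key);
  } else {
    send_one(key_or_keys);
  }
}

int main() {
  int key = 0;
  int keys[] = {1, 2, 3};
  maybe_broadcast(key);                            // single key
  maybe_broadcast(ttg::span<const int>(keys, 3));  // span of keys
}
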