diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index a65f13cc17..0c3cb61f9f 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -13,8 +13,10 @@ jobs: os : [ macos-latest, ubuntu-20.04 ] include: - os: ubuntu-20.04 - cxx: /usr/bin/g++-9 + cc: /usr/bin/gcc-10 + cxx: /usr/bin/g++-10 - os: macos-latest + cc: clang cxx: clang++ name: "${{ matrix.os }}: ${{ matrix.cxx }} ${{ matrix.build_type }}" @@ -36,6 +38,7 @@ jobs: -DMPIEXEC_PREFLAGS='--bind-to;none;--allow-run-as-root' -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/install -DTTG_EXAMPLES=ON + -DCMAKE_CXX_STANDARD=20 steps: - uses: actions/checkout@v2 @@ -50,7 +53,7 @@ jobs: wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" sudo apt-get update - sudo apt-get -y install ninja-build g++-9 liblapack-dev libboost-dev libboost-serialization-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison cmake + sudo apt-get -y install ninja-build g++-10 liblapack-dev libboost-dev libboost-serialization-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison cmake - name: Create Build Environment # Some projects don't allow in-source building, so create a separate build directory @@ -99,7 +102,8 @@ jobs: # Note the current convention is to use the -S and -B options here to specify source # and build directories, but this is only available with CMake 3.13 and higher. # The CMake binaries on the Github Actions machines are (as of this writing) 3.12 - run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG + run: | + cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG || (cat CMakeFiles/CMakeOutput.log && cat CMakeFiles/CMakeError.log) - name: Build working-directory: ${{github.workspace}}/build @@ -124,7 +128,7 @@ jobs: working-directory: ${{github.workspace}}/build shell: bash run: | - cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/main -B test_install_devsamp -DCMAKE_PREFIX_PATH=${{github.workspace}}/install + cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/main -B test_install_devsamp -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp/CMakeFiles/CMakeOutput.log && cat test_install_devsamp/CMakeFiles/CMakeError.log) cmake --build test_install_devsamp cmake -E make_directory test_install_userexamples cat > test_install_userexamples/CMakeLists.txt <` will refer to the ``Final`` version. + + +Imported Targets +**************** + +.. imp-target:: std::coroutine + + The ``std::coroutine`` imported target is defined when any requested + version of the C++ coroutine library has been found, whether it is + *Experimental* or *Final*. + + If no version of the coroutine library is available, this target will not + be defined. + + .. note:: + This target has ``cxx_std_20`` as an ``INTERFACE`` + :ref:`compile language standard feature `. Linking + to this target will automatically enable C++20 if no later standard + version is already required on the linking target. + + +.. coro.variables: + +Variables +********* + +.. variable:: CXX_COROUTINE_COMPONENT + + Set to ``Final`` when the :find-component:`coro.Final` version of C++ + coroutine library was found, ``Experimental`` when + the :find-component:`coro.Experimental` version of C++ + coroutine library was found, otherwise not defined. + +.. 
variable:: CXX_COROUTINE_HAVE_CORO + + Set to ``TRUE`` when a coroutine header was found. + +.. variable:: CXX_COROUTINE_HEADER + + Set to either ``coroutine`` or ``experimental/coroutine`` depending on + whether :find-component:`coro.Final` or :find-component:`coro.Experimental` was + found. + +.. variable:: CXX_COROUTINE_NAMESPACE + + Set to either ``std::coroutine`` or ``std::experimental::coroutine`` + depending on whether :find-component:`coro.Final` or + :find-component:`coro.Experimental` was found. + + +Examples +******** + +Using `find_package(Coroutine)` with no component arguments: + +.. code-block:: cmake + + find_package(Coroutine REQUIRED) + + add_executable(my-program main.cpp) + target_link_libraries(my-program PRIVATE std::coroutine) + + +#]=======================================================================] + + +if(TARGET std::coroutine) + # This module has already been processed. Don't do it again. + return() +endif() + +include(CMakePushCheckState) +include(CheckIncludeFileCXX) +include(CheckCXXSourceCompiles) + +cmake_push_check_state() + +set(CMAKE_REQUIRED_QUIET ${CXXStdCoroutine_FIND_QUIETLY}) + +# Normalize and check the component list we were given +set(CXXStdCoroutines_want_components ${CXXStdCoroutine_FIND_COMPONENTS}) +if(CXXStdCoroutine_FIND_COMPONENTS STREQUAL "") + set(CXXStdCoroutines_want_components Final) +endif() + +# Warn on any unrecognized components +set(CXXStdCoroutines_extra_components ${CXXStdCoroutines_want_components}) +list(REMOVE_ITEM CXXStdCoroutines_extra_components Final Experimental) +foreach(component IN LISTS CXXStdCoroutines_extra_components) + message(WARNING "Extraneous find_package component for CXXStdCoroutine: ${component}") +endforeach() + +# clang may need to use -stdlib=c++ to have coroutines +# gcc/libstdc++ needs -fcoroutines +set(CXXStdCoroutines_find_options "" "-stdlib=libc++" "-fcoroutines") +set(CXXStdCoroutines_std_options "" "-std=c++20" "-std=c++2a") +set(CXXStdCoroutines_want_components_ordered "${CXXStdCoroutines_want_components}") +list(SORT CXXStdCoroutines_want_components_ordered ORDER DESCENDING) # Final before Experimental + +foreach(component IN LISTS CXXStdCoroutines_want_components_ordered) + if(component STREQUAL "Final") + set(_coro_header coroutine) + set(_coro_namespace std) + else() + set(_coro_header experimental/coroutine) + set(_coro_namespace std::experimental) + endif() + foreach(option IN LISTS CXXStdCoroutines_find_options) + foreach(stdoption IN LISTS CXXStdCoroutines_std_options) + cmake_push_check_state() + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${option} ${stdoption}") + + string(CONFIGURE [[ + #include <@_coro_header@> + + int main() { + auto x = @_coro_namespace@::suspend_always{}; + return 0; + } + ]] code @ONLY) + + check_cxx_source_compiles("${code}" HAVE_USABLE_${_coro_header}) + mark_as_advanced(HAVE_USABLE_${_coro_header}) + cmake_pop_check_state() + if(HAVE_USABLE_${_coro_header}) + add_library(std::coroutine INTERFACE IMPORTED GLOBAL) + target_compile_features(std::coroutine INTERFACE cxx_std_20) + if (option) + target_compile_options(std::coroutine INTERFACE "${option}") + endif() + set(CXX_COROUTINE_COMPONENT "${component}" CACHE STRING "The component of CXXStdCoroutine package found") + # break out of this loop + break() + else() + unset(HAVE_USABLE_${_coro_header} CACHE) + endif() + endforeach() # stdoption + if (TARGET std::coroutine) + break() + endif() + endforeach() # option + if (TARGET std::coroutine) + break() + endif() +endforeach() # components + 
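+# At this point the detection loops are done: if any (component, compiler-flag,
+# -std flag) combination compiled, the std::coroutine target exists and
+# _coro_header / _coro_namespace name the header and namespace that worked.
+# The cache variables below publish that result; if nothing compiled they record
+# the last attempted combination and CXXStdCoroutine_FOUND ends up FALSE.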
+set(CXX_COROUTINE_HAVE_CORO ${HAVE_USABLE_${_coro_header}} CACHE BOOL "TRUE if we have usable C++ coroutine headers") +set(CXX_COROUTINE_HEADER ${_coro_header} CACHE STRING "The header that should be included to obtain the coroutine APIs") +set(CXX_COROUTINE_NAMESPACE ${_coro_namespace} CACHE STRING "The C++ namespace that contains the coroutine APIs") + +cmake_pop_check_state() + +set(CXXStdCoroutine_FOUND ${HAVE_USABLE_${_coro_header}} CACHE BOOL "TRUE if we have usable C++ coroutine headers" FORCE) + +if(CXXStdCoroutine_FIND_REQUIRED AND NOT TARGET std::coroutine) + message(FATAL_ERROR "Cannot discover std::coroutine headers and/or compile simple program using std::coroutine") +endif() diff --git a/cmake/modules/FindCXXStdExecution.cmake b/cmake/modules/FindCXXStdExecution.cmake index 059368d1f1..acb6b1730d 100644 --- a/cmake/modules/FindCXXStdExecution.cmake +++ b/cmake/modules/FindCXXStdExecution.cmake @@ -9,8 +9,8 @@ FindCXXStdExecution ############## -This module supports the C++17 standard library's execution utilities. Use the -:imp-target:`std::execution` imported target to +This module supports the C++17 standard library's execution utilities. Link your target to the +:imp-target:`std::execution` imported target to provide standard C++ execution API. Imported Targets **************** @@ -69,9 +69,6 @@ cmake_push_check_state() set(CMAKE_REQUIRED_QUIET ${CXXStdExecution_FIND_QUIETLY}) -# All of our tests required C++17 or later -set(CMAKE_CXX_STANDARD 17) - set(CXXStdExecution_FOUND FALSE) # We have execution header, but how do we use it? Do link checks @@ -87,7 +84,7 @@ string(CONFIGURE [[ } ]] code @ONLY) -# Try to compile a simple filesystem program without any linker flags +# Try to compile a simple execution program without any linker flags check_cxx_source_compiles("${code}" CXX_EXECUTION_NO_LINK_NEEDED) set(CXXStdExecution_CAN_LINK ${CXX_EXECUTION_NO_LINK_NEEDED}) diff --git a/cmake/modules/FindOrFetchPARSEC.cmake b/cmake/modules/FindOrFetchPARSEC.cmake index 1d18b66b5b..e2854e591a 100644 --- a/cmake/modules/FindOrFetchPARSEC.cmake +++ b/cmake/modules/FindOrFetchPARSEC.cmake @@ -16,7 +16,7 @@ if (NOT TARGET PaRSEC::parsec) FetchContent_Declare( PARSEC - GIT_REPOSITORY https://github.com/TESSEOrg/parsec.git + GIT_REPOSITORY https://github.com/therault/parsec.git GIT_TAG ${TTG_TRACKED_PARSEC_TAG} ) FetchContent_MakeAvailable(PARSEC) diff --git a/cmake/ttg-config.cmake.in b/cmake/ttg-config.cmake.in index b00a9c692f..41663c806b 100644 --- a/cmake/ttg-config.cmake.in +++ b/cmake/ttg-config.cmake.in @@ -7,11 +7,16 @@ set(TTG_EXT_VERSION "@TTG_EXT_VERSION@") set(PaRSEC_CONFIG "@PaRSEC_CONFIG@") set(MADNESS_CONFIG "@MADNESS_CONFIG@") set(Boost_CONFIG "@Boost_CONFIG@") +set(CXX_COROUTINE_COMPONENT "@CXX_COROUTINE_COMPONENT@") set(TTG_TRACKED_BOOST_VERSION "@TTG_TRACKED_BOOST_VERSION@") set(TTG_IGNORE_BUNDLED_EXTERNALS @TTG_IGNORE_BUNDLED_EXTERNALS@) +# make TTG CMake modules discoverable + load AddTTGExecutable by default +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules") +include(AddTTGExecutable) + @PACKAGE_INIT@ if (NOT TARGET MADworld AND MADNESS_CONFIG) @@ -24,12 +29,17 @@ if (NOT TARGET PaRSEC::parsec AND PaRSEC_CONFIG) find_package(PaRSEC CONFIG QUIET REQUIRED COMPONENTS parsec PATHS "${PaRSEC_CONFIG_DIR}" NO_DEFAULT_PATH) endif() -# N.B. 
load Boost
+# if Boost was discovered and used at TTG configure time, discover Boost at the same path
 if (NOT TARGET Boost::boost AND Boost_CONFIG)
   get_filename_component(Boost_CONFIG_DIR "${Boost_CONFIG}" DIRECTORY)
   find_package(Boost ${TTG_TRACKED_BOOST_VERSION} CONFIG QUIET REQUIRED OPTIONAL_COMPONENTS serialization PATHS "${Boost_CONFIG_DIR}" NO_DEFAULT_PATH)
 endif()

+# if C++ coroutines were used, discover the same version of them
+if (NOT TARGET std::coroutine AND CXX_COROUTINE_COMPONENT)
+  find_package(CXXStdCoroutine MODULE QUIET REQUIRED COMPONENTS "${CXX_COROUTINE_COMPONENT}")
+endif()
+
 # Include library IMPORT targets
 if(NOT TARGET ttg)
   include("${CMAKE_CURRENT_LIST_DIR}/ttg-targets.cmake")
@@ -47,8 +57,4 @@ if (NOT TARGET Boost::boost)
   endif()
 endif()

-# load CMake modules
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules")
-include(AddTTGExecutable)
-
 set(TTG_FOUND TRUE)
diff --git a/doc/dox/contrib/CI-Administration-Notes.md b/doc/dox/contrib/CI-Administration-Notes.md
index 32542b9709..c5dd9430af 100644
--- a/doc/dox/contrib/CI-Administration-Notes.md
+++ b/doc/dox/contrib/CI-Administration-Notes.md
@@ -4,7 +4,7 @@
 * TTG uses GitHub Actions (GHA) for its CI service
 * GHA CI configuration is in file `.github/workflows/cmake.yml`. Only Linux and MacOS builds are currently supported.
 * Unlike earlier CI setups, there is no need to cache TTG prerequisites; default system-wide packages are used for most prerequisites, and the rest is compiled from source every time.
-* Doxygen documentation deployment uses a Github token that is defined as variable `GH_TTG_TOKEN` in GHA's TTG repo settings' [secrets](https://github.com/TESSEorg/ttg/settings/secrets/actions).
+* Doxygen documentation deployment uses a GitHub token that is defined as variable `GH_TTG_TOKEN` in GHA's TTG repo settings' [secrets](https://github.com/TESSEorg/ttg/settings/secrets/actions).

 # Debugging GitHub Actions jobs
diff --git a/doc/dox/contrib/Design-Device.md b/doc/dox/contrib/Design-Device.md
new file mode 100644
index 0000000000..2c40f64fda
--- /dev/null
+++ b/doc/dox/contrib/Design-Device.md
@@ -0,0 +1,175 @@
+# Device Task Design {#Design-Device}
+
+## problem statement
+TTG must be able to execute general user-defined graphs on machines with heterogeneous execution and address spaces, e.g., using multiple processes each having multiple CPU threads + device streams, with each thread/stream preferring or limited to a specific address range.
+
+## key concerns
+The key issues are how to manage:
+- the asynchrony of the device programming models, and
+- the heterogeneity of the address space.
+
+There are multiple "solutions" to each issue, hence there are many possible designs. I'll discuss each issue first, then outline the aggregate designs we are pursuing.
+
+### Memory
+- *Unified Memory (UM)*: where available, use a single address space (unified memory visible to both host and device executors; it is also possible to use pinned host memory for device calls)
+  - pro: simplifies memory management by removing the capacity limitation
+  - con: still needs user cooperation: all compute data must be allocated on the UM heap, which impacts the design of user data types, e.g. making them allocator aware, etc.
+  - con: the user will likely need to use pooled memory management for efficiency reasons (e.g., TiledArray uses Umpire)
+  - con: still necessary to provide hints to the kernel driver managing UM.
+  - con: reasoning about UM driver performance is difficult; its details are opaque and platform dependent.
+- *Device Memory (DM)*: using "native" device memory.
+  - pro: simpler performance model due to the greatest amount of control (by the runtime) over execution
+  - pro: can work with stack-capable data types
+  - con: the amount is limited, hence this memory must be explicitly managed (akin to how a cache is managed).
+
+Additional memory-related concerns common to both models:
+- only partial state needs to be transferred to/from the device
+  - which part of the state will differ from algorithm to algorithm, hence encoding/constructing such a representation cannot use constexpr code (such as traits)
+  - the need for _explicit_ handling of an object's partial state is shared by both models
+    - UM: such optimization may seem automatic (only the pages of the data actually used on the device are transferred) but in practice the data must be explicitly prefetched, hence partial state transfers are not automatic; furthermore, the unit of UM transfer is a page (4 KiB or more), which is too coarse for many applications
+    - DM: serialization of an entire object can leverage standard RDMA-like serialization, but transferring partial state requires explicit annotation
+  - hence it makes sense to make the representation of an object's partial state (`View`) a first-class concept in both models.
+
+### Asynchrony
+- *Continuations/stages*: decompose tasks into _continuations_ (stages), with runtime-managed scheduling of continuations for managing the asynchrony of the actions initiated by each continuation
+  - pro: most explicit, easiest to reason about, fewest performance implications
+  - con: most verbose; device-capable tasks look very different from host tasks
+  - con: limited composability
+    - difficult to support general computation patterns (e.g., generator continuations)
+- *"Threads"*: use threads to deal with the asynchrony (in principle could use user-space threads = fibers)
+  - pro: least host/device dichotomy
+    - tasks are ordinary (synchronous) functions
+    - fully composable
+  - con: performance implications
+    - due to the need to context switch to "yield" to other tasks
+    - thus even fully synchronous computations will suffer
+  - con: asynchrony artifacts still appear
+    - asynchronous calls must in general be annotated (to force synchronous execution and/or to provide hints to the thread scheduler)
+- *"Coroutines"*: use C++20 coroutines
+  - pro: less host/device dichotomy compared to continuations
+    - task functions "look" like ordinary functions (and can be made almost like normal functions using macros) but return a custom return object (containing the return status + a handle to the coroutine) instead of void
+    - fully composable
+  - performance implications
+    - pro: no impact on synchronous tasks
+    - con: coroutine implementation details are complex and usually involve heap allocation
+      - pro: custom allocators can be introduced to elide heap allocation (at the cost of limited generality)
+  - con: asynchrony artifacts still appear
+    - co_await annotates the spots where execution may need to be suspended
+  - con: less mature due to the need for C++20
+    - GCC (10+) and LLVM (8+) support coroutines
+    - TTG and all of its dependencies will be impacted by the raised standard requirement
+
+### other considerations
+
+- it's not possible to manage memory from device code, hence all program logic, including _device-capable_ tasks, must execute on host executors.
In principle, if we restricted ourselves to a single-source language (SYCL-extended C++) we could write device-capable tasks directly as device code, but current language limitations mandate wrapping everything into host code.
+- the runtime is still responsible for managing the executor-space heterogeneity (controlling where to launch a task) and asynchrony (events/host callbacks).
+
+## Current designs
+- *UM+threads*: use UM for memory management + threads for asynchrony
+- *DM+stages*: use PaRSEC's device memory pool manager + stage-decomposed tasks
+- *?M+coroutines*: UM/DM for memory + C++20 coroutines for handling the asynchrony
+
+### Example code: threads vs continuations vs coroutines
+
+How should we map the following host task onto the device?
+```cpp
+make_tt([](auto& key, auto& data1, auto& data2) -> void {
+  double data3 = blap::dot(data1.data(), data2.data());
+  if (data3 >= 0.)
+    send<0>(data1);
+  else
+    send<0>(data2);
+});
+```
+
+Ideally the task will receive `data1` and `data2` already transferred to the memory space(s) accessible from the device execution space:
+```cpp
+make_device_tt([](auto& key, auto& data1, auto& data2) -> void {
+  double data3 = blap::device_dot(data1.data(), data2.data());
+  if (data3 >= 0.)
+    send<0>(data1);
+  else
+    send<0>(data2);
+});
+```
+But now `data3` lives in host memory, so in general we must manage its transfer from the device. Hence either:
+- all intermediate data must be managed explicitly within the task, or
+- user types must be aware of multiple memory spaces (but this makes the state of such types asynchronous).
+
+Here are the tentative device versions of this task in each of the 3 approaches (the memory details are omitted).
+
+#### Threads
+```cpp
+make_tt([](auto& key, auto& data1, auto& data2) -> void {
+  // stage 1
+  ConstView view1(data1);
+  ConstView view2(data2);
+  double data3;
+  View view3(data3, NewView | SyncView_D2H);
+  // depending on the memory model we may need to wait here for the transfers to complete;
+  // could build the waits into the View ctors, or require an explicit await()
+
+  // stage 2
+  cublasDdot(view1.device_ptr(), view2.device_ptr(), view3.device_ptr());
+  // if an async function was called, an explicit await() is needed here
+  // also: who/how will view3 be synchronized?
+
+  if (data3 >= 0.)
+    send<0>(data1);
+  else
+    send<0>(data2);
+});
+```
+N.B. `make_tt`: this is a regular task.
+
+#### Continuations
+```cpp
+make_device_tt(
+  // stage 1
+  [](auto& key, auto& data1, auto& data2) {
+    ConstView view1(data1);
+    ConstView view2(data2);
+    double data3;
+    View view3(data3, NewView | SyncView_D2H);
+    return std::make_tuple(view1, view2, view3);
+  },
+  // stage 2
+  [](auto& key, auto& views) {
+    auto& [view1, view2, view3] = views;
+    cublasDdot(view1.device_ptr(), view2.device_ptr(), view3.device_ptr());
+  },
+  // stage 3
+  [](auto& key, auto& views) {
+    auto& [view1, view2, view3] = views;
+    if (*view3.host_ptr() >= 0.)
+      send<0>(view1.get_host_object());  // i.e. data1
+    else
+      send<0>(view2.get_host_object());  // i.e. data2
+  }
+);
+```
+N.B. `make_device_tt` vs `make_tt`: this is a special task.
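+
+The `View` machinery is deliberately left unspecified above. For orientation only, here is a minimal sketch of the shape such a type could take; the member names are assumptions for exposition (roughly consistent with how the sketches here and the mock example use it), not the actual TTG API:
+```cpp
+#include <cstddef>
+
+// Hypothetical sketch: pairs a host object with a typed span of memory that the
+// runtime mirrors on the device. device_ptr() becomes valid once the runtime
+// has completed (or been told to skip) the host-to-device transfer.
+template <typename HostT, typename T>
+class View {
+ public:
+  View(HostT& host, T* host_ptr, std::size_t count)
+      : host_(&host), host_ptr_(host_ptr), count_(count) {}
+  HostT& get_host_object() const { return *host_; }
+  T* host_ptr() const { return host_ptr_; }      // address valid on the host
+  T* device_ptr() const { return device_ptr_; }  // address valid on the device
+  std::size_t size() const { return count_; }
+ private:
+  HostT* host_;
+  T* host_ptr_;
+  T* device_ptr_ = nullptr;  // set by the runtime when the view is synced in
+  std::size_t count_;
+};
+```
+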
+
+#### Coroutines
+```cpp
+make_tt([](auto& key, auto& data1, auto& data2) -> ttg::resumable_task {
+  // stage 1
+  ConstView view1(data1);
+  ConstView view2(data2);
+  double data3;
+  View view3(data3, NewView | SyncView_D2H);
+  co_await sync_views(view1, view2, view3); // creates list of transfers to be fulfilled by the runtime
+
+  // stage 2
+  cublasDdot(view1.device_ptr(), view2.device_ptr(), view3.device_ptr());
+  co_await sync_views(view3); // syncs view3; since transfers and kernels execute in different streams the runtime will sync the kernel stream, then launch the transfers, then resume here
+
+  if (data3 >= 0.)
+    send<0>(data1);
+  else
+    send<0>(data2);
+  co_return; // processes sends and destroys the coroutine
+}, ...);
+```
+N.B. `make_tt` and `ttg::resumable_task`: this is a regular task but with a special return type.
diff --git a/doc/Pull-terminal-design-doc.md b/doc/dox/contrib/Pull-terminal-design-doc.md
similarity index 98%
rename from doc/Pull-terminal-design-doc.md
rename to doc/dox/contrib/Pull-terminal-design-doc.md
index 377c42b221..1af4ed11b7 100644
--- a/doc/Pull-terminal-design-doc.md
+++ b/doc/dox/contrib/Pull-terminal-design-doc.md
@@ -1,4 +1,4 @@
-# Pull Terminals - Design Notes
+# Pull Terminals Design Notes {#Design-Pull}

 ### Motivation

@@ -49,7 +49,3 @@
 - Should Pull Op be able to send data to multiple successors? Use cases?
   - Cholesky - why pull ops are needed?
-
-
-
-
diff --git a/doc/dox/contrib/top.md b/doc/dox/contrib/top.md
index 87fd1edd82..3cf224b2f4 100644
--- a/doc/dox/contrib/top.md
+++ b/doc/dox/contrib/top.md
@@ -1,7 +1,12 @@
 # Contributor Guide {#contribguide}

-* [TTG Build Intrastructure](@ref TTG-Build-Infrastructure)
+* Development
+  - [TTG Build Infrastructure](@ref TTG-Build-Infrastructure)
+  - [Documenting TTG](@ref Documenting-TTG)
+  - [Recommended Workflow Elements](@ref Recommended-Workflow-Elements)
+  - [Coding Standard](@ref Coding-Standard)
+* Design Notes
+  - [Pull Terminals](@ref Design-Pull)
+  - [Device Tasks](@ref Design-Device)
+* Maintenance
+  - [Managing Continuous Integration (CI)](@ref CI-Administration-Notes)
-* [Documenting TTG](@ref Documenting-TTG)
-* [Recommended Workflow Elements](@ref Recommended-Workflow-Elements)
-* [CodingStandard](@ref Coding-Standard)
diff --git a/examples/device_mock/device_mock.cc b/examples/device_mock/device_mock.cc
new file mode 100644
index 0000000000..8722591e70
--- /dev/null
+++ b/examples/device_mock/device_mock.cc
@@ -0,0 +1,188 @@
+// clang-format off
+
+#include
+#include
+#include "../matrixtile.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "ttg/util/meta.h"
+
+#include
+
+using Key2 = ttg::MultiIndex<2>;
+using Key3 = ttg::MultiIndex<3>;
+
+/* number of tiles */
+#define KT 100
+
+template <typename T>
+auto make_gemm(ttg::Edge<Key3, MatrixTile<T>>& A,
+               ttg::Edge<Key3, MatrixTile<T>>& B,
+               ttg::Edge<Key2, MatrixTile<T>>& output_result)
+{
+
+  ttg::Edge<Key3, MatrixTile<T>> C;
+  auto f_cpu = [=](const Key3& key,
+                   const MatrixTile<T>& A,
+                   const MatrixTile<T>& B,
+                   MatrixTile<T>& C,
+                   std::tuple<ttg::Out<Key2, MatrixTile<T>>,
+                              ttg::Out<Key3, MatrixTile<T>>>& out)
+  {
+    int m = key[0];
+    int n = key[1];
+    int k = key[2];
+
+    /*
+    if(k == 0) {
+      dlprng(C.data(), 1789, A.mb()*B.nb());
+    }
+    dgemm(A.data(), A.mb(), A.nb(),
+          B.data(), B.mb(), B.nb(),
+          1.0,
+          C.data(), C.nb(), C.nb());
+    */
+
+    if( k == KT-1 || C.data()[0] < 1e-9 ) {
+      ttg::send<0>(Key2{m, n}, std::move(C));
+    } else {
+      ttg::send<1>(Key3{m, n, k+1}, std::move(C));
+    }
+  };
+
+  auto f_gpu_host_views = [=](const Key3& key,
+                              const MatrixTile<T>& A,
+                              const MatrixTile<T>& B,
+                              MatrixTile<T>&& C)
+  {
+    // The default ViewScope::SyncIn scope tells the runtime that the data should be copied
// to the device before the kernel callable is invoked. + ttg::View, const T> dev_A = ttg::make_view( A, ttg::ViewSpan(A.data(), A.size()) ); + ttg::View, const T> dev_B = ttg::make_view( B, ttg::ViewSpan(B.data(), B.size()) ); + ttg::View, T> dev_C; + ttg::View dev_tmp; + T *host_tmp = new(T); + // ViewScope::SyncOut tells the runtime system that the view should be synchronized back to the + // host before invoking the output callable. + dev_tmp = ttg::make_view( *host_tmp, ttg::ViewSpan(host_tmp, 1, ttg::ViewScope::SyncOut) ); + + int k = key[2]; + if(0 == k) { + // ViewScope::Allocate tells the runtime system that the device view needs to be allocated but doesn't need to be + // initialized with C.data(). However, C.data() is still associated with the device memory, so if the + // runtime system evicts that data from the device, it will be first copied back into C.data(). + dev_C = ttg::make_view( C, ttg::ViewSpan(C.data(), C.size(), ttg::ViewScope::Allocate) ); + } else { + dev_C = ttg::make_view( C, ttg::ViewSpan(C.data(), C.size()) ); + } + + return std::make_tuple(dev_A, dev_B, dev_C, dev_tmp); + }; + + auto f_gpu_kernel = [=](const Key3& key, + ttg::View, const T>& dev_A, + ttg::View, const T>& dev_B, + ttg::View, T>& dev_C, + ttg::View& dev_tmp) + { + int k = key[2]; + + const MatrixTile& A = dev_A.get_host_object(); + const MatrixTile& B = dev_B.get_host_object(); + MatrixTile& C = dev_C.get_host_object(); + T& host_tmp = dev_tmp.get_host_object(); + auto beta = 1.0; + if(k == 0) { + //cublasDplrng(dev_C.get(0), C.mb(), C.nb()); + } + + /* + cublasDgemm(dev_A.get(0), A.mb(), A.nb(), + dev_B.get(0), B.mb(), B.nb(), + beta, + dev_C.get(0), C.mb(), C.nb()); + + cudaMemcpyAsync(&dev_C.get(0)[0], host_tmp, sizeof(T), cudaDeviceToHost); + */ + }; + + auto f_gpu_output_flows = [=](const Key3& key, + const MatrixTile& A, + const MatrixTile& B, + MatrixTile& C, + T& host_tmp) + { + int m = key[0]; + int n = key[1]; + int k = key[2]; + + if( k == KT-1 || host_tmp < 1e-9 ) { + ttg::send<0>(Key2{m, n}, std::move(C)); + } else { + ttg::send<1>(Key3{m, n, k+1}, std::move(C)); + } + delete &host_tmp; + }; + + //ttg::meta::type_printer x; + + /* If we only have GPU */ + auto gemm_tt = ttg::make_device_tt(f_gpu_host_views, f_gpu_kernel, f_gpu_output_flows, ttg::ExecutionSpace::CUDA, + ttg::edges(A, B, C), ttg::edges(output_result, C), + "GEMM", {"A", "B", "C"}, {"output_result", "C"}); + +#if 0 + /* Alternative: to get both type of tasklets: */ + auto gemm_tt = ttg::make_device_tt(f_cpu, f_gpu_host_views, f_gpu_kernel, f_gpu_output_flows, ttg::ExecutionSpace::CUDA, + ttg::edges(A, B), ttg::edges(output_result, C), + "GEMM", {"A", "B"}, {"output_result", "C"}); +#endif + return gemm_tt; +} + +int main(int argc, char **argv) +{ + + std::chrono::time_point beg, end; + int N = 1024; + int M = N; + int NB = 128; + int check = 0; + int nthreads = -1; + const char* prof_filename = nullptr; + + if (argc > 1) { + N = M = atoi(argv[1]); + } + + if (argc > 2) { + NB = atoi(argv[2]); + } + + if (argc > 3) { + check = atoi(argv[3]); + } + + if (argc > 4) { + nthreads = atoi(argv[4]); + } + + ttg::initialize(argc, argv, nthreads); + + auto world = ttg::default_execution_context(); + + ttg::Edge> edge_a, edge_b; + ttg::Edge> edge_out; + + auto gemm_tt = make_gemm(edge_a, edge_b, edge_out); + + ttg::fence(); + + ttg::finalize(); + return 0; +} diff --git a/examples/madness/mrattg.cc b/examples/madness/mrattg.cc index 47bce1ce94..2b2d6c113b 100644 --- a/examples/madness/mrattg.cc +++ b/examples/madness/mrattg.cc 
@@ -124,6 +124,46 @@ auto make_project(functorT& f, return ttg::make_tt(F, edges(fuse(refine, ctl)), edges(refine, result), name, {"control"}, {"refine", "result"}); } + +/// Returns an std::unique_ptr to the object +template +auto make_project_device(functorT& f, + const T thresh, /// should be scalar value not complex + ctlEdge& ctl, rnodeEdge& result, const std::string& name = "project") { + auto F = [f, thresh](const Key& key, std::tuple, rnodeOut>& out) { + FunctionReconstructedNode node(key); // Our eventual result + auto& coeffs = node.coeffs; // Need to clean up OO design + bool is_leaf; + + if (key.level() < initial_level(f)) { + for (auto child : children(key)) ttg::sendk<0>(child, out); + coeffs = T(1e7); // set to obviously bad value to detect incorrect use + is_leaf = false; + } else if (is_negligible(f, Domain::template bounding_box(key), + truncate_tol(key, thresh))) { + coeffs = T(0.0); + is_leaf = true; + } else { + auto node_view = ttg::make_view(node, ttg::ViewScope::Out); // no need to move node onto the device + auto is_leaf_view = ttg::make_view(is_leaf, ttg::ViewScope::Out); + co_await ttg::device::wait_views{}; + fcoeffs(f, key, thresh, + node_view.get_device_ptr<0>(), + is_leaf_view.get_device_ptr<0>()); // cannot deduce K + co_await ttg::device::wait_kernel{}; + if (!is_leaf) { + for (auto child : children(key)) ttg::sendk<0>(child, out); // should be broadcast ? + } + } + node.is_leaf = is_leaf; + ttg::send<1>(key, node, out); // always produce a result + }; + ctlEdge refine("refine"); + return ttg::make_tt(F, edges(fuse(refine, ctl)), edges(refine, result), name, {"control"}, {"refine", "result"}); +} + + + namespace detail { template struct tree_types {}; diff --git a/examples/potrf/pmw.h b/examples/potrf/pmw.h index 14fc6ce5b1..076f1fffca 100644 --- a/examples/potrf/pmw.h +++ b/examples/potrf/pmw.h @@ -23,13 +23,13 @@ struct type2matrixtype template<> struct type2matrixtype { - static constexpr const matrix_type value = matrix_type::matrix_RealFloat; + static constexpr const parsec_matrix_type_t value = parsec_matrix_type_t::PARSEC_MATRIX_FLOAT; }; template<> struct type2matrixtype { - static constexpr const matrix_type value = matrix_type::matrix_RealDouble; + static constexpr const parsec_matrix_type_t value = parsec_matrix_type_t::PARSEC_MATRIX_DOUBLE; }; template @@ -97,8 +97,8 @@ class PaRSECMatrixWrapper { } bool in_matrix(int row, int col) const { - return (pm->uplo == matrix_Lower && col <= row) || - (pm->uplo == matrix_Upper && col >= row); + return (pm->uplo == PARSEC_MATRIX_LOWER && col <= row) || + (pm->uplo == PARSEC_MATRIX_UPPER && col >= row); } PaRSECMatrixT* parsec() { diff --git a/examples/potrf/testing_dlauum.cc b/examples/potrf/testing_dlauum.cc index 0cd7c863bf..7176000943 100644 --- a/examples/potrf/testing_dlauum.cc +++ b/examples/potrf/testing_dlauum.cc @@ -6,6 +6,7 @@ #endif // TTG_USE_PARSEC #include +#include #include "lauum.h" #include "plgsy.h" @@ -71,10 +72,10 @@ int main(int argc, char **argv) std::cout << "Creating 2D block cyclic matrix with NB " << NB << " N " << N << " M " << M << " P " << P << std::endl; - sym_two_dim_block_cyclic_t dcA; - sym_two_dim_block_cyclic_init(&dcA, matrix_type::matrix_RealDouble, - world.size(), world.rank(), NB, NB, N, M, - 0, 0, N, M, P, matrix_Lower); + parsec_matrix_sym_block_cyclic_t dcA; + parsec_matrix_sym_block_cyclic_init(&dcA, parsec_matrix_type_t::PARSEC_MATRIX_DOUBLE, + world.rank(), NB, NB, N, M, + 0, 0, N, M, P, Q, PARSEC_MATRIX_LOWER); dcA.mat = 
parsec_data_allocate((size_t)dcA.super.nb_local_tiles * (size_t)dcA.super.bsiz * (size_t)parsec_datadist_getsizeoftype(dcA.super.mtype)); @@ -134,7 +135,7 @@ int main(int argc, char **argv) //delete A; /* cleanup allocated matrix before shutting down PaRSEC */ parsec_data_free(dcA.mat); dcA.mat = NULL; - parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcA); + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA); world.dag_off(); world.profile_off(); @@ -142,3 +143,35 @@ int main(int argc, char **argv) ttg::finalize(); return ret; } + +static void +dplasma_dprint_tile( int m, int n, + const parsec_tiled_matrix_t* descA, + const double *M ) +{ + int tempmm = ( m == descA->mt-1 ) ? descA->m - m*descA->mb : descA->mb; + int tempnn = ( n == descA->nt-1 ) ? descA->n - n*descA->nb : descA->nb; + int ldam = BLKLDD( descA, m ); + + int ii, jj; + + fflush(stdout); + for(ii=0; ii +#include +#include #include "plgsy.h" #include "pmw.h" @@ -115,10 +117,15 @@ int main(int argc, char **argv) std::cout << "Creating 2D block cyclic matrix with NB " << NB << " N " << N << " M " << M << " P " << P << std::endl; } - sym_two_dim_block_cyclic_t dcA; - sym_two_dim_block_cyclic_init(&dcA, matrix_type::matrix_RealDouble, - world.size(), world.rank(), NB, NB, N, M, - 0, 0, N, M, P, matrix_Lower); + parsec_matrix_sym_block_cyclic_t dcA; + parsec_matrix_sym_block_cyclic_init(&dcA, parsec_matrix_type_t::PARSEC_MATRIX_DOUBLE, + world.rank(), + NB, NB, + N, M, + 0, 0, + N, M, + P, Q, + PARSEC_MATRIX_LOWER); dcA.mat = parsec_data_allocate((size_t)dcA.super.nb_local_tiles * (size_t)dcA.super.bsiz * (size_t)parsec_datadist_getsizeoftype(dcA.super.mtype)); @@ -387,7 +394,7 @@ int main(int argc, char **argv) //delete A; /* cleanup allocated matrix before shutting down PaRSEC */ parsec_data_free(dcA.mat); dcA.mat = NULL; - parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcA); + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA); ttg::finalize(); return ret; diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index 365a10ddce..d5686042fc 100644 --- a/examples/potrf/testing_dpotrf.cc +++ b/examples/potrf/testing_dpotrf.cc @@ -1,4 +1,5 @@ #include +#include #include "plgsy.h" #include "pmw.h" @@ -75,10 +76,10 @@ int main(int argc, char **argv) std::cout << "Creating 2D block cyclic matrix with NB " << NB << " N " << N << " M " << M << " P " << P << std::endl; - sym_two_dim_block_cyclic_t dcA; - sym_two_dim_block_cyclic_init(&dcA, matrix_type::matrix_RealDouble, - world.size(), world.rank(), NB, NB, N, M, - 0, 0, N, M, P, matrix_Lower); + parsec_matrix_sym_block_cyclic_t dcA; + parsec_matrix_sym_block_cyclic_init(&dcA, parsec_matrix_type_t::PARSEC_MATRIX_DOUBLE, + world.rank(), NB, NB, N, M, + 0, 0, N, M, P, Q, PARSEC_MATRIX_LOWER); dcA.mat = parsec_data_allocate((size_t)dcA.super.nb_local_tiles * (size_t)dcA.super.bsiz * (size_t)parsec_datadist_getsizeoftype(dcA.super.mtype)); @@ -202,7 +203,7 @@ int main(int argc, char **argv) /* cleanup allocated matrix before shutting down PaRSEC */ parsec_data_free(dcA.mat); dcA.mat = NULL; - parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcA); + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA); world.profile_off(); @@ -210,6 +211,38 @@ int main(int argc, char **argv) return ret; } +static void +dplasma_dprint_tile( int m, int n, + const parsec_tiled_matrix_t* descA, + const double *M ) +{ + int tempmm = ( m == descA->mt-1 ) ? descA->m - m*descA->mb : descA->mb; + int tempnn = ( n == descA->nt-1 ) ? 
descA->n - n*descA->nb : descA->nb; + int ldam = BLKLDD( descA, m ); + + int ii, jj; + + fflush(stdout); + for(ii=0; ii +#include #include "plgsy.h" #include "pmw.h" @@ -92,10 +93,10 @@ int main(int argc, char **argv) std::cout << "Creating 2D block cyclic matrix with NB " << NB << " N " << N << " M " << M << " P " << P << std::endl; - sym_two_dim_block_cyclic_t dcA; - sym_two_dim_block_cyclic_init(&dcA, matrix_type::matrix_RealDouble, - world.size(), world.rank(), NB, NB, N, M, - 0, 0, N, M, P, uplo == lapack::Uplo::Lower ? matrix_Lower : matrix_Upper); + parsec_matrix_sym_block_cyclic_t dcA; + parsec_matrix_sym_block_cyclic_init(&dcA, parsec_matrix_type_t::PARSEC_MATRIX_DOUBLE, + world.rank(), NB, NB, N, M, + 0, 0, N, M, P, Q, uplo == lapack::Uplo::Lower ? PARSEC_MATRIX_LOWER : PARSEC_MATRIX_UPPER); dcA.mat = parsec_data_allocate((size_t)dcA.super.nb_local_tiles * (size_t)dcA.super.bsiz * (size_t)parsec_datadist_getsizeoftype(dcA.super.mtype)); @@ -222,7 +223,7 @@ int main(int argc, char **argv) } parsec_data_free(dcA.mat); dcA.mat = NULL; - parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcA); + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA); world.dag_off(); world.profile_off(); @@ -231,6 +232,38 @@ int main(int argc, char **argv) return ret; } +static void +dplasma_dprint_tile( int m, int n, + const parsec_tiled_matrix_t* descA, + const double *M ) +{ + int tempmm = ( m == descA->mt-1 ) ? descA->m - m*descA->mb : descA->mb; + int tempnn = ( n == descA->nt-1 ) ? descA->n - n*descA->nb : descA->nb; + int ldam = BLKLDD( descA, m ); + + int ii, jj; + + fflush(stdout); + for(ii=0; ii>>(buffer, scratch); + +} + +#endif // TTG_HAVE_CUDA \ No newline at end of file diff --git a/tests/unit/cuda_kernel.h b/tests/unit/cuda_kernel.h new file mode 100644 index 0000000000..4fec87a999 --- /dev/null +++ b/tests/unit/cuda_kernel.h @@ -0,0 +1,4 @@ +#include "ttg/config.h" +#include + +void increment_buffer(double* buffer, std::size_t buffer_size, double* scratch, std::size_t scratch_size); \ No newline at end of file diff --git a/tests/unit/device_coro.cc b/tests/unit/device_coro.cc new file mode 100644 index 0000000000..5b51b0724d --- /dev/null +++ b/tests/unit/device_coro.cc @@ -0,0 +1,220 @@ +#include + +#include "ttg.h" +#include "ttg/view.h" + +#include "cuda_kernel.h" + +struct value_t { + ttg::buffer db; // TODO: rename + int quark; + + template + void ttg_serialize(Archive& ar) { + ar& quark; + ar& db; // input: + } +}; + +/* devicebuf is non-POD so provide serialization + * information for members not a devicebuf */ +namespace madness::archive { + template + struct ArchiveSerializeImpl { + static inline void serialize(const Archive& ar, value_t& obj) { ar& obj.quark & obj.db; }; + }; +} // namespace madness::archive + + +TEST_CASE("Device", "coro") { + + SECTION("devicebuf") { + + ttg::Edge edge; + auto fn = [&](const int& key, value_t&& val) -> ttg::device_task { + ttg::print("device_task key ", key); + /* wait for the view to be available on the device */ + co_await ttg::to_device(val.db); + /* once we're back here the data has been transferred */ + CHECK(val.db.current_device_ptr() != nullptr); + + /* NO KERNEL */ + + /* here we suspend to wait for a kernel to complete */ + co_await ttg::wait_kernel(); + + /* we're back, the kernel executed and we can send */ + if (key < 10) { + /* TODO: should we move the view in here if we want to get the device side data */ + //ttg::send<0>(key+1, std::move(val)); + co_await ttg::device::forward(ttg::device::send<0>(key+1, 
std::move(val))); + } + }; + + //ptr.get_view(device_id); + + auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge), + "device_task", {"edge_in"}, {"edge_out"}); + make_graph_executable(tt); + if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{}); + ttg::ttg_fence(ttg::default_execution_context()); + } + + SECTION("scratch") { + + ttg::Edge edge; + auto fn = [&](const int& key, value_t&& val) -> ttg::device_task { + double scratch = 0.0; + ttg::devicescratch ds = ttg::make_scratch(&scratch, ttg::scope::Allocate); + + /* wait for the view to be available on the device */ + co_await ttg::to_device(ds, val.db); + /* once we're back here the data has been transferred */ + CHECK(ds.device_ptr() != nullptr); + + /* call a kernel */ +#ifdef TTG_HAVE_CUDA + increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size()); +#endif // TTG_HAVE_CUDA + + /* here we suspend to wait for a kernel to complete */ + co_await ttg::wait_kernel(ds); + +#ifdef TTG_HAVE_CUDA + /* buffer is increment once per task, so it should be the same as key */ + CHECK((static_cast(scratch)-1) == key); +#endif // 0 + + /* we're back, the kernel executed and we can send */ + if (key < 10) { + /* TODO: should we move the view in here if we want to get the device side data */ + //ttg::send<0>(key+1, std::move(val)); + /* NOTE: we use co_await here instead of co_return because co_return destroys all local variables first; + * we will not return from this co_await!*/ + co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(val))); + } + }; + + auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge), + "device_task", {"edge_in"}, {"edge_out"}); + make_graph_executable(tt); + if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{}); + ttg::ttg_fence(ttg::default_execution_context()); + } + + SECTION("ptr") { + + ttg::Edge edge; + ttg::Ptr ptr; + int last_key = 0; + auto fn = [&](const int& key, value_t&& val) -> ttg::device_task { + double scratch = 1.0; + ttg::devicescratch ds = ttg::make_scratch(&scratch, ttg::scope::SyncIn); + + /* wait for the view to be available on the device */ + co_await ttg::to_device(ds, val.db); + /* once we're back here the data has been transferred */ + CHECK(ds.device_ptr() != nullptr); + + /* KERNEL */ +#ifdef TTG_HAVE_CUDA + increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size()); +#endif // TTG_HAVE_CUDA + + /* here we suspend to wait for a kernel and the out-transfer to complete */ + co_await ttg::wait_kernel(val.db, ds); + +#ifdef TTG_HAVE_CUDA + /* buffer is increment once per task, so it should be the same as key */ + CHECK(static_cast(scratch) == key+1); + CHECK(static_cast(*val.db.host_ptr()) == key+1); +#endif // TTG_HAVE_CUDA + + /* we're back, the kernel executed and we can send */ + if (key < 10 || scratch < 0.0) { + //ttg::send<0>(key+1, std::move(val)); + co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(val))); + } else { + /* exfiltrate the value */ + /* TODO: what consistency do we expect from get_ptr? 
*/ + ptr = ttg::get_ptr(val); + last_key = key; + } + }; + + //ptr.get_view(device_id); + + auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge), + "device_task", {"edge_in"}, {"edge_out"}); + make_graph_executable(tt); + if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{}); + ttg::ttg_fence(ttg::default_execution_context()); + CHECK(ptr.is_valid()); + + /* feed the ptr back into a graph */ + if (ttg::default_execution_context().rank() == 0) tt->invoke(last_key+1, ptr); + ttg::ttg_fence(ttg::default_execution_context()); + + ptr.reset(); + } + +#if 0 + /* TODO: enabel this test once we control the PaRSEC state machine! */ + SECTION("device_host_tasks") { + + ttg::Edge h2d, d2h; + + auto host_fn = [&](const int& key, value_t&& val) { + /* check that the data has been synced back */ + CHECK(static_cast(*val.db.host_ptr()) == key); + + /* modify the data */ + *val.db.host_ptr() += 1.0; + CHECK(static_cast(*val.db.host_ptr()) == key+1); + + /* send back to the device */ + ttg::send<0>(key+1, std::move(val)); + }; + auto htt = ttg::make_tt(host_fn, ttg::edges(d2h), ttg::edges(h2d), + "host_task", {"d2h"}, {"h2d"}); + + auto device_fn = [&](const int& key, value_t&& val) -> ttg::device_task { + double scratch = 0.0; + ttg::devicescratch ds = ttg::make_scratch(&scratch, ttg::scope::SyncOut); + + /* wait for the view to be available on the device */ + co_await ttg::to_device(ds, val.db); + /* once we're back here the data has been transferred */ + CHECK(ds.device_ptr() != nullptr); + + /* call a kernel */ +#ifdef TTG_HAVE_CUDA + increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size()); +#endif // TTG_HAVE_CUDA + + /* here we suspend to wait for a kernel to complete */ + co_await ttg::wait_kernel(); + +#ifdef TTG_HAVE_CUDA + /* buffer is increment once per task, so it should be the same as key */ + CHECK((static_cast(scratch)-1) == key); +#endif // 0 + + /* we're back, the kernel executed and we can send */ + if (key < 10) { + /* TODO: should we move the view in here if we want to get the device side data */ + ttg::send<0>(key+1, std::move(val)); + } + }; + + auto dtt = ttg::make_tt(device_fn, ttg::edges(h2d), ttg::edges(d2h), + "device_task", {"h2d"}, {"d2h"}); + make_graph_executable(dtt); + if (ttg::default_execution_context().rank() == 0) htt->invoke(0, value_t{}); + ttg::ttg_fence(ttg::default_execution_context()); + } + + +#endif // 0 + +} \ No newline at end of file diff --git a/tests/unit/fibonacci-coro.cc b/tests/unit/fibonacci-coro.cc new file mode 100644 index 0000000000..e8ed5d1ba2 --- /dev/null +++ b/tests/unit/fibonacci-coro.cc @@ -0,0 +1,122 @@ +#include + +#include "ttg.h" + +#include "ttg/serialization/std/pair.h" +#include "ttg/util/hash/std/pair.h" + +#include "ttg/util/coroutine.h" + +constexpr int64_t N = 1000; + +TEST_CASE("Fibonacci-coroutines", "[fib][core]") { + // compute the reference result + int reference_result = 0; + { + // recursive lambda pattern from http://pedromelendez.com/blog/2015/07/16/recursive-lambdas-in-c14/ + auto compute_reference_result = [&reference_result](int f_np1, int f_n) { + auto impl = [&reference_result](int f_np1, int f_n, const auto &impl_ref) -> void { + assert(f_n < N); + reference_result += f_n; + if (f_np1 < N) { + const auto f_np2 = f_np1 + f_n; + impl_ref(f_np2, f_np1, impl_ref); + } + }; + impl(f_np1, f_n, impl); + }; + compute_reference_result(1, 0); + } + + SECTION("shared-memory") { + if (ttg::default_execution_context().size() == 1) { + ttg::Edge F2F; + ttg::Edge F2P; + 
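+      // F2F recycles each new Fibonacci value back into this TT; F2P streams
+      // values to the printer TT, whose input reducer accumulates their sum.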
+ // N.B. wrap a trivial (nonsuspending) coroutine using make_tt! + auto fib_op = ttg::make_tt( + // computes next value: F_{n+2} = F_{n+1} + F_{n}, seeded by F_1 = 1, F_0 = 0 + // N.B. can't autodeduce return type, must explicitly declare the return type + [](const int &F_n_plus_1, const int &F_n) -> ttg::resumable_task { + // on 1 process the right order of sends can avoid the race iff reductions are inline (on-current-thread) + // and not async (nthread>1): + // - send<1> will call wc->set_arg which will eagerly reduce the argument + // - send<0> then will call wa->set_arg which will create task for key F_np2 ... that can potentially call + // finalize<1> in the other clause + // - reversing the order of sends will create a race between wc->set_arg->send<1> executing on this thread + // and wa->set_arg->finalize<1> executing in thread pool + // - there is no way to detect the "undesired" outcome of the race without keeping expired TTArgs from the + // cache there is no way currently to avoid race if there is more than 1 process ... need to track the + // number of messages that the reducing terminal will receive, that's what distributed example demonstrates. + // The order of operations will still matter. + if (F_n_plus_1 < N) { + const auto F_n_plus_2 = F_n_plus_1 + F_n; + // cool, if there are no events to wait for co_await is no-op + co_await ttg::resumable_task_events{}; + ttg::sendv<1>(F_n_plus_1); + ttg::send<0>(F_n_plus_2, F_n_plus_1); + } else + ttg::finalize<1>(); + + // to test coro-based task lifecycle introduce fake events + ttg::event null_event; + co_await ttg::resumable_task_events{null_event}; + + // N.B. return void just as normal TT op + co_return; + }, + ttg::edges(F2F), ttg::edges(F2F, F2P)); + auto print_op = ttg::make_tt( + [reference_result](const int &value, std::tuple<> &out) { + ttg::print("sum of Fibonacci numbers up to ", N, " = ", value); + CHECK(value == reference_result); + }, + ttg::edges(F2P), ttg::edges()); + print_op->set_input_reducer<0>([](int &a, const int &b) { a = a + b; }); + make_graph_executable(fib_op); + if (ttg::default_execution_context().rank() == 0) fib_op->invoke(1, 0); + ttg::ttg_fence(ttg::default_execution_context()); + } + } + + // in distributed memory we must count how many messages the reducer will receive + SECTION("distributed-memory") { + ttg::Edge> F2F; + ttg::Edge F2P; + const auto nranks = ttg::default_execution_context().size(); + + auto fib_op = ttg::make_tt( + // computes next value: F_{n+2} = F_{n+1} + F_{n}, seeded by F_1 = 1, F_0 = 0 + [](const int &n, const std::pair &F_np1_n) { + const auto &[F_n_plus_1, F_n] = F_np1_n; + if (F_n_plus_1 < N) { + const auto F_n_plus_2 = F_n_plus_1 + F_n; + ttg::print("sent ", F_n_plus_1, " to fib reducer"); + ttg::sendv<1>(F_n_plus_1); + ttg::send<0>(n + 1, std::make_pair(F_n_plus_2, F_n_plus_1)); + } else { + // how many messages the reducer should expect to receive + ttg::set_size<1>(n); + ttg::print("fib reducer will expect ", n, " messages"); + } + }, + ttg::edges(F2F), ttg::edges(F2F, F2P)); + auto print_op = ttg::make_tt( + [reference_result](const int &value, std::tuple<> &out) { + ttg::print("sum of Fibonacci numbers up to ", N, " = ", value); + CHECK(value == reference_result); + }, + ttg::edges(F2P), ttg::edges()); + // move all fib tasks to last rank, all reductions will happen on 0 => for some reason no reductions occur! 
+ fib_op->set_keymap([=](const auto &key) { return nranks - 1; }); + fib_op->set_trace_instance(true); + print_op->set_input_reducer<0>([](int &a, const int &b) { + ttg::print("fib reducer: current value = ", a, ", incremented by ", b, " set to ", a + b); + a = a + b; + }); + make_graph_executable(fib_op); + ttg::ttg_fence(ttg::default_execution_context()); + if (ttg::default_execution_context().rank() == 0) fib_op->invoke(0, std::make_pair(1, 0)); + ttg::ttg_fence(ttg::default_execution_context()); + } +} // TEST_CAST("Fibonacci") diff --git a/tests/unit/tt.cc b/tests/unit/tt.cc index f77a483ef9..def14684a5 100644 --- a/tests/unit/tt.cc +++ b/tests/unit/tt.cc @@ -146,7 +146,7 @@ namespace tt_i_iv { template void func0(K &key, D1 &datum1, D2 &&datum2) { - abort(); + ttg::abort(); } } // namespace tt_i_iv @@ -298,6 +298,9 @@ TEST_CASE("TemplateTask", "[core]") { // OK: all of {auto&&, auto&, const auto&} bind to const T& static_assert(std::is_invocable &>::value); + static_assert(std::is_same_v &>, + void>); // OK: ditto static_assert(std::is_void_v(), std::declval(), std::declval()))>); @@ -323,14 +326,15 @@ TEST_CASE("TemplateTask", "[core]") { ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist &>>{})), - ttg::typelist<>>); + ttg::typelist, ttg::typelist<>>>); static_assert( std::is_same_v< decltype(compute_arg_binding_types( func0, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist &>>{})), - ttg::typelist &>>); + ttg::typelist, + ttg::typelist &>>>); // voids are skipped static_assert( std::is_same_v< @@ -339,7 +343,8 @@ TEST_CASE("TemplateTask", "[core]") { ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist &>, ttg::typelist>{})), - ttg::typelist &, void>>); + ttg::typelist, ttg::typelist &, void>>>); // test introspection of generic arguments by the runtime (i.e. 
contents of TT::input_args_type) and // the deduced types inside the function body @@ -425,10 +430,10 @@ TEST_CASE("TemplateTask", "[core]") { static_assert(ttg::meta::is_generic_callable_v); auto [f_is_generic, f_args_t_v] = ttg::meta::callable_args; CHECK(!f_is_generic); - static_assert(std::is_same_v>); + static_assert(std::is_same_v, ttg::typelist>>); auto [g_is_generic, g_args_t_v] = ttg::meta::callable_args; CHECK(g_is_generic); - static_assert(std::is_same_v>); + static_assert(std::is_same_v, ttg::typelist<>>>); { static_assert(!ttg::meta::is_generic_callable_v); diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 8d2213e75c..5830147dd0 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -17,6 +17,7 @@ set(ttg-util-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/future.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/hash.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/hash/std/pair.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/iovec.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/macro.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/meta.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/meta/callable.h @@ -37,14 +38,22 @@ set(ttg-base-headers file(GLOB_RECURSE ttg-external-headers $<$:CONFIGURE_DEPENDS> ${CMAKE_CURRENT_SOURCE_DIR}/ttg/external/boost/* ) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/config.in.h + ${CMAKE_CURRENT_BINARY_DIR}/ttg/config.h +) set(ttg-impl-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/broadcast.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/buffer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescope.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescratch.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/edge.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/execution.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/func.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/fwd.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/impl_selector.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/tt.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/ptr.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/reduce.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/run.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/runtimes.h @@ -53,6 +62,9 @@ set(ttg-impl-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/traverse.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/world.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/make_tt.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/make_device_tt.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/view.h + ${CMAKE_CURRENT_BINARY_DIR}/ttg/config.h ) set(ttg-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg.h @@ -68,6 +80,7 @@ set(ttg-sources ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/env.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/version.cc ) + # extract git metadata include(GetGitMetadata) vgkit_cmake_git_metadata() @@ -75,16 +88,13 @@ vgkit_cmake_git_metadata() set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/version.cc PROPERTIES COMPILE_DEFINITIONS "TTG_GIT_REVISION=\"${TTG_GIT_REVISION}\";TTG_GIT_DESCRIPTION=\"${TTG_GIT_DESCRIPTION}\"") -set(ttg-public-headers ${ttg-headers};${ttg-impl-headers};${ttg-base-headers};${ttg-util-headers}) -if (NOT TTG_IGNORE_BUNDLED_EXTERNALS) - list(APPEND ttg-sources ${ttg-external-headers}) - list(APPEND ttg-public-headers ${ttg-external-headers}) -endif() + # optional dependencies if (TARGET Boost::boost) list(APPEND ttg-deps Boost::boost) else () # if Boost::boost is missing must use bundled Boost.CallableTraits list(APPEND ttg-defs "$") + list(APPEND ttg-incs ttg/external/) endif () if (TARGET TTG_Libunwind) list(APPEND ttg-deps TTG_Libunwind) @@ -92,6 +102,23 @@ endif(TARGET TTG_Libunwind) if (TTG_ENABLE_TRACE) list(APPEND ttg-defs "TTG_ENABLE_TRACE=1") endif (TTG_ENABLE_TRACE) +if (TARGET std::coroutine) + list(APPEND ttg-deps std::coroutine) + list(APPEND ttg-defs 
"TTG_HAS_COROUTINE=1") + list(APPEND ttg-util-headers + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/coroutine.h + ) +endif () +if (TARGET CUDA::cudart) + list(APPEND ttg-deps CUDA::cudart) + list(APPEND ttg-defs "TTG_HAVE_CUDART=1") +endif (TARGET CUDA::cudart) + +set(ttg-public-headers ${ttg-headers};${ttg-impl-headers};${ttg-base-headers};${ttg-util-headers}) +if (NOT TTG_IGNORE_BUNDLED_EXTERNALS) + list(APPEND ttg-sources ${ttg-external-headers}) + list(APPEND ttg-public-headers ${ttg-external-headers}) +endif() add_ttg_library(ttg "${ttg-sources}" PUBLIC_HEADER "${ttg-public-headers}" LINK_LIBRARIES "${ttg-deps}" INCLUDE_DIRECTORIES "${ttg-incs}" COMPILE_DEFINITIONS "${ttg-defs}") @@ -184,8 +211,13 @@ endif(TARGET MADworld) ######################## if (TARGET PaRSEC::parsec) set(ttg-parsec-headers + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/buffer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/devicescratch.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/fwd.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/import.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ptr.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/task.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/thread_local.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ttg.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ttg_data_copy.h ) diff --git a/ttg/ttg.h b/ttg/ttg.h index e0fa9a7029..09ccfaf902 100644 --- a/ttg/ttg.h +++ b/ttg/ttg.h @@ -27,6 +27,11 @@ #include "ttg/edge.h" +#include "ttg/ptr.h" +#include "ttg/buffer.h" +#include "ttg/devicescratch.h" +#include "ttg/devicescope.h" + #if defined(TTG_USE_PARSEC) #include "ttg/parsec/ttg.h" #elif defined(TTG_USE_MADNESS) diff --git a/ttg/ttg/base/tt.h b/ttg/ttg/base/tt.h index e59db15e67..4fa975b40e 100644 --- a/ttg/ttg/base/tt.h +++ b/ttg/ttg/base/tt.h @@ -144,11 +144,11 @@ namespace ttg { virtual ~TTBase() = default; /// Use this to create a task that takes no data "manually" - /// @warning calls std::abort() if the derived class TT did not override this; + /// @warning calls ttg::abort() if the derived class TT did not override this; /// only makes sense to override this if the derived TT uses void for key or data virtual void invoke() { std::cerr << "TTBase::invoke() invoked on a TT that did not override it" << std::endl; - abort(); + ttg::abort(); } /// Sets trace for all operations to value and returns previous setting. diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h new file mode 100644 index 0000000000..1868b7e0c8 --- /dev/null +++ b/ttg/ttg/buffer.h @@ -0,0 +1,31 @@ +#ifndef TTG_BUFFER_H +#define TTG_BUFFER_H + +#include +#include "ttg/impl_selector.h" + + +namespace ttg { + +template +using buffer = TTG_IMPL_NS::buffer; + +namespace detail { + template + struct is_buffer : std::false_type + { }; + + template + struct is_buffer> : std::true_type + { }; + + template + constexpr bool is_buffer_v = is_buffer::value; + + static_assert(is_buffer_v>); + static_assert(is_buffer_v>); +} // namespace detail + +} // namespace ttg + +#endif // TTG_buffer_H \ No newline at end of file diff --git a/ttg/ttg/config.in.h b/ttg/ttg/config.in.h new file mode 100644 index 0000000000..8f208c6115 --- /dev/null +++ b/ttg/ttg/config.in.h @@ -0,0 +1,16 @@ +// +// Created by Eduard Valeyev on 10/31/22. 
+// + +#ifndef TTG_CONFIG_IN_H +#define TTG_CONFIG_IN_H + +/** the C++ header containing the coroutine API */ +#define TTG_CXX_COROUTINE_HEADER <@CXX_COROUTINE_HEADER@> + +/** the C++ namespace containing the coroutine API */ +#define TTG_CXX_COROUTINE_NAMESPACE @CXX_COROUTINE_NAMESPACE@ + +#cmakedefine TTG_HAVE_CUDA + +#endif // TTG_CONFIG_IN_H diff --git a/ttg/ttg/device.h b/ttg/ttg/device.h new file mode 100644 index 0000000000..d1338bd222 --- /dev/null +++ b/ttg/ttg/device.h @@ -0,0 +1,14 @@ +#ifndef TTG_DEVICE_H +#define TTG_DEVICE_H + +#include "ttg/fwd.h" +#include "ttg/execution.h" + +namespace ttg { + namespace device { + using DeviceAllocator = TTG_IMPL_NS::device::DeviceAllocator; + std::size_t nb_devices() { return TTG_IMPL_NS::device::nb_devices(); } + } +} + +#endif /* TTG_DEVICE_H */ \ No newline at end of file diff --git a/ttg/ttg/device/send.h b/ttg/ttg/device/send.h new file mode 100644 index 0000000000..d125c87f2c --- /dev/null +++ b/ttg/ttg/device/send.h @@ -0,0 +1,26 @@ +#ifndef TTG_DEVICE_SEND_H +#define TTG_DEVICE_SEND_H + + +namespace ttg { + + namespace detail { + + /* structure holding references to the key and value, awaitable */ + template + struct await_send { + const Key& m_key; + Value& m_value; + + await_send(const Key& key, Value& val) + : m_key(key) + , m_value(val) + { } + }; + + } // namespace ttg + +} // namespace ttg + + +#endif // TTG_DEVICE_SEND_H \ No newline at end of file diff --git a/ttg/ttg/devicescope.h b/ttg/ttg/devicescope.h new file mode 100644 index 0000000000..594e6db0b3 --- /dev/null +++ b/ttg/ttg/devicescope.h @@ -0,0 +1,11 @@ +#ifndef TTG_DEVICESCOPE_H +#define TTG_DEVICESCOPE_H + +namespace ttg { + enum class scope { + Allocate = 0x0, //< memory allocated as scratch, but not moved in or out + SyncIn = 0x2, //< memory allocated as scratch and data transferred to device + }; +} // namespace ttg + +#endif // TTG_DEVICESCOPE_H \ No newline at end of file diff --git a/ttg/ttg/devicescratch.h b/ttg/ttg/devicescratch.h new file mode 100644 index 0000000000..37510436fd --- /dev/null +++ b/ttg/ttg/devicescratch.h @@ -0,0 +1,34 @@ +#ifndef TTG_DEVICESCRATCH_H +#define TTG_DEVICESCRATCH_H + +#include "ttg/devicescope.h" +#include "ttg/impl_selector.h" + +namespace ttg { + +template +using devicescratch = TTG_IMPL_NS::devicescratch; + +template +auto make_scratch(T* val, ttg::scope scope, std::size_t count = 1) { + return devicescratch(val, scope, 1); +} + +namespace detail { + + template + struct is_devicescratch : std::false_type + { }; + + template + struct is_devicescratch> : std::true_type + { }; + + template + constexpr bool is_devicescratch_v = is_devicescratch::value; + +} // namespace detail + +} // namespace ttg + +#endif // TTG_DEVICESCRATCH_H \ No newline at end of file diff --git a/ttg/ttg/func.h b/ttg/ttg/func.h index 61615e6314..f647734dd1 100644 --- a/ttg/ttg/func.h +++ b/ttg/ttg/func.h @@ -515,7 +515,7 @@ namespace ttg { template inline void set_size(const std::size_t size) { - set_size(size); + set_size(i, size); } /// \brief Finalize streaming input terminals connecting to the given output terminal for tasks @@ -570,7 +570,7 @@ namespace ttg { template inline void finalize() { - finalize(); + finalize(i); } } // namespace ttg diff --git a/ttg/ttg/fwd.h b/ttg/ttg/fwd.h index df32505d04..f9b8d1c0f8 100644 --- a/ttg/ttg/fwd.h +++ b/ttg/ttg/fwd.h @@ -47,6 +47,7 @@ namespace ttg { template void initialize(int argc, char **argv, int num_threads = -1, RestOfArgs &&...); void finalize(); + [[noreturn]] void abort(); World 
default_execution_context(); void execute(ttg::World world); diff --git a/ttg/ttg/madness/fwd.h b/ttg/ttg/madness/fwd.h index abcb771c54..469a251d12 100644 --- a/ttg/ttg/madness/fwd.h +++ b/ttg/ttg/madness/fwd.h @@ -45,6 +45,11 @@ namespace ttg_madness { template inline void ttg_broadcast(ttg::World world, T &data, int source_rank); + namespace device { + class DeviceAllocator; + std::size_t nb_devices(); + } + } // namespace ttg_madness #endif // TTG_MADNESS_FWD_H diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h index 58373d5edd..6cd484d4a1 100644 --- a/ttg/ttg/madness/ttg.h +++ b/ttg/ttg/madness/ttg.h @@ -22,7 +22,11 @@ #include "ttg/util/meta.h" #include "ttg/util/meta/callable.h" #include "ttg/util/void.h" +#include "ttg/view.h" #include "ttg/world.h" +#ifdef TTG_HAS_COROUTINE +#include "ttg/util/coroutine.h" +#endif #include #include @@ -273,6 +277,9 @@ namespace ttg_madness { derivedT *derived; // Pointer to derived class instance bool pull_terminals_invoked = false; std::conditional_t, ttg::Void, keyT> key; // Task key +#ifdef TTG_HAS_COROUTINE + void *suspended_task_address = nullptr; // if not null the function is suspended +#endif /// makes a tuple of references out of tuple of template @@ -297,28 +304,83 @@ namespace ttg_madness { } virtual void run(::madness::World &world) override { - // ttg::print("starting task"); - using ttg::hash; ttT::threaddata.key_hash = hash{}(key); ttT::threaddata.call_depth++; - if constexpr (!ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { - derived->op(key, this->make_input_refs(), - derived->output_terminals); // !!! NOTE converting input values to refs - } else if constexpr (!ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { - derived->op(key, derived->output_terminals); - } else if constexpr (ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { - derived->op(this->make_input_refs(), - derived->output_terminals); // !!! NOTE converting input values to refs - } else if constexpr (ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { - derived->op(derived->output_terminals); - } else - abort(); + void *suspended_task_address = +#ifdef TTG_HAS_COROUTINE + this->suspended_task_address; // non-null = need to resume the task +#else + nullptr; +#endif + if (suspended_task_address == nullptr) { // task is a coroutine that has not started or an ordinary function + // ttg::print("starting task"); + if constexpr (!ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN( + suspended_task_address, + derived->op(key, this->make_input_refs(), + derived->output_terminals)); // !!! NOTE converting input values to refs + } else if constexpr (!ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN(suspended_task_address, derived->op(key, derived->output_terminals)); + } else if constexpr (ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN( + suspended_task_address, + derived->op(this->make_input_refs(), + derived->output_terminals)); // !!! 
NOTE converting input values to refs + } else if constexpr (ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN(suspended_task_address, derived->op(derived->output_terminals)); + } else // unreachable + ttg::abort(); + } else { // resume suspended coroutine +#ifdef TTG_HAS_COROUTINE + auto ret = static_cast(ttg::coroutine_handle<>::from_address(suspended_task_address)); + assert(ret.ready()); + ret.resume(); + if (ret.completed()) { + ret.destroy(); + suspended_task_address = nullptr; + } else { // not yet completed + // leave suspended_task_address as is + } + this->suspended_task_address = suspended_task_address; +#else + ttg::abort(); // should not happen +#endif + } ttT::threaddata.call_depth--; - // ttg::print("finishing task",ttT::threaddata.call_depth); + // if (suspended_task_address == nullptr) { + // ttg::print("finishing task",ttT::threaddata.call_depth); + // } + +#ifdef TTG_HAS_COROUTINE + if (suspended_task_address) { + // TODO implement handling of suspended coroutines properly + + // right now can events are not properly implemented, we are only testing the workflow with dummy events + // so mark the events finished manually, parsec will rerun this task again and it should complete the second + // time + auto events = + static_cast(ttg::coroutine_handle<>::from_address(suspended_task_address)).events(); + for (auto &event_ptr : events) { + event_ptr->finish(); + } + assert(ttg::coroutine_handle<>::from_address(suspended_task_address).promise().ready()); + + // resume the coroutine + auto ret = static_cast(ttg::coroutine_handle<>::from_address(suspended_task_address)); + assert(ret.ready()); + ret.resume(); + if (ret.completed()) { + ret.destroy(); + suspended_task_address = nullptr; + } else { // not yet completed + ttg::abort(); + } + } +#endif // TTG_HAS_COROUTINE } virtual ~TTArgs() {} // Will be deleted via TaskInterface* @@ -564,7 +626,7 @@ namespace ttg_madness { } else if constexpr (ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { static_cast(this)->op(output_terminals); // Runs immediately } else - abort(); + ttg::abort(); ttT::threaddata.call_depth--; } else { @@ -944,7 +1006,7 @@ namespace ttg_madness { auto finalize_callback = [this]() { finalize_argstream(); }; input.set_callback(send_callback, send_callback, {}, setsize_callback, finalize_callback); } else - abort(); + ttg::abort(); } template @@ -1070,7 +1132,7 @@ namespace ttg_madness { for (std::size_t i = 0; i < numins; i++) std::cerr << (item.second->nargs[i] == 0 ? 
"T" : "F") << " "; std::cerr << ")" << std::endl; } - abort(); + ttg::abort(); } } @@ -1215,6 +1277,28 @@ namespace ttg_madness { #include "ttg/make_tt.h" + namespace device { + class DeviceAllocator { + public: + DeviceAllocator(int did) { + if(did != 0) { + throw std::out_of_range("TTG MADNESS Backend: current implementation only supports CPU devices") + } + } + void *allocate(std::size_t size) { + return ::malloc(size); + }; + void free(void *ptr) { + ::free(ptr); + } + ::ttg::ExecutionSpace executionSpace() { + return ::ttg::ExecutionSpace::Host; + } + }; + + std::size_t nb_devices() { return 1; } + } + } // namespace ttg_madness #include "ttg/madness/watch.h" diff --git a/ttg/ttg/make_device_tt.h b/ttg/ttg/make_device_tt.h new file mode 100644 index 0000000000..09f096d838 --- /dev/null +++ b/ttg/ttg/make_device_tt.h @@ -0,0 +1,281 @@ +// clang-format off +#ifndef TTG_MAKE_DEVICE_TT_H +#define TTG_MAKE_DEVICE_TT_H + +// to be #include'd within runtime::ttg namespace + + +namespace detail { +#ifdef TTG_HAVE_CUDA + inline thread_local cudaStream_t* ts_stream = nullptr; +#endif // TTG_HAVE_CUDA + + template + inline void invoke_with_unpacked_views(FuncT&& func, const keyT& key, std::tuple& views, std::index_sequence) { +#ifdef TTG_HAVE_CUDA + func(key, std::get(views)..., ts_stream); +#else // TTG_HAVE_CUDA + func(key, std::get(views)...); +#endif // TTG_HAVE_CUDA + } + + /* TODO: extract host objects from views */ + template + struct host_obj_type; + + template + struct host_obj_type> { + using type = std::tuple; + }; + + template + using host_obj_type_t = typename host_obj_type::type; + + template + inline void invoke_out_with_unpacked_views(FuncT&& func, const keyT& key, std::tuple views, std::index_sequence) { + func(key, std::get(views).get_host_object()...); + } + + template + inline void create_view_on_device(const ttg::View& view, + std::tuple...>& dev_spans, + std::index_sequence) { + + /* fill in pointers for the device -- we're relying on managed memory for this simple wrapper */ + typename std::tuple_element_t::span_tuple_type>::element_type *ptr; + size_t size; + ptr = view.template get_device_ptr(); + size = view.template get_device_size(); + //cudaMalloc(&ptr, span.size_bytes()); + std::get(dev_spans) = ttg::ViewSpan(ptr, size, view.template get_scope()); + + /* copy data to device */ + //cudaMemcpy(ptr, span.data(), span.size_bytes(), cudaMemcpyHostToDevice); + if (view.template get_span().is_sync_in()) { +#if defined(TTG_HAVE_CUDA) && defined(TTG_USE_CUDA_PREFETCH) + cudaMemPrefetchAsync(span.data(), span.size_bytes(), 0, *ts_stream); +#endif // TTG_USE_CUDA_PREFETCH + } + + if constexpr(sizeof...(Is) > 0) { + create_view_on_device(view, dev_spans, std::index_sequence{}); + } + } + + template + auto make_view_from_tuple(HostT& obj, std::tuple...>& spans, std::index_sequence) { + return ttg::make_view(obj, std::get(spans)...); + } + + template + inline void create_on_device(std::tuple& views, std::tuple& dev_views, std::index_sequence) { + + using view_tuple_t = typename std::tuple; + auto& view = std::get(views); + typename std::tuple_element_t::span_tuple_type dev_spans; + create_view_on_device(view, dev_spans, std::make_index_sequence::size()>()); + + /* set the view for the device */ + std::get(dev_views) = make_view_from_tuple(view.get_host_object(), dev_spans, std::make_index_sequence>{}); + if constexpr(sizeof...(Is) > 0) { + create_on_device(views, dev_views, std::index_sequence{}); + } + } + + template + inline void sync_view_to_host(ttg::View& dev_view, 
std::index_sequence) { + /* prefetch back to host */ + auto span = dev_view.template get_span(); + + /* prefetch data from device */ + if (span.is_sync_out()) { +#if defined(TTG_HAVE_CUDA) && defined(TTG_USE_CUDA_PREFETCH) + cudaMemPrefetchAsync(span.data(), span.size_bytes(), cudaCpuDeviceId, *ts_stream); +#endif // TTG_USE_CUDA_PREFETCH + } + + if constexpr(sizeof...(Is) > 0) { + sync_view_to_host(dev_view, std::index_sequence{}); + } + } + + template + inline void sync_back_to_host(std::tuple& dev_views, std::index_sequence) { + + sync_view_to_host(std::get(dev_views), std::make_index_sequence>::size()>()); + + if constexpr(sizeof...(Is) > 0) { + sync_back_to_host(dev_views, std::index_sequence{}); + } + } + + template + auto make_device_tt_helper(DevViewFuncT &&view_func, + DevKernelFuncT &&kernel_func, + DevOutFuncT &&out_func, + ttg::ExecutionSpace space, + const std::tuple...> &inedges, + const std::tuple &outedges, + const std::string &name, + const std::vector &innames, + const std::vector &outnames, + const ttg::typelist& full_input_args) { + + using output_terminals_type = typename ttg::edges_to_output_terminals>::type; + + auto taskfn = [=](const keyT& key, Args... args) mutable { + +#ifdef TTG_HAVE_CUDA + if (nullptr == ts_stream) { + ts_stream = new cudaStream_t(); + cudaStreamCreate(ts_stream); + } +#endif // TTG_HAVE_CUDA + + auto views = view_func(key, std::forward(args)...); + using view_tuple_t = std::remove_reference_t; + constexpr std::size_t view_tuple_size = std::tuple_size_v; + /* 1) allocate memory on device */ + auto device_views = views; + /* 2) move data from views to device */ + if constexpr(std::tuple_size_v > 0) { + create_on_device(views, device_views, std::make_index_sequence()); + } + /* 3) call kernel function */ + detail::invoke_with_unpacked_views(kernel_func, key, device_views, std::make_index_sequence()); + /* 4) move data back out into host objects */ + if constexpr(std::tuple_size_v > 0) { + sync_back_to_host(device_views, std::make_index_sequence()); + } + #ifdef TTG_HAVE_CUDA + /* wait for the kernel and transfers to complete */ + cudaStreamSynchronize(*ts_stream); + #endif // TTG_HAVE_CUDA + /* 5) call output function */ + detail::invoke_out_with_unpacked_views(out_func, key, views, std::make_index_sequence()); + }; + + using wrapT = typename CallableWrapTTArgsAsTypelist>::type; + + return std::make_unique(std::move(taskfn), inedges, outedges, name, innames, outnames); + + } + + +} // namespace detail + + +template +auto make_device_tt(DevViewFuncT &&view_func, + DevKernelFuncT &&kernel_func, + DevOutFuncT &&out_func, + ttg::ExecutionSpace space, + const std::tuple...> &inedges, + const std::tuple &outedges, const std::string &name = "wrapper", + const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), + const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { + + using output_terminals_type = typename ttg::edges_to_output_terminals>::type; + + constexpr auto void_key = ttg::meta::is_void_v; + + // gross list of candidate argument types + using gross_candidate_func_args_t = ttg::meta::typelist< + ttg::meta::candidate_argument_bindings_t>, + ttg::meta::candidate_argument_bindings_t::value_type>..., + ttg::meta::typelist>; + + // net list of candidate argument types excludes the empty typelists for void arguments + using candidate_func_args_t = ttg::meta::filter_t; + + // gross argument typelist for invoking func, can include void for optional args + constexpr static auto func_is_generic = ttg::meta::is_generic_callable_v; + using
gross_func_args_t = decltype(ttg::meta::compute_arg_binding_types_r(view_func, candidate_func_args_t{})); + constexpr auto DETECTED_HOW_TO_INVOKE_GENERIC_FUNC = + func_is_generic ? !std::is_same_v> : true; + static_assert(DETECTED_HOW_TO_INVOKE_GENERIC_FUNC, + "ttg::make_tt(func, inedges, ...): could not detect how to invoke generic callable func, either the " + "signature of func " + "is faulty, or inedges does not match the expected list of types, or both"); + + // net argument typelist + using func_args_t = ttg::meta::drop_void_t; + constexpr auto num_args = std::tuple_size_v; + + // if given task id, make sure it's passed via const lvalue ref + constexpr bool TASK_ID_PASSED_AS_CONST_LVALUE_REF = + !void_key ? ttg::meta::probe_first_v : true; + static_assert(TASK_ID_PASSED_AS_CONST_LVALUE_REF, + "ttg::make_tt(func, ...): if given to func, the task id must be passed by const lvalue ref"); + + // if given out-terminal tuple, make sure it's passed via nonconst lvalue ref + constexpr bool have_outterm_tuple = + func_is_generic ? !ttg::meta::is_last_void_v + : ttg::meta::probe_last_v; + constexpr bool OUTTERM_TUPLE_PASSED_AS_NONCONST_LVALUE_REF = + have_outterm_tuple ? ttg::meta::probe_last_v : true; + static_assert( + OUTTERM_TUPLE_PASSED_AS_NONCONST_LVALUE_REF, + "ttg::make_tt(func, ...): if given to func, the output terminal tuple must be passed by nonconst lvalue ref"); + + // TT needs actual types of arguments to func ... extract them and pass to CallableWrapTTArgs + using input_edge_value_types = ttg::meta::typelist...>; + // input_args_t = {input_valuesT&&...} + using input_args_t = typename ttg::meta::take_first_n< + typename ttg::meta::drop_first_n::type, + std::tuple_size_v - (void_key ? 0 : 1) - (have_outterm_tuple ? 1 : 0)>::type; + constexpr auto NO_ARGUMENTS_PASSED_AS_NONCONST_LVALUE_REF = + !ttg::meta::is_any_nonconst_lvalue_reference_v; + static_assert( + NO_ARGUMENTS_PASSED_AS_NONCONST_LVALUE_REF, + "ttg::make_tt(func, inedges, outedges): one or more arguments to func can only be passed by nonconst lvalue " + "ref; this is illegal, should only pass arguments as const lvalue ref or (nonconst) rvalue ref"); + using decayed_input_args_t = ttg::meta::decayed_typelist_t; + // 3.
full_input_args_t = edge-types with non-void types replaced by input_args_t + using full_input_args_t = ttg::meta::replace_nonvoid_t; + + return detail::make_device_tt_helper(std::forward(view_func), + std::forward(kernel_func), + std::forward(out_func), + space, inedges, outedges, name, innames, outnames, + full_input_args_t{}); +} + +#if 0 +template +auto make_device_tt(HostFuncT &&host_func, + DevViewFuncT &&view_func, + DevKernelFuncT &&kernel_func, + DevOutFuncT &&out_func, + ttg::ExecutionSpace space, + const std::tuple...> &inedges, + const std::tuple &outedges, const std::string &name = "wrapper", + const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), + const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { + + /* drop the host function */ + return make_device_tt(view_func, kernel_func, out_func, space, inedges, outedges, name, innames, outnames); +} +#endif // 0 +#endif // TTG_MAKE_DEVICE_TT_H + +// clang-format on diff --git a/ttg/ttg/make_tt.h b/ttg/ttg/make_tt.h index 79da690df4..6ae210cef6 100644 --- a/ttg/ttg/make_tt.h +++ b/ttg/ttg/make_tt.h @@ -8,7 +8,8 @@ // case 1 (keyT != void): void op(auto&& key, std::tuple&&, std::tuple&) // case 2 (keyT == void): void op(std::tuple&&, std::tuple&) // -template class CallableWrapTT : public TT struct CallableWrapTTUnwrapTypelist; -template -struct CallableWrapTTUnwrapTypelist> { - using type = CallableWrapTT...>; }; @@ -122,15 +127,17 @@ struct CallableWrapTTUnwrapTypelist&) -// case 2 (keyT == void): void op(input_valuesT&&..., std::tuple&) +// case 1 (keyT != void): returnT op(auto&& key, input_valuesT&&..., std::tuple&) +// case 2 (keyT == void): returnT op(input_valuesT&&..., std::tuple&) // -template +// returnT is void for funcT = synchronous (ordinary) function and the appropriate return type for funcT=coroutine +template class CallableWrapTTArgs - : public TT, - ttg::typelist> { + : public TT< + keyT, output_terminalsT, + CallableWrapTTArgs, + ttg::typelist> { using baseT = typename CallableWrapTTArgs::ttT; using input_values_tuple_type = typename baseT::input_values_tuple_type; @@ -141,18 +148,93 @@ class CallableWrapTTArgs using noref_funcT = std::remove_reference_t; std::conditional_t, std::add_pointer_t, noref_funcT> func; + using op_return_type = +#ifdef TTG_HAS_COROUTINE + std::conditional_t, + ttg::coroutine_handle<>, + std::conditional_t, + ttg::device_task::base_type, + void>>; +#else // TTG_HAS_COROUTINE + void; +#endif // TTG_HAS_COROUTINE + +public: + static constexpr bool have_cuda_op = (space == ttg::ExecutionSpace::CUDA); + +protected: + + /// @return coroutine handle<> (if funcT is a coroutine), else void template - void call_func(Key &&key, Tuple &&args_tuple, output_terminalsT &out, std::index_sequence) { + auto call_func(Key &&key, Tuple &&args_tuple, output_terminalsT &out, std::index_sequence) { using func_args_t = ttg::meta::tuple_concat_t, input_refs_tuple_type, output_edges_type>; - if constexpr (funcT_receives_outterm_tuple) - func(std::forward(key), - baseT::template get>(std::forward(args_tuple))..., out); - else { + + auto process_return = [&out](auto &&ret) { + static_assert(std::is_same_v, returnT>, + "CallableWrapTTArgs: returnT does not match the actual return type of funcT"); + if constexpr (!std::is_void_v) { // protect from compiling for void returnT +#ifdef TTG_HAS_COROUTINE + if constexpr (std::is_same_v) { + ttg::coroutine_handle<> coro_handle; + // if task completed destroy it + if (ret.completed()) { + ret.destroy(); + } else { // if 
task is suspended return the coroutine promise ptr + coro_handle = ret; + } + return coro_handle; + } else if constexpr (std::is_same_v) { + ttg::device_task::base_type coro_handle = ret; + return coro_handle; + } + if constexpr (!(std::is_same_v || std::is_same_v)) +#endif + { + static_assert(std::tuple_size_v> == 1, + "CallableWrapTTArgs <= 2, + "CallableWrapTTArgs == 0) + std::get<0>(out).sendv(std::move(ret)); + else if constexpr (std::tuple_size_v == 1) + std::get<0>(out).sendk(std::move(std::get<0>(ret))); + else if constexpr (std::tuple_size_v == 2) + std::get<0>(out).send(std::move(std::get<0>(ret)), std::move(std::get<1>(ret))); + return; + } + } + }; + + if constexpr (funcT_receives_outterm_tuple) { + if constexpr (std::is_void_v) { + func(std::forward(key), + baseT::template get>(std::forward(args_tuple))..., out); + return; + } else { + auto ret = func( + std::forward(key), + baseT::template get>(std::forward(args_tuple))..., out); + + return process_return(std::move(ret)); + } + } else { auto old_output_tls_ptr = this->outputs_tls_ptr_accessor(); this->set_outputs_tls_ptr(); - func(std::forward(key), - baseT::template get>(std::forward(args_tuple))...); - this->set_outputs_tls_ptr(old_output_tls_ptr); + if constexpr (std::is_void_v) { + func(std::forward(key), + baseT::template get>(std::forward(args_tuple))...); + this->set_outputs_tls_ptr(old_output_tls_ptr); + return; + } else { + auto ret = + func(std::forward(key), + baseT::template get>(std::forward(args_tuple))...); + this->set_outputs_tls_ptr(old_output_tls_ptr); + return process_return(std::move(ret)); + } } } @@ -214,54 +296,55 @@ class CallableWrapTTArgs template std::enable_if_t && !ttg::meta::is_empty_tuple_v && !ttg::meta::is_void_v, - void> + op_return_type> op(Key &&key, ArgsTuple &&args_tuple, output_terminalsT &out) { assert(&out == &baseT::get_output_terminals()); - call_func(std::forward(key), std::forward(args_tuple), out, - std::make_index_sequence>{}); + return call_func(std::forward(key), std::forward(args_tuple), out, + std::make_index_sequence>{}); }; template std::enable_if_t && !ttg::meta::is_empty_tuple_v && ttg::meta::is_void_v, - void> + op_return_type> op(ArgsTuple &&args_tuple, output_terminalsT &out) { assert(&out == &baseT::get_output_terminals()); - call_func(std::forward(args_tuple), out, std::make_index_sequence>{}); + return call_func(std::forward(args_tuple), out, + std::make_index_sequence>{}); }; template - std::enable_if_t && !ttg::meta::is_void_v, void> op( + std::enable_if_t && !ttg::meta::is_void_v, op_return_type> op( Key &&key, output_terminalsT &out) { assert(&out == &baseT::get_output_terminals()); - call_func(std::forward(key), out); + return call_func(std::forward(key), out); }; template - std::enable_if_t && ttg::meta::is_void_v, void> op( + std::enable_if_t && ttg::meta::is_void_v, op_return_type> op( output_terminalsT &out) { assert(&out == &baseT::get_output_terminals()); - call_func(out); + return call_func(out); }; }; -template +template struct CallableWrapTTArgsAsTypelist; -template -struct CallableWrapTTArgsAsTypelist +struct CallableWrapTTArgsAsTypelist> { - using type = CallableWrapTTArgs...>; }; -template -struct CallableWrapTTArgsAsTypelist +struct CallableWrapTTArgsAsTypelist> { - using type = CallableWrapTTArgs...>; }; @@ -418,7 +501,9 @@ auto make_tt_tpl(funcT &&func, const std::tuple +template auto make_tt(funcT &&func, const std::tuple...> &inedges = std::tuple<>{}, const std::tuple &outedges = std::tuple<>{}, const std::string &name = "wrapper", const 
std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), @@ -444,7 +529,10 @@ auto make_tt(funcT &&func, const std::tuple. // gross argument typelist for invoking func, can include void for optional args constexpr static auto func_is_generic = ttg::meta::is_generic_callable_v; - using gross_func_args_t = decltype(ttg::meta::compute_arg_binding_types_r(func, candidate_func_args_t{})); + using return_type_typelist_and_gross_func_args_t = + decltype(ttg::meta::compute_arg_binding_types(func, candidate_func_args_t{})); + using func_return_t = std::tuple_element_t<0, std::tuple_element_t<0, return_type_typelist_and_gross_func_args_t>>; + using gross_func_args_t = std::tuple_element_t<1, return_type_typelist_and_gross_func_args_t>; constexpr auto DETECTED_HOW_TO_INVOKE_GENERIC_FUNC = func_is_generic ? !std::is_same_v> : true; static_assert(DETECTED_HOW_TO_INVOKE_GENERIC_FUNC, @@ -487,12 +575,21 @@ auto make_tt(funcT &&func, const std::tuple. using decayed_input_args_t = ttg::meta::decayed_typelist_t; // 3. full_input_args_t = edge-types with non-void types replaced by input_args_t using full_input_args_t = ttg::meta::replace_nonvoid_t; - using wrapT = typename CallableWrapTTArgsAsTypelist::type; + using wrapT = typename CallableWrapTTArgsAsTypelist::type; return std::make_unique(std::forward(func), inedges, outedges, name, innames, outnames); } +template +auto make_tt(funcT &&func, const std::tuple...> &inedges = std::tuple<>{}, + const std::tuple &outedges = std::tuple<>{}, const std::string &name = "wrapper", + const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), + const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { + return make_tt(std::forward(func), inedges, outedges, name, innames, outnames); +} + template [[deprecated("use make_tt_tpl instead")]] inline auto wrapt( funcT &&func, const std::tuple...> &inedges, @@ -511,4 +608,6 @@ template (std::forward(func), inedges, outedges, name, innames, outnames); } +//#include "ttg/make_device_tt.h" + #endif // TTG_MAKE_TT_H diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h new file mode 100644 index 0000000000..8a293738ec --- /dev/null +++ b/ttg/ttg/parsec/buffer.h @@ -0,0 +1,417 @@ +#ifndef TTG_PARSEC_BUFFER_H +#define TTG_PARSEC_BUFFER_H + +// TODO: replace with short vector +#define TTG_PARSEC_MAX_NUM_DEVICES 4 + +#include +#include +#include +#include +#include +#include "ttg/parsec/ttg_data_copy.h" +#include "ttg/util/iovec.h" + +namespace ttg_parsec { + + +namespace detail { + // fwd decl + template + parsec_data_t* get_parsec_data(const ttg_parsec::buffer& db); +} // namespace detail + +/** + * A buffer that is mirrored between host memory + * and different devices. The runtime is free to + * move data between device and host memory based + * on where the tasks are executing. + * + * Note that a buffer is movable and should not + * be shared between two objects (e.g., through a pointer) + * in order for TTG to properly facilitate ownership + * tracking of the containing object. 
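+ * + * A minimal usage sketch (illustrative only; it uses just the members declared below): + * + *   ttg_parsec::buffer<double> buf(1024);      // owning host allocation of 1024 doubles + *   double* host = buf.host_ptr();             // the host copy (device 0, see cpu_device) + *   double* dev  = buf.current_device_ptr();   // copy on the last-updated device + *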
+ */ +template +struct buffer { + + using element_type = std::decay_t; + + static_assert(std::is_trivially_copyable_v, + "Only trivially copyable types are supported for devices."); + static_assert(std::is_default_constructible_v, + "Only default constructible types are supported for devices."); + +private: + using delete_fn_t = std::add_pointer_t; + + using parsec_data_ptr = std::unique_ptr; + using host_data_ptr = std::unique_ptr; + parsec_data_ptr m_data; + host_data_ptr m_host_data; + std::size_t m_count = 0; + detail::ttg_data_copy_t *m_ttg_copy = nullptr; + + static void delete_owned(element_type *ptr) { + delete[] ptr; + } + + static void delete_non_owned(element_type *ptr) { + // nothing to be done, we don't own the memory + } + + static void delete_parsec_data(parsec_data_t *data) { + std::cout << "delete parsec_data " << data << std::endl; + parsec_data_destroy(data); + } + + static void delete_null_parsec_data(parsec_data_t *) { + // nothing to be done, only used for nullptr + } + + void create_host_copy() { + /* create a new copy for the host object */ + parsec_data_copy_t* copy; + copy = parsec_data_copy_new(m_data.get(), 0, parsec_datatype_int8_t, PARSEC_DATA_FLAG_PARSEC_MANAGED); + copy->device_private = m_host_data.get(); + copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + copy->version = 1; // this version is valid + m_data->nb_elts = sizeof(element_type)*m_count; + m_data->owner_device = 0; + /* register the new data with the host copy */ + if (nullptr != m_ttg_copy) { + m_ttg_copy->add_device_data(m_data.get()); + } + } + + void reset() { + if (m_data) { + if (nullptr != m_ttg_copy) { + m_ttg_copy->remove_device_data(m_data.get()); + } + m_data.reset(); + m_count = 0; + } + } + + friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::buffer&); + +public: + + /* The device ID of the CPU. */ + static constexpr int cpu_device = 0; + + buffer() : buffer(1) + { } + + buffer(std::size_t count) + : m_data(parsec_data_new(), &delete_parsec_data) + , m_host_data(new element_type[count](), &delete_owned) + , m_count(count) + , m_ttg_copy(detail::ttg_data_copy_container()) + { + create_host_copy(); + } + + /* Constructing a buffer using application-managed memory. + * The memory pointed to by ptr must be accessible during + * the life-time of the buffer. */ + buffer(element_type* ptr, std::size_t count = 1) + : m_data(parsec_data_new(), &parsec_data_destroy) + , m_host_data(ptr, &delete_non_owned) + , m_count(count) + , m_ttg_copy(detail::ttg_data_copy_container()) + { + create_host_copy(); + } + + ~buffer() { + unpin(); // make sure the copies are not pinned + /* remove the tracked copy */ + if (nullptr != m_ttg_copy && m_data) { + m_ttg_copy->remove_device_data(m_data.get()); + } + } + + /* allow moving device buffers */ + buffer(buffer&& db) + : m_data(std::move(db.m_data)) + , m_host_data(std::move(db.m_host_data)) + , m_count(db.m_count) + , m_ttg_copy(db.m_ttg_copy) + { + db.m_count = 0; + + if (nullptr == m_ttg_copy && nullptr != detail::ttg_data_copy_container()) { + m_ttg_copy = detail::ttg_data_copy_container(); + /* register with the new ttg_copy */ + m_ttg_copy->add_device_data(m_data.get()); + } + } + + /* explicitly disable copying of buffers + * TODO: should we allow this? What data to use? + */ + buffer(const buffer& db) = delete; +#if 0 + /* copy the host data but leave the devices untouched */ + buffer(const buffer& db) + : m_data(db.m_count ? parsec_data_new() : nullptr, + db.m_count ? 
&parsec_data_destroy : &delete_null_parsec_data) + , m_host_data(db.m_count ? new element_type[db.m_count] : nullptr, + db.m_count ? &delete_owned : delete_non_owned) + , m_count(db.m_count) + , m_ttg_copy(detail::ttg_data_copy_container()) + { + /* copy host data */ + std::copy(db.m_host_data.get(), + db.m_host_data.get() + m_count, + m_host_data.get()); + /* create the host copy with the allocated memory */ + create_host_copy(); + } +#endif // 0 + + /* allow moving device buffers */ + buffer& operator=(buffer&& db) { + m_data = std::move(db.m_data); + m_host_data = std::move(db.m_host_data); + m_count = db.m_count; + db.m_count = 0; + /* don't update the ttg_copy, we keep the connection */ + } + + /* explicitly disable copying of buffers + * TODO: should we allow this? What data to use? + */ + buffer& operator=(const buffer& db) = delete; + +#if 0 + /* copy the host buffer content but leave the devices untouched */ + buffer& operator=(const buffer& db) { + if (db.m_count == 0) { + m_data = parsec_data_ptr(nullptr, &delete_null_parsec_data); + m_host_data = host_data_ptr(nullptr, &delete_non_owned); + } else { + m_data = parsec_data_ptr(parsec_data_new(), &parsec_data_destroy); + m_host_data = host_data_ptr(new element_type[db.m_count], &delete_owned); + /* copy host data */ + std::copy(db.m_host_data.get(), + db.m_host_data.get() + db.m_count, + m_host_data.get()); + /* create the host copy with the allocated memory */ + create_host_copy(); + } + m_count = db.m_count; + } +#endif // 0 + + /* set the current device, useful when a device + * buffer was modified outside of a TTG */ + void set_current_device(int device_id) { + assert(is_valid()); + /* make sure it's a valid device */ + assert(parsec_nb_devices > device_id); + /* make sure it's a valid copy */ + assert(m_data->device_copies[device_id] != nullptr); + m_data->owner_device = device_id; + } + + /* get the current device ID, i.e., the last updated + * device buffer. */ + int get_current_device() const { + assert(is_valid()); + return m_data->owner_device; + } + + /* get the current device pointer */ + element_type* current_device_ptr() { + assert(is_valid()); + return static_cast(m_data->device_copies[m_data->owner_device]->device_private); + } + + /* get the current device pointer */ + const element_type* current_device_ptr() const { + assert(is_valid()); + return static_cast(m_data->device_copies[m_data->owner_device]->device_private); + } + + /* get the device pointer at the given device + * \sa cpu_device + */ + element_type* device_ptr_on(int device_id) { + assert(is_valid()); + return static_cast(parsec_data_get_ptr(m_data.get(), device_id)); + } + + /* get the device pointer at the given device + * \sa cpu_device + */ + const element_type* device_ptr_on(int device_id) const { + assert(is_valid()); + return static_cast(parsec_data_get_ptr(m_data.get(), device_id)); + } + + element_type* host_ptr() { + return device_ptr_on(cpu_device); + } + + const element_type* host_ptr() const { + return device_ptr_on(cpu_device); + } + + bool is_valid_on(int device_id) const { + assert(is_valid()); + return (parsec_data_get_ptr(m_data.get(), device_id) != nullptr); + } + + void allocate_on(int device_id) { + /* TODO: need exposed PaRSEC memory allocator */ + } + + /* TODO: can we do this automatically? + * Pin the memory on all devices we currently track. + * Pinned memory won't be released by PaRSEC and can be used + * at any time. 
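+ * (The loops below start at device 1 because device 0 is the host copy -- see + * cpu_device above -- which never needs pinning.)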
+ */ + void pin() { + for (int i = 1; i < parsec_nb_devices; ++i) { + pin_on(i); + } + } + + /* Unpin the memory on all devices we currently track. */ + void unpin() { + if (!is_valid()) return; + for (int i = 1; i < parsec_nb_devices; ++i) { + unpin_on(i); + } + } + + /* Pin the memory on a given device */ + void pin_on(int device_id) { + /* TODO: how can we pin memory on a device? */ + } + + /* Unpin the memory on a given device */ + void unpin_on(int device_id) { + /* TODO: how can we unpin memory on a device? */ + } + + bool is_valid() const { + return !!m_data; + } + + operator bool() const { + return is_valid(); + } + + std::size_t size() const { + return m_count; + } + + /* Reallocate the buffer with count elements */ + void reset(std::size_t count) { + /* TODO: can we resize if count is smaller than m_count? */ + /* drop the current data and reallocate */ + reset(); + if (count == 0) { + m_data = parsec_data_ptr(nullptr, &delete_null_parsec_data); + m_host_data = host_data_ptr(nullptr, &delete_non_owned); + } else { + m_data = parsec_data_ptr(parsec_data_new(), &parsec_data_destroy); + m_host_data = host_data_ptr(new element_type[count], &delete_owned); + /* create the host copy with the allocated memory */ + create_host_copy(); + } + m_count = count; + /* don't touch the ttg_copy, we still belong to the same container */ + } + + /* Reset the buffer to use the ptr to count elements */ + void reset(T* ptr, std::size_t count = 1) { + /* TODO: can we resize if count is smaller than m_count? */ + /* drop the current data and reallocate */ + reset(); + if (nullptr == ptr) { + m_data = parsec_data_ptr(nullptr, &delete_null_parsec_data); + m_host_data = host_data_ptr(nullptr, &delete_non_owned); + m_count = 0; + } else { + m_data = parsec_data_ptr(parsec_data_new(), &parsec_data_destroy); + m_host_data = host_data_ptr(ptr, &delete_non_owned); + /* create the host copy with the allocated memory */ + create_host_copy(); + m_count = count; + } + /* don't touch the ttg_copy, we still belong to the same container */ + } + + /* serialization support */ + +#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL + template <typename Archive> + std::enable_if_t || + std::is_base_of_v> + serialize(Archive& ar) { + if constexpr (ttg::detail::is_output_archive_v) { // braces were missing around this multi-statement branch + std::size_t s = size(); + assert(m_ttg_copy != nullptr); // only tracked objects allowed + m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); + ar(s); + } else { + std::size_t s; + ar(s); + reset(s); + assert(m_ttg_copy != nullptr); // only tracked objects allowed + m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); + } + // the payload itself travels via the registered iovec, as in the MADNESS variant below + } +#endif // TTG_SERIALIZATION_SUPPORTS_CEREAL + +#ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS + template <typename Archive> + std::enable_if_t || + std::is_base_of_v> + serialize(Archive& ar) { + if constexpr (ttg::detail::is_output_archive_v) { + std::size_t s = size(); + ar & s; + assert(m_ttg_copy != nullptr); // only tracked objects allowed + m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); + } else { + std::size_t s; + ar & s; + /* initialize internal pointers and then reset */ + reset(s); + assert(m_ttg_copy != nullptr); // only tracked objects allowed + m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); + } + } +#endif // TTG_SERIALIZATION_SUPPORTS_MADNESS + + +}; + +template <typename T> +struct is_buffer : std::false_type +{ }; + +template <typename T> +struct is_buffer<buffer<T>> : std::true_type +{ }; + +template <typename T> +constexpr static const bool is_buffer_v = is_buffer<T>::value; + +namespace detail { + template +
parsec_data_t* get_parsec_data(const ttg_parsec::buffer& db) { + return const_cast(db.m_data.get()); + } +} // namespace detail + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_BUFFER_H \ No newline at end of file diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h new file mode 100644 index 0000000000..f509138f9b --- /dev/null +++ b/ttg/ttg/parsec/devicefunc.h @@ -0,0 +1,192 @@ +#ifndef TTG_PARSEC_DEVICEFUNC_H +#define TTG_PARSEC_DEVICEFUNC_H + +#if defined(TTG_HAVE_CUDART) +#include +#endif + +#include "ttg/parsec/task.h" +#include + +#if defined(PARSEC_HAVE_CUDA) +#include +#endif // PARSEC_HAVE_CUDA + +namespace ttg_parsec { + namespace detail { + template + inline bool register_device_memory(std::tuple &views, std::index_sequence) { + static_assert(I < MAX_PARAM_COUNT, + "PaRSEC only supports MAX_PARAM_COUNT device input/outputs. " + "Increase MAX_PARAM_COUNT and recompile PaRSEC/TTG."); + using view_type = std::remove_reference_t>>; + auto& view = std::get(views); + bool is_current = false; + static_assert(ttg::is_buffer_v || ttg_parsec::is_devicescratch_v); + /* get_parsec_data is overloaded for buffer and devicescratch */ + parsec_data_t* data = detail::get_parsec_data(view); + /* TODO: check whether the device is current */ + + auto flow_flags = PARSEC_FLOW_ACCESS_RW; + bool pushout = false; + if constexpr (std::is_const_v) { + flow_flags = PARSEC_FLOW_ACCESS_READ; + } else if constexpr (ttg_parsec::is_devicescratch_v) { + switch(view.scope()) { + case ttg::scope::Allocate: + flow_flags = PARSEC_FLOW_ACCESS_NONE; + break; + case ttg::scope::SyncIn: + flow_flags = PARSEC_FLOW_ACCESS_READ; + break; + } + } + assert(nullptr != detail::parsec_ttg_caller->dev_ptr); + parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task; + parsec_flow_t *flows = detail::parsec_ttg_caller->dev_ptr->flows; + + std::cout << "register_device_memory task " << detail::parsec_ttg_caller << " data " << I << " " + << data << " size " << data->nb_elts << std::endl; + + /* build the flow */ + /* TODO: reuse the flows of the task class? How can we control the sync direction then? */ + flows[I] = parsec_flow_t{.name = nullptr, + .sym_type = PARSEC_SYM_INOUT, + .flow_flags = static_cast(flow_flags), + .flow_index = I, + .flow_datatype_mask = ~0 }; + + gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes + gpu_task->flow[I] = &flows[I]; + + if (pushout) { + std::cout << "PUSHOUT " << I << std::endl; + gpu_task->pushout |= 1<parsec_task.data[I].data_in = data->device_copies[data->owner_device]; + detail::parsec_ttg_caller->parsec_task.data[I].data_in = data->device_copies[0]; + detail::parsec_ttg_caller->parsec_task.data[I].source_repo_entry = NULL; + + if constexpr (sizeof...(Is) > 0) { + is_current |= register_device_memory(views, std::index_sequence{}); + } + return is_current; + } + } // namespace detail + + /* Takes a tuple of ttg::Views or ttg::buffers and register them + * with the currently executing task. Returns true if all memory + * is current on the target device, false if transfers are required. 
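+ * + * Illustrative call pattern from inside a device task (a and b are hypothetical + * ttg::buffer arguments of that task): + * + *   auto views = std::tie(a, b); + *   bool is_current = ttg_parsec::register_device_memory(views); + *   // if false, the runtime must stage the data to the device before launching the kernel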
*/ + template + inline bool register_device_memory(std::tuple &views) { + if (nullptr == detail::parsec_ttg_caller) { + throw std::runtime_error("register_device_memory may only be invoked from inside a task!"); + } + + if (nullptr == detail::parsec_ttg_caller->dev_ptr) { + throw std::runtime_error("register_device_memory called inside a non-gpu task!"); + } + + bool is_current = detail::register_device_memory(views, std::index_sequence_for{}); + + /* reset all entries in the current task */ + for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) { + detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr; + detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE; + detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i; + detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i]; + detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0; + } + + return is_current; + } + + namespace detail { + template + inline void mark_device_out(std::tuple &views, std::index_sequence) { + + using view_type = std::remove_reference_t>>; + auto& view = std::get(views); + + /* get_parsec_data is overloaded for buffer and devicescratch */ + parsec_data_t* data = detail::get_parsec_data(view); + /* find the data copy and mark it as pushout */ + int i = 0; + parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task; + parsec_gpu_exec_stream_t *stream = detail::parsec_ttg_caller->dev_ptr->stream; + /* enqueue the transfer into the compute stream to come back once the compute and transfer are complete */ + +#if defined(TTG_HAVE_CUDART) && defined(PARSEC_HAVE_CUDA) + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)stream; + cudaMemcpyAsync(data->device_copies[0]->device_private, + data->device_copies[data->owner_device]->device_private, + data->nb_elts, cudaMemcpyDeviceToHost, cuda_stream->cuda_stream); +#else + static_assert(DeviceAvail, "No device implementation detected!"); +#endif // defined(PARSEC_HAVE_CUDA) + +#if 0 + while (detail::parsec_ttg_caller->parsec_task.data[i].data_in != nullptr) { + if (detail::parsec_ttg_caller->parsec_task.data[i].data_in == data->device_copies[0]) { + gpu_task->pushout |= 1< 0) { + // recursion + mark_device_out(views, std::index_sequence{}); + } + } + } // namespace detail + + template + inline void mark_device_out(std::tuple &b) { + + if (nullptr == detail::parsec_ttg_caller) { + throw std::runtime_error("mark_device_out may only be invoked from inside a task!"); + } + + if (nullptr == detail::parsec_ttg_caller->dev_ptr) { + throw std::runtime_error("mark_device_out called inside a non-gpu task!"); + } + + detail::mark_device_out(b, std::index_sequence_for{}); + } + + namespace detail { + + template + inline void post_device_out(std::tuple &views, std::index_sequence) { + + using view_type = std::remove_reference_t>>; + + if constexpr (!std::is_const_v) { + auto& view = std::get(views); + + /* get_parsec_data is overloaded for buffer and devicescratch */ + parsec_data_t* data = detail::get_parsec_data(view); + + data->device_copies[0]->version++; + data->owner_device = 0; + } + + if constexpr (sizeof...(Is) > 0) { + // recursion + post_device_out(views, std::index_sequence{}); + } + } + } // namespace detail + template + inline void post_device_out(std::tuple &b) { + detail::post_device_out(b, std::index_sequence_for{}); + } + + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_DEVICEFUNC_H \ No newline at end of file diff --git 
a/ttg/ttg/parsec/devicescratch.h b/ttg/ttg/parsec/devicescratch.h new file mode 100644 index 0000000000..1c0487cdfb --- /dev/null +++ b/ttg/ttg/parsec/devicescratch.h @@ -0,0 +1,139 @@ +#ifndef TTG_PARSEC_DEVICESCRATCH_H +#define TTG_PARSEC_DEVICESCRATCH_H + +// TODO: replace with short vector +#define TTG_PARSEC_MAX_NUM_DEVICES 4 + +#include +#include +#include +#include +#include + +namespace ttg_parsec { + +namespace detail { + // fwd decl + template + parsec_data_t* get_parsec_data(const ttg_parsec::devicescratch&); +} // namespace detail + +/** + * Scratch-space for task-local variables. + * TTG will allocate memory on the device + * and transfer data in and out based on the scope. + */ +template +struct devicescratch { + + using element_type = std::decay_t; + + static_assert(std::is_trivially_copyable_v, + "Only trivially copyable types are supported for devices."); + static_assert(std::is_default_constructible_v, + "Only default constructible types are supported for devices."); + +private: + + parsec_data_t* m_data = nullptr; + parsec_data_copy_t m_data_copy; + ttg::scope m_scope; + + void create_host_copy(element_type *ptr, std::size_t count) { + /* TODO: is the construction call necessary? */ + /* TODO: handle the scope */ + PARSEC_OBJ_CONSTRUCT(&m_data_copy, parsec_data_copy_t); + m_data_copy.device_index = 0; + //m_data_copy.original = &m_data; + //m_data_copy.older = NULL; + m_data_copy.flags = PARSEC_DATA_FLAG_PARSEC_MANAGED; + m_data_copy.dtt = parsec_datatype_int8_t; + m_data_copy.version = 1; + m_data_copy.device_private = ptr; + m_data_copy.coherency_state = PARSEC_DATA_COHERENCY_SHARED; + + m_data->nb_elts = count * sizeof(element_type); + m_data->owner_device = 0; + parsec_data_copy_attach(m_data, &m_data_copy, 0); + } + + friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::devicescratch&); + +public: + + /* Constructing a devicescratch using application-managed memory. + * The memory pointed to by ptr must be accessible during + * the life-time of the devicescratch. 
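+ * + * Sketch (illustrative; see also ttg::make_scratch in ttg/devicescratch.h): + * + *   double norm = 0.0;  // task-local host variable + *   ttg_parsec::devicescratch<double> scratch(&norm, ttg::scope::SyncIn); + *   // scope::Allocate would request device-only scratch without the copy-in + *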
*/ + devicescratch(element_type* ptr, ttg::scope scope = ttg::scope::SyncIn, std::size_t count = 1) + : m_data(parsec_data_new()) + , m_scope(scope) { + create_host_copy(ptr, count); + } + + /* don't allow moving */ + devicescratch(devicescratch&&) = delete; + + /* don't allow copying */ + devicescratch(const devicescratch& db) = delete; + + /* don't allow moving */ + devicescratch& operator=(devicescratch&&) = delete; + + /* don't allow copying */ + devicescratch& operator=(const devicescratch& db) = delete; + + ~devicescratch() { + PARSEC_OBJ_DESTRUCT(&m_data_copy); + parsec_data_destroy(m_data); + m_data = nullptr; + } + + /* get the current device pointer */ + element_type* device_ptr() { + assert(is_valid()); + return static_cast(m_data->device_copies[m_data->owner_device]->device_private); + } + + /* get the current device pointer */ + const element_type* device_ptr() const { + assert(is_valid()); + return static_cast(m_data->device_copies[m_data->owner_device]->device_private); + } + + bool is_valid() const { + // TODO: how to get the current device + // return (m_data->owner_device == parsec_current_device); + return true; + } + + ttg::scope scope() const { + return m_scope; + } + + std::size_t size() const { + return (m_data->nb_elts / sizeof(element_type)); + } + +}; + +template +struct is_devicescratch : std::false_type +{ }; + +template +struct is_devicescratch> : std::true_type +{ }; + +template +constexpr static const bool is_devicescratch_v = is_devicescratch::value; + +namespace detail { + template + parsec_data_t* get_parsec_data(const ttg_parsec::devicescratch& scratch) { + return const_cast(scratch.m_data); + } +} // namespace detail + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_DEVICESCRATCH_H \ No newline at end of file diff --git a/ttg/ttg/parsec/fwd.h b/ttg/ttg/parsec/fwd.h index 959dbdaac0..53c637136a 100644 --- a/ttg/ttg/parsec/fwd.h +++ b/ttg/ttg/parsec/fwd.h @@ -13,6 +13,26 @@ namespace ttg_parsec { template > class TT; + template + struct ptr; + + template + struct buffer; + template + struct devicescratch; + + template + inline bool register_device_memory(std::tuple &views); + + template + inline void mark_device_out(std::tuple &b); + + template + inline void post_device_out(std::tuple &b); + + /* the query of the parsec backend only returns whether the data should be marked for pushout */ + using query_result_type = bool; + /// \internal the OG name template using Op [[deprecated("use TT instead")]] = TT>; @@ -26,6 +46,7 @@ namespace ttg_parsec { inline void ttg_finalize(); + [[noreturn]] static inline void ttg_abort(); inline ttg::World ttg_default_execution_context(); @@ -51,6 +72,21 @@ namespace ttg_parsec { template static void ttg_broadcast(ttg::World world, T &data, int source_rank); + namespace device { + class DeviceAllocator; + std::size_t nb_devices(); + } + +#if 0 + template + inline std::pair>...>> get_ptr(Args&&... args); +#endif + template + inline ptr> get_ptr(T&& obj); + + template + inline ptr make_ptr(Args&&... 
args); + } // namespace ttg_parsec + +#endif // TTG_PARSEC_FWD_H
diff --git a/ttg/ttg/parsec/ptr.h b/ttg/ttg/parsec/ptr.h new file mode 100644 index 0000000000..6499e050b7 --- /dev/null +++ b/ttg/ttg/parsec/ptr.h @@ -0,0 +1,282 @@ +#ifndef TTG_PARSEC_PTR_H +#define TTG_PARSEC_PTR_H + +#include <unordered_map> +#include <mutex> + +#include "ttg/parsec/ttg_data_copy.h" +#include "ttg/parsec/thread_local.h" +#include "ttg/parsec/task.h" + +namespace ttg_parsec { + + // fwd decl + template <typename T> + struct ptr; + + namespace detail { + /* fwd decl */ + template <typename Value> + inline ttg_data_copy_t *create_new_datacopy(Value &&value); + + struct ptr { + using copy_type = detail::ttg_data_copy_t; + + private: + static inline std::unordered_map<ptr*, bool> m_ptr_map; + static inline std::mutex m_ptr_map_mtx; + + copy_type *m_copy = nullptr; + + void drop_copy() { + if (nullptr != m_copy) { // guard the trace and the deref: drop_copy() may run after a move left m_copy empty + std::cout << "ptr drop_copy " << m_copy << " ref " << m_copy->num_ref() << std::endl; + if (1 == m_copy->drop_ref()) { + delete m_copy; + } + m_copy = nullptr; + } + } + + void register_self() { + /* insert ourselves into the list of ptr */ + std::lock_guard _{m_ptr_map_mtx}; // named guard: an unnamed temporary would unlock immediately + m_ptr_map.insert(std::pair{this, true}); + } + + void deregister_self() { + /* remove ourselves from the list of ptr */ + std::lock_guard _{m_ptr_map_mtx}; + if (m_ptr_map.contains(this)) { + m_ptr_map.erase(this); + } + } + + public: + ptr(copy_type *copy) + : m_copy(copy) + { + register_self(); + m_copy->add_ref(); + std::cout << "ptr copy_obj ref " << m_copy->num_ref() << std::endl; + } + + copy_type* get_copy() const { + return m_copy; + } + + ptr(const ptr& p) + : m_copy(p.m_copy) + { + register_self(); + m_copy->add_ref(); + std::cout << "ptr cpy " << m_copy << " ref " << m_copy->num_ref() << std::endl; + } + + ptr(ptr&& p) + : m_copy(p.m_copy) + { + register_self(); + p.m_copy = nullptr; + std::cout << "ptr mov " << m_copy << " ref " << m_copy->num_ref() << std::endl; + } + + ~ptr() { + deregister_self(); + drop_copy(); + } + + ptr& operator=(const ptr& p) + { + drop_copy(); + m_copy = p.m_copy; + m_copy->add_ref(); + std::cout << "ptr cpy " << m_copy << " ref " << m_copy->num_ref() << std::endl; + return *this; + } + + ptr& operator=(ptr&& p) { + drop_copy(); + m_copy = p.m_copy; + p.m_copy = nullptr; + std::cout << "ptr mov " << m_copy << " ref " << m_copy->num_ref() << std::endl; + return *this; + } + + bool is_valid() const { + return (nullptr != m_copy); + } + + void reset() { + drop_copy(); + } + + /* drop all currently registered ptr + * \note this function is not thread-safe + * and should only be called at the + * end of the execution, e.g., during finalize. + */ + static void drop_all_ptr() { + for(auto it : m_ptr_map) { + it.first->drop_copy(); + } + } + }; + + + template <typename T> + ttg_parsec::detail::ttg_data_copy_t* get_copy(ttg_parsec::ptr<T>& p); + } // namespace detail + + template <typename T, typename... Args> + ptr<T> make_ptr(Args&&... args); // declared unqualified: we are already inside namespace ttg_parsec + + template <typename T> + ptr<std::decay_t<T>> get_ptr(T&& obj); + + template <typename T> + struct ptr { + + using value_type = std::decay_t<T>; + + private: + using copy_type = detail::ttg_data_value_copy_t<value_type>; + + std::unique_ptr<detail::ptr> m_ptr; + + /* only PaRSEC backend functions are allowed to touch our private parts */ + template <typename S, typename... Args> + friend ptr<S> make_ptr(Args&&...
args); + template <typename S> + friend ptr<std::decay_t<S>> get_ptr(S&& obj); + template <typename S> + friend detail::ttg_data_copy_t* detail::get_copy(ptr<S>& p); + friend ttg::detail::value_copy_handler; + + /* only accessible by get_ptr and make_ptr */ + ptr(detail::ptr::copy_type *copy) + : m_ptr(new detail::ptr(copy)) + { } + + copy_type* get_copy() const { + return static_cast<copy_type*>(m_ptr->get_copy()); + } + + public: + + ptr() = default; + + ptr(const ptr& p) + : ptr(p.get_copy()) + { } + + ptr(ptr&& p) = default; + + ~ptr() = default; + + ptr& operator=(const ptr& p) { + m_ptr.reset(new detail::ptr(p.get_copy())); + return *this; + } + + ptr& operator=(ptr&& p) = default; + + value_type& operator*() const { + return **static_cast<copy_type*>(m_ptr->get_copy()); + } + + value_type* operator->() const { // operator-> must yield a pointer, not a reference + return &(**static_cast<copy_type*>(m_ptr->get_copy())); + } + + bool is_valid() const { + return m_ptr && m_ptr->is_valid(); + } + + void reset() { + m_ptr.reset(); + } + }; + +#if 0 + namespace detail { + template + inline auto get_ptr(Arg&& obj) { + + for (int i = 0; i < detail::parsec_ttg_caller->data_count; ++i) { + detail::ttg_data_copy_t *copy = detail::parsec_ttg_caller->copies[i]; + if (nullptr != copy) { + if (copy->get_ptr() == &obj) { + bool is_ready = true; + /* TODO: how can we force-sync host and device? Current data could be on either. */ +#if 0 + /* check all tracked device data for validity */ + for (auto it : copy) { + parsec_data_t *data = *it; + for (int i = 0; i < parsec_nb_devices; ++i) { + if (nullptr != data->device_copies[i]) { + + } else { + is_ready = false; + } + } + } +#endif // 0 + return std::make_pair(is_ready, std::tuple{ttg_parsec::ptr>(copy)}); + } + } + } + + throw std::runtime_error("ttg::get_ptr called on an unknown object!"); + } + } + + template + inline std::pair>...>> get_ptr(Args&&... args) { + if (nullptr == detail::parsec_ttg_caller) { + throw std::runtime_error("ttg::get_ptr called outside of a task!"); + } + + bool ready = true; + auto fn = [&](auto&& arg){ + auto pair = get_ptr(std::forward(arg)); + ready &= pair.first; + return std::move(pair.second); + }; + std::tuple>...> tpl = {(fn(std::forward(args)))...}; + return {ready, std::move(tpl)}; + } +#endif // 0 + + template <typename T> + inline ptr<std::decay_t<T>> get_ptr(T&& obj) { + using ptr_type = ptr<std::decay_t<T>>; + if (nullptr != detail::parsec_ttg_caller) { + for (int i = 0; i < detail::parsec_ttg_caller->data_count; ++i) { + detail::ttg_data_copy_t *copy = detail::parsec_ttg_caller->copies[i]; + if (nullptr != copy) { + if (copy->get_ptr() == &obj) { + return ptr_type(copy); + } + } + } + } + /* object not tracked, make a new ptr that is now tracked */ + detail::ttg_data_copy_t *copy = detail::create_new_datacopy(obj); + return ptr_type(copy); + } + + template <typename T, typename... Args> + inline ptr<T> make_ptr(Args&&...
args) { + detail::ttg_data_copy_t *copy = detail::create_new_datacopy(T(std::forward<Args>(args)...)); + return ptr<T>(copy); + } + + namespace detail { + template <typename T> + detail::ttg_data_copy_t* get_copy(ttg_parsec::ptr<T>& p) { + return p.get_copy(); + } + } // namespace detail + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_PTR_H \ No newline at end of file
diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h new file mode 100644 index 0000000000..3636e903a5 --- /dev/null +++ b/ttg/ttg/parsec/task.h @@ -0,0 +1,210 @@ +#ifndef TTG_PARSEC_TASK_H +#define TTG_PARSEC_TASK_H + +#include "ttg/parsec/ttg_data_copy.h" + +#include +#include + +namespace ttg_parsec { + + namespace detail { + + struct device_ptr_t { + parsec_gpu_task_t* gpu_task = nullptr; + parsec_flow_t* flows = nullptr; + parsec_gpu_exec_stream_t* stream = nullptr; + }; + + template + struct device_state_t + { + static constexpr bool support_device = false; + static constexpr size_t num_flows = 0; + device_state_t() + { } + static constexpr device_ptr_t* dev_ptr() { + return nullptr; + } + }; + + template<> + struct device_state_t { + static constexpr bool support_device = false; + static constexpr size_t num_flows = MAX_PARAM_COUNT; + parsec_flow_t m_flows[num_flows]; + device_ptr_t m_dev_ptr = {nullptr, &m_flows[0], nullptr}; // gpu_task will be allocated in each task + device_ptr_t* dev_ptr() { + return &m_dev_ptr; + } + }; + + typedef parsec_hook_return_t (*parsec_static_op_t)(void *); // static_op will be cast to this type + + struct parsec_ttg_task_base_t { + parsec_task_t parsec_task; + int32_t in_data_count = 0; ///< number of satisfied inputs + int32_t data_count = 0; ///< number of data elements in the copies array + ttg_data_copy_t **copies; ///< pointer to the fixed copies array of the derived task + parsec_hash_table_item_t tt_ht_item = {}; + parsec_static_op_t function_template_class_ptr[ttg::runtime_traits::num_execution_spaces] = + {nullptr}; + + typedef struct { + std::size_t goal; + std::size_t size; + } size_goal_t; + + typedef void (release_task_fn)(parsec_ttg_task_base_t*); + /* Poor-man's virtual function + * We cannot use virtual inheritance or private visibility because we + * need offsetof for the mempool and scheduling. + */ + release_task_fn* release_task_cb = nullptr; + device_ptr_t* dev_ptr; + bool remove_from_hash = true; + bool is_dummy = false; + bool defer_writer = TTG_PARSEC_DEFER_WRITER; // whether to defer writer instead of creating a new copy + + + /* + virtual void release_task() = 0; + */ + //public: + void release_task() { + release_task_cb(this); + } + + protected: + /** + * Protected constructors: this class should not be instantiated directly + * but always be used through parsec_ttg_task_t.
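+ * (Concretely, the parsec_ttg_task_t specializations below derive from this + * base and supply the key storage, per-stream size goals, and the copies array.)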
+ */ + + parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, + int data_count, ttg_data_copy_t **copies, device_ptr_t *dev_ptr, + bool defer_writer = TTG_PARSEC_DEFER_WRITER) + : data_count(data_count) + , copies(copies) + , dev_ptr(dev_ptr) + , defer_writer(defer_writer) { + PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super); + parsec_task.mempool_owner = mempool; + parsec_task.task_class = task_class; + parsec_task.priority = 0; + } + + parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, + parsec_taskpool_t *taskpool, int32_t priority, + int data_count, ttg_data_copy_t **copies, device_ptr_t *dev_ptr, + release_task_fn *release_fn, + bool defer_writer = TTG_PARSEC_DEFER_WRITER) + : data_count(data_count) + , copies(copies) + , release_task_cb(release_fn) + , dev_ptr(dev_ptr) + , defer_writer(defer_writer) { + PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super); + parsec_task.mempool_owner = mempool; + parsec_task.task_class = task_class; + parsec_task.status = PARSEC_TASK_STATUS_HOOK; + parsec_task.taskpool = taskpool; + parsec_task.priority = priority; + parsec_task.chore_mask = 1<<0; + } + + public: + void set_dummy(bool d) { is_dummy = d; } + bool dummy() { return is_dummy; } + }; + + template > + struct parsec_ttg_task_t : public parsec_ttg_task_base_t { + using key_type = typename TT::key_type; + static constexpr size_t num_streams = TT::numins; + /* device tasks may have to store more copies than it's inputs as their sends are aggregated */ + static constexpr size_t num_copies = TT::derived_has_cuda_op() ? static_cast(MAX_PARAM_COUNT) + : (num_streams+1); + TT* tt; + key_type key; + size_goal_t stream[num_streams] = {}; +#ifdef TTG_HAS_COROUTINE + void* suspended_task_address = nullptr; // if not null the function is suspended +#endif + ttg_data_copy_t *copies[num_copies] = { nullptr }; // the data copies tracked by this task + device_state_t dev_state; + + parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class) + : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies, dev_state.dev_ptr()) { + tt_ht_item.key = pkey(); + + // We store the hash of the key and the address where it can be found in locals considered as a scratchpad + *(uintptr_t*)&(parsec_task.locals[0]) = 0; //there is no key + *(uintptr_t*)&(parsec_task.locals[2]) = 0; //there is no key + } + + parsec_ttg_task_t(const key_type& key, parsec_thread_mempool_t *mempool, + parsec_task_class_t *task_class, parsec_taskpool_t *taskpool, + TT *tt_ptr, int32_t priority) + : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority, + num_streams, copies, dev_state.dev_ptr(), + &release_task, tt_ptr->m_defer_writer) + , tt(tt_ptr), key(key) { + tt_ht_item.key = pkey(); + + // We store the hash of the key and the address where it can be found in locals considered as a scratchpad + uint64_t hv = ttg::hash>{}(key); + *(uintptr_t*)&(parsec_task.locals[0]) = hv; + *(uintptr_t*)&(parsec_task.locals[2]) = reinterpret_cast(&this->key); + } + + static void release_task(parsec_ttg_task_base_t* task_base) { + parsec_ttg_task_t *task = static_cast(task_base); + TT *tt = task->tt; + tt->release_task(task); + } + + parsec_key_t pkey() { return reinterpret_cast(&key); } + }; + + template + struct parsec_ttg_task_t : public parsec_ttg_task_base_t { + static constexpr size_t num_streams = TT::numins; + TT* tt; + size_goal_t stream[num_streams] = {}; +#ifdef TTG_HAS_COROUTINE + void* suspended_task_address = nullptr; // if not null the 
+
+  template <typename TT>
+  struct parsec_ttg_task_t<TT, true> : public parsec_ttg_task_base_t {
+    static constexpr size_t num_streams = TT::numins;
+    TT* tt;
+    size_goal_t stream[num_streams] = {};
+#ifdef TTG_HAS_COROUTINE
+    void* suspended_task_address = nullptr;  // if not null the function is suspended
+#endif
+    ttg_data_copy_t *copies[num_streams+1] = { nullptr };  // the data copies tracked by this task
+                                                           // +1 for the copy needed during send/bcast
+    device_state_t dev_state;
+
+    parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
+        : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies, dev_state.dev_ptr()) {
+      tt_ht_item.key = pkey();
+    }
+
+    parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
+                      parsec_taskpool_t *taskpool, TT *tt_ptr, int32_t priority)
+        : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority,
+                                 num_streams, copies, dev_state.dev_ptr(),
+                                 &release_task, tt_ptr->m_defer_writer)
+        , tt(tt_ptr) {
+      tt_ht_item.key = pkey();
+    }
+
+    static void release_task(parsec_ttg_task_base_t* task_base) {
+      parsec_ttg_task_t *task = static_cast<parsec_ttg_task_t*>(task_base);
+      TT *tt = task->tt;
+      tt->release_task(task);
+    }
+
+    parsec_key_t pkey() { return 0; }
+  };
+
+
+  } // namespace detail
+
+} // namespace ttg_parsec
+
+#endif // TTG_PARSEC_TASK_H
\ No newline at end of file
diff --git a/ttg/ttg/parsec/thread_local.h b/ttg/ttg/parsec/thread_local.h
new file mode 100644
index 0000000000..54b98885ec
--- /dev/null
+++ b/ttg/ttg/parsec/thread_local.h
@@ -0,0 +1,22 @@
+#ifndef TTG_PARSEC_THREAD_LOCAL_H
+#define TTG_PARSEC_THREAD_LOCAL_H
+
+namespace ttg_parsec {
+
+namespace detail {
+
+  // fwd decls
+  struct parsec_ttg_task_base_t;
+  struct ttg_data_copy_t;
+
+  inline thread_local parsec_ttg_task_base_t *parsec_ttg_caller = nullptr;
+
+  inline ttg_data_copy_t*& ttg_data_copy_container() {
+    static thread_local ttg_data_copy_t *ptr = nullptr;
+    return ptr;
+  }
+
+} // namespace detail
+} // namespace ttg_parsec
+
+#endif // TTG_PARSEC_THREAD_LOCAL_H
\ No newline at end of file
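Call sites in ttg.h below save and restore detail::parsec_ttg_caller by hand around every user-op invocation. The discipline they follow is equivalent to this small RAII guard (a hypothetical helper, not part of the patch):

    // Hypothetical helper: scoped save/restore of the thread-local
    // current-task pointer consulted by find_copy_in_task() and friends.
    struct caller_guard {
      ttg_parsec::detail::parsec_ttg_task_base_t *saved;
      explicit caller_guard(ttg_parsec::detail::parsec_ttg_task_base_t *t)
          : saved(ttg_parsec::detail::parsec_ttg_caller) {
        ttg_parsec::detail::parsec_ttg_caller = t;
      }
      ~caller_guard() { ttg_parsec::detail::parsec_ttg_caller = saved; }
    };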
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index b0d8a91865..256341dba7 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -7,6 +7,11 @@
 #define TTG_USE_PARSEC 1
 #endif  // !defined(TTG_IMPL_NAME)
 
+/* Whether to defer a potential writer if there are readers.
+ * This may avoid extra copies in exchange for concurrency.
+ * This may cause deadlocks, so use with caution. */
+#define TTG_PARSEC_DEFER_WRITER false
+
 #include "ttg/impl_selector.h"
 
 /* include ttg header to make symbols available in case this header is included directly */
@@ -31,8 +36,15 @@
 #include "ttg/serialization/data_descriptor.h"
 
+#include "ttg/view.h"
+
 #include "ttg/parsec/fwd.h"
+#include "ttg/parsec/buffer.h"
+#include "ttg/parsec/devicescratch.h"
+#include "ttg/parsec/thread_local.h"
+#include "ttg/parsec/devicefunc.h"
 
 #include
 #include
 #include
@@ -50,15 +62,30 @@
 #include
 #include
 
+// needed for MPIX_CUDA_AWARE_SUPPORT
+#include
+#include
+
+
+/* TODO: remove once we use PaRSEC master */
+#ifndef PARSEC_HAVE_CUDA
+#define PARSEC_HAVE_CUDA 1
+#endif  // PARSEC_HAVE_CUDA
+
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
+/* TODO: once we use parsec master we need to switch this include */
+#include
+#include
 #if defined(PARSEC_PROF_TRACE)
 #include
 #undef PARSEC_TTG_PROFILE_BACKEND
@@ -70,6 +97,9 @@
 #include
 
 #include "ttg/parsec/ttg_data_copy.h"
+#include "ttg/parsec/thread_local.h"
+#include "ttg/parsec/ptr.h"
+#include "ttg/parsec/task.h"
 
 #undef TTG_PARSEC_DEBUG_TRACK_DATA_COPIES
 
@@ -77,17 +107,14 @@
 #include
 #endif
 
-/* Whether to defer a potential writer if there are readers.
- * This may avoid extra copies in exchange for concurrency.
- * This may cause deadlocks, so use with caution. */
-#define TTG_PARSEC_DEFER_WRITER false
-
 /* PaRSEC function declarations */
 extern "C" {
   void parsec_taskpool_termination_detected(parsec_taskpool_t *tp);
   int parsec_add_fetch_runtime_task(parsec_taskpool_t *tp, int tasks);
 }
 
+#include "ttg/view.h"
+
 namespace ttg_parsec {
 
   inline thread_local parsec_execution_stream_t *parsec_ttg_es;
@@ -99,16 +126,28 @@
   inline std::multimap delayed_unpack_actions;
 
   struct msg_header_t {
-    typedef enum {
+    typedef enum fn_id : std::int8_t {
       MSG_SET_ARG = 0,
       MSG_SET_ARGSTREAM_SIZE = 1,
       MSG_FINALIZE_ARGSTREAM_SIZE = 2,
-      MSG_GET_FROM_PULL =3 } fn_id_t;
+      MSG_GET_FROM_PULL = 3 } fn_id_t;
     uint32_t taskpool_id;
     uint64_t op_id;
+    std::size_t key_offset = 0;
     fn_id_t fn_id;
+    std::int8_t num_iovecs = 0;
    int32_t param_id;
-    int num_keys;
+    int num_keys = 0;
+    int sender;
+
+    msg_header_t(fn_id_t fid, uint32_t tid, uint64_t oid, int32_t pid, int sender, int nk)
+      : fn_id(fid)
+      , taskpool_id(tid)
+      , op_id(oid)
+      , param_id(pid)
+      , num_keys(nk)
+      , sender(sender)
+    { }
   };
 
   static void unregister_parsec_tags(void *_);
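One caveat in the new msg_header_t constructor: C++ initializes non-static members in declaration order, not initializer-list order, so fn_id(fid) listed first still runs after taskpool_id(tid). That is harmless here because no initializer reads another member, though compilers flag it under -Wreorder. A minimal demonstration:

    #include <iostream>

    struct demo {
      int a;
      int b;
      // Initializer-list order (b before a) does not change construction order:
      // 'a' is still initialized first because it is declared first.
      demo() : b(2), a(1) {}  // -Wreorder warns; behavior is still well defined
    };

    int main() {
      demo d;
      std::cout << d.a << ' ' << d.b << '\n';  // prints: 1 2
    }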
@@ -167,8 +206,7 @@ namespace ttg_parsec {
   }  // namespace detail
 
   class WorldImpl : public ttg::base::WorldImplBase {
-    static constexpr const int _PARSEC_TTG_TAG = 10;      // This TAG should be 'allocated' at the PaRSEC level
-    static constexpr const int _PARSEC_TTG_RMA_TAG = 11;  // This TAG should be 'allocated' at the PaRSEC level
+    int32_t parsec_comm_engine_cb_idx;
 
     ttg::Edge<> m_ctl_edge;
     bool _dag_profiling;
@@ -186,6 +224,18 @@
       return comm_rank;
     }
 
+    static void ttg_parsec_ce_up(parsec_comm_engine_t *comm_engine, void *user_data)
+    {
+      parsec_ce.tag_register(WorldImpl::parsec_ttg_tag(), &detail::static_unpack_msg, user_data, PARSEC_TTG_MAX_AM_SIZE);
+      parsec_ce.tag_register(WorldImpl::parsec_ttg_rma_tag(), &detail::get_remote_complete_cb, user_data, 128);
+    }
+
+    static void ttg_parsec_ce_down(parsec_comm_engine_t *comm_engine, void *user_data)
+    {
+      parsec_ce.tag_unregister(WorldImpl::parsec_ttg_tag());
+      parsec_ce.tag_unregister(WorldImpl::parsec_ttg_rma_tag());
+    }
+
    public:
 #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
     int parsec_ttg_profile_backend_set_arg_start, parsec_ttg_profile_backend_set_arg_end;
@@ -193,7 +243,7 @@
     int parsec_ttg_profile_backend_allocate_datacopy, parsec_ttg_profile_backend_free_datacopy;
 #endif
 
-    static constexpr const int PARSEC_TTG_MAX_AM_SIZE = 1024 * 1024;
+    static constexpr const int PARSEC_TTG_MAX_AM_SIZE = 4 * 1024;
     WorldImpl(int *argc, char **argv[], int ncores, parsec_context_t *c = nullptr)
         : WorldImplBase(query_comm_size(), query_comm_rank())
         , ctx(c)
@@ -228,21 +278,27 @@
 
       es = ctx->virtual_processes[0]->execution_streams[0];
 
-      parsec_ce.tag_register(_PARSEC_TTG_TAG, &detail::static_unpack_msg, this, PARSEC_TTG_MAX_AM_SIZE);
-      parsec_ce.tag_register(_PARSEC_TTG_RMA_TAG, &detail::get_remote_complete_cb, this, 128);
+      parsec_comm_engine_cb_idx = parsec_comm_engine_register_callback(ttg_parsec_ce_up, this, ttg_parsec_ce_down, this);
 
       create_tpool();
     }
 
     void create_tpool() {
       assert(nullptr == tpool);
-      tpool = (parsec_taskpool_t *)calloc(1, sizeof(parsec_taskpool_t));
+      tpool = PARSEC_OBJ_NEW(parsec_taskpool_t);
       tpool->taskpool_id = -1;
       tpool->update_nb_runtime_task = parsec_add_fetch_runtime_task;
       tpool->taskpool_type = PARSEC_TASKPOOL_TYPE_TTG;
-      tpool->taskpool_name = (char*)"TTG Taskpool";
+      tpool->taskpool_name = strdup("TTG Taskpool");
       parsec_taskpool_reserve_id(tpool);
 
+      tpool->devices_index_mask = 0;
+      for(int i = 0; i < (int)parsec_nb_devices; i++) {
+        parsec_device_module_t *device = parsec_mca_device_get(i);
+        if( NULL == device ) continue;
+        tpool->devices_index_mask |= (1 << device->device_index);
+      }
+
 #ifdef TTG_USE_USER_TERMDET
       parsec_termdet_open_module(tpool, "user_trigger");
 #else  // TTG_USE_USER_TERMDET
@@ -254,7 +310,7 @@
       // be added by the main thread. It should then be initialized
       // to 0, execute will set it to 1 and mark the tpool as ready,
       // and the fence() will decrease it back to 0.
-      tpool->tdm.module->taskpool_set_nb_pa(tpool, 0);
+      tpool->tdm.module->taskpool_set_runtime_actions(tpool, 0);
       parsec_taskpool_enable(tpool, NULL, NULL, es, size() > 1);
 
 #if defined(PARSEC_PROF_TRACE)
@@ -284,15 +340,15 @@
 
     ~WorldImpl() { destroy(); }
 
-    static constexpr int parsec_ttg_tag() { return _PARSEC_TTG_TAG; }
-    static constexpr int parsec_ttg_rma_tag() { return _PARSEC_TTG_RMA_TAG; }
+    static constexpr int parsec_ttg_tag() { return PARSEC_DSL_TTG_TAG; }
+    static constexpr int parsec_ttg_rma_tag() { return PARSEC_DSL_TTG_RMA_TAG; }
 
     MPI_Comm comm() const { return MPI_COMM_WORLD; }
 
     virtual void execute() override {
       if (!parsec_taskpool_started) {
         parsec_enqueue(ctx, tpool);
-        tpool->tdm.module->taskpool_addto_nb_pa(tpool, 1);
+        tpool->tdm.module->taskpool_addto_runtime_actions(tpool, 1);
         tpool->tdm.module->taskpool_ready(tpool);
         [[maybe_unused]] auto ret = parsec_context_start(ctx);
         // ignore ret since all of its nonzero values are OK (e.g. -1 due to ctx already being active)
@@ -317,7 +373,7 @@
       if (is_valid()) {
         if (parsec_taskpool_started) {
           // We are locally ready (i.e. we won't add new tasks)
-          tpool->tdm.module->taskpool_addto_nb_pa(tpool, -1);
+          tpool->tdm.module->taskpool_addto_runtime_actions(tpool, -1);
           ttg::trace("ttg_parsec(", this->rank(), "): final waiting for completion");
           if (own_ctx)
             parsec_context_wait(ctx);
@@ -328,9 +384,9 @@
         ttg::detail::deregister_world(*this);
         destroy_tpool();
         if (own_ctx) {
-          unregister_parsec_tags(nullptr);
+          unregister_parsec_tags(&parsec_comm_engine_cb_idx);
         } else {
-          parsec_context_at_fini(unregister_parsec_tags, nullptr);
+          parsec_context_at_fini(unregister_parsec_tags, &parsec_comm_engine_cb_idx);
         }
 #if defined(PARSEC_PROF_TRACE)
         if(nullptr != profiling_array) {
@@ -354,8 +410,8 @@
 
     void increment_created() { taskpool()->tdm.module->taskpool_addto_nb_tasks(taskpool(), 1); }
 
-    void increment_inflight_msg() { taskpool()->tdm.module->taskpool_addto_nb_pa(taskpool(), 1); }
-    void decrement_inflight_msg() { taskpool()->tdm.module->taskpool_addto_nb_pa(taskpool(), -1); }
+    void increment_inflight_msg() { taskpool()->tdm.module->taskpool_addto_runtime_actions(taskpool(), 1); }
+    void decrement_inflight_msg() { taskpool()->tdm.module->taskpool_addto_runtime_actions(taskpool(), -1); }
 
     bool dag_profiling() override { return _dag_profiling; }
@@ -438,7 +494,10 @@
       assert(0 == tpool->profiling_array[2*position]);
       assert(0 == tpool->profiling_array[2*position+1]);
 
-      parsec_profiling_add_dictionary_keyword(name, "fill:000000", 0, NULL,
+      // TODO PROFILING: 0 and NULL should be replaced with something that depends on the key human-readable serialization...
+      // Typically, we would put something like 3*sizeof(int32_t), "m{int32_t};n{int32_t};k{int32_t}" to say
+      // there are three fields, named m, n and k, stored in this order, and each of size int32_t
+      parsec_profiling_add_dictionary_keyword(name, "fill:000000", 64, "key{char[64]}",
                                               (int*)&tpool->profiling_array[2*position],
                                               (int*)&tpool->profiling_array[2*position+1]);
     }
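The third and fourth arguments of parsec_profiling_add_dictionary_keyword describe the event payload so trace viewers can decode it; the hunk above switches from no payload (0, NULL) to a fixed 64-byte key{char[64]} blob. Following the convention sketched in the TODO, a task class with an (m, n, k) integer key could instead register like this (variable names here are hypothetical):

    /* Hypothetical example, not in this patch: describe a 3-int key so that
     * profiling tools render it as named fields instead of a raw blob. */
    int prof_key_start, prof_key_end;
    parsec_profiling_add_dictionary_keyword(
        "GEMM", "fill:00FF00",
        3 * sizeof(int32_t), "m{int32_t};n{int32_t};k{int32_t}",
        &prof_key_start, &prof_key_end);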
@@ -453,7 +512,7 @@
       }
       ttg::trace("ttg_parsec::(", rank, "): parsec taskpool is ready for completion");
       // We are locally ready (i.e. we won't add new tasks)
-      tpool->tdm.module->taskpool_addto_nb_pa(tpool, -1);
+      tpool->tdm.module->taskpool_addto_runtime_actions(tpool, -1);
       ttg::trace("ttg_parsec(", rank, "): waiting for completion");
       parsec_taskpool_wait(tpool);
@@ -479,16 +538,15 @@
 #endif
   };
 
-  static void unregister_parsec_tags(void *_)
+  static void unregister_parsec_tags(void *_pidx)
   {
-    parsec_ce.tag_unregister(WorldImpl::parsec_ttg_tag());
-    parsec_ce.tag_unregister(WorldImpl::parsec_ttg_rma_tag());
+    int32_t *pidx = static_cast<int32_t*>(_pidx);
+    parsec_comm_engine_unregister_callback(*pidx);
+    *pidx = 0;
   }
 
   namespace detail {
 
-    typedef void (*parsec_static_op_t)(void *);  // static_op will be cast to this type
-
     const parsec_symbol_t parsec_taskclass_param0 = {
       .flags = PARSEC_SYMBOL_IS_STANDALONE|PARSEC_SYMBOL_IS_GLOBAL,
       .name = "HASH0",
@@ -522,167 +580,14 @@
       .expr_inc = nullptr,
       .cst_inc = 0 };
 
-    struct parsec_ttg_task_base_t {
-      parsec_task_t parsec_task;
-      int32_t in_data_count = 0;  //< number of satisfied inputs
-      int32_t data_count = 0;     //< number of data elements in parsec_task.data
-      parsec_hash_table_item_t tt_ht_item = {};
-      parsec_static_op_t function_template_class_ptr[ttg::runtime_traits<ttg::Runtime::PaRSEC>::num_execution_spaces] =
-          {nullptr};
-      bool is_dummy = false;
-      bool defer_writer = TTG_PARSEC_DEFER_WRITER;  // whether to defer writer instead of creating a new copy
-
-      typedef void (release_task_fn)(parsec_ttg_task_base_t*);
-
-      typedef struct {
-        std::size_t goal;
-        std::size_t size;
-      } size_goal_t;
-
-      /* Poor-mans virtual function
-       * We cannot use virtual inheritance or private visibility because we
-       * need offsetof for the mempool and scheduling.
-       */
-      release_task_fn* release_task_cb = nullptr;
-      bool remove_from_hash = true;
-
-      /*
-      virtual void release_task() = 0;
-      */
-      //public:
-      void release_task() {
-        release_task_cb(this);
-      }
-
-     protected:
-      /**
-       * Protected constructors: this class should not be instantiated directly
-       * but always be used through parsec_ttg_task_t.
-       */
-
-      parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, int data_count,
-                             bool defer_writer = TTG_PARSEC_DEFER_WRITER)
-          : data_count(data_count), defer_writer(defer_writer) {
-        PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super);
-        parsec_task.mempool_owner = mempool;
-        parsec_task.task_class = task_class;
-        parsec_task.priority = 0;
-      }
-
-      parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
-                             parsec_taskpool_t *taskpool, int32_t priority, int data_count,
-                             release_task_fn *release_fn,
-                             bool defer_writer = TTG_PARSEC_DEFER_WRITER)
-          : data_count(data_count)
-          , defer_writer(defer_writer)
-          , release_task_cb(release_fn) {
-        int32_t p = priority;
-        PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super);
-        parsec_task.mempool_owner = mempool;
-        parsec_task.task_class = task_class;
-        parsec_task.status = PARSEC_TASK_STATUS_HOOK;
-        parsec_task.taskpool = taskpool;
-        parsec_task.priority = priority;
-        parsec_task.chore_id = 0;
-      }
-
-     public:
-      void set_dummy(bool d) { is_dummy = d; }
-      bool dummy() { return is_dummy; }
-    };
-
-    template <typename TT, bool KeyIsVoid = ttg::meta::is_void_v<typename TT::key_type>>
-    struct parsec_ttg_task_t : public parsec_ttg_task_base_t {
-      using key_type = typename TT::key_type;
-      static constexpr size_t num_streams = TT::numins;
-      TT* tt;
-      key_type key;
-      size_goal_t stream[num_streams] = {};
-
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
-          : parsec_ttg_task_base_t(mempool, task_class, num_streams) {
-        tt_ht_item.key = pkey();
-
-        for (int i = 0; i < num_streams; ++i) {
-          parsec_task.data[i].data_in = nullptr;
-        }
-
-        // We store the hash of the key and the address where it can be found in locals considered as a scratchpad
-        *(uintptr_t*)&(parsec_task.locals[0]) = 0;  //there is no key
-        *(uintptr_t*)&(parsec_task.locals[2]) = 0;  //there is no key
-      }
-
-      parsec_ttg_task_t(const key_type& key, parsec_thread_mempool_t *mempool,
-                        parsec_task_class_t *task_class, parsec_taskpool_t *taskpool,
-                        TT *tt_ptr, int32_t priority)
-          : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority,
-                                   num_streams, &release_task, tt_ptr->m_defer_writer)
-          , tt(tt_ptr), key(key) {
-        tt_ht_item.key = pkey();
-
-        for (int i = 0; i < num_streams; ++i) {
-          parsec_task.data[i].data_in = nullptr;
-        }
-
-        // We store the hash of the key and the address where it can be found in locals considered as a scratchpad
-        uint64_t hv = ttg::hash<std::decay_t<decltype(key)>>{}(key);
-        *(uintptr_t*)&(parsec_task.locals[0]) = hv;
-        *(uintptr_t*)&(parsec_task.locals[2]) = reinterpret_cast<uintptr_t>(&this->key);
-      }
-
-      static void release_task(parsec_ttg_task_base_t* task_base) {
-        parsec_ttg_task_t *task = static_cast<parsec_ttg_task_t*>(task_base);
-        TT *tt = task->tt;
-        tt->release_task(task);
-      }
-
-      parsec_key_t pkey() { return reinterpret_cast<parsec_key_t>(&key); }
-    };
-
-    template <typename TT>
-    struct parsec_ttg_task_t<TT, true> : public parsec_ttg_task_base_t {
-      static constexpr size_t num_streams = TT::numins;
-      TT* tt;
-      size_goal_t stream[num_streams] = {};
-
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
-          : parsec_ttg_task_base_t(mempool, task_class, num_streams) {
-        tt_ht_item.key = pkey();
-
-        for (int i = 0; i < num_streams; ++i) {
-          parsec_task.data[i].data_in = nullptr;
-        }
-      }
-
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
-                        parsec_taskpool_t *taskpool, TT *tt_ptr, int32_t priority)
-          : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority,
-                                   num_streams, &release_task, tt_ptr->m_defer_writer)
-          , tt(tt_ptr) {
-        tt_ht_item.key = pkey();
-
-        for (int i = 0; i < num_streams; ++i) {
-          parsec_task.data[i].data_in = nullptr;
-        }
-      }
-
-      static void release_task(parsec_ttg_task_base_t* task_base) {
-        parsec_ttg_task_t *task = static_cast<parsec_ttg_task_t*>(task_base);
-        TT *tt = task->tt;
-        tt->release_task(task);
-      }
-
-      parsec_key_t pkey() { return 0; }
-    };
-
     inline ttg_data_copy_t *find_copy_in_task(parsec_ttg_task_base_t *task, const void *ptr) {
       ttg_data_copy_t *res = nullptr;
       if (task == nullptr || ptr == nullptr) {
         return res;
       }
       for (int i = 0; i < task->data_count; ++i) {
-        auto copy = static_cast<ttg_data_copy_t *>(task->parsec_task.data[i].data_in);
-        if (NULL != copy && copy->device_private == ptr) {
+        auto copy = static_cast<ttg_data_copy_t *>(task->copies[i]);
+        if (NULL != copy && copy->get_ptr() == ptr) {
           res = copy;
           break;
         }
@@ -696,8 +601,8 @@
         return i;
       }
       for (i = 0; i < task->data_count; ++i) {
-        auto copy = static_cast<ttg_data_copy_t *>(task->parsec_task.data[i].data_in);
-        if (NULL != copy && copy->device_private == ptr) {
+        auto copy = static_cast<ttg_data_copy_t *>(task->copies[i]);
+        if (NULL != copy && copy->get_ptr() == ptr) {
           return i;
         }
       }
@@ -713,7 +618,7 @@
         throw std::logic_error("Too many data copies, check MAX_PARAM_COUNT!");
       }
 
-      task->parsec_task.data[task->data_count].data_in = copy;
+      task->copies[task->data_count] = copy;
       task->data_count++;
       return true;
     }
@@ -722,17 +627,17 @@
       int i;
       /* find and remove entry; copies are usually appended and removed, so start from back */
       for (i = task->data_count-1; i >= 0; --i) {
-        if (copy == task->parsec_task.data[i].data_in) {
+        if (copy == task->copies[i]) {
           break;
         }
       }
       if (i < 0) return;
       /* move all following elements one up */
       for (; i < task->data_count - 1; ++i) {
-        task->parsec_task.data[i].data_in = task->parsec_task.data[i + 1].data_in;
+        task->copies[i] = task->copies[i + 1];
       }
       /* null last element */
-      task->parsec_task.data[i].data_in = nullptr;
+      task->copies[i] = nullptr;
       task->data_count--;
     }
@@ -748,7 +653,13 @@
     template <typename Value>
     inline ttg_data_copy_t *create_new_datacopy(Value &&value) {
       using value_type = std::decay_t<Value>;
-      ttg_data_copy_t *copy = new ttg_data_value_copy_t<value_type>(std::forward<Value>(value));
+      ttg_data_copy_t *copy;
+      if constexpr (std::is_rvalue_reference_v<decltype(value)> ||
+                    std::is_copy_constructible_v<std::decay_t<Value>>) {
+        copy = new ttg_data_value_copy_t<value_type>(std::forward<Value>(value));
+      } else {
+        throw std::logic_error("Trying to copy-construct data that is not copy-constructible!");
+      }
 #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
       // Keep track of additional memory usage
       if(ttg::default_execution_context().impl().profiling()) {
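Note what the new guard in create_new_datacopy buys: a move-only value can enter the runtime only as an rvalue, while passing an lvalue of a non-copyable type still compiles (the copy branch is discarded by if constexpr) but throws at run time. A small sketch under that reading:

    #include <memory>
    #include <utility>

    // Sketch of how the is_rvalue/is_copy_constructible guard plays out
    // for a move-only payload (illustrative only).
    struct payload {
      std::unique_ptr<int> p;  // move-only: not copy-constructible
    };

    void demo() {
      payload v{std::make_unique<int>(42)};
      auto *c1 = ttg_parsec::detail::create_new_datacopy(std::move(v));  // OK: move-constructed
      // auto *c2 = ttg_parsec::detail::create_new_datacopy(v);  // would throw std::logic_error
      (void)c1;
    }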
@@ -782,10 +693,11 @@
     inline parsec_hook_return_t hook_cuda(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) {
       parsec_execution_stream_t *safe_es = parsec_ttg_es;
       parsec_ttg_es = es;
+      std::cout << "hook_cuda task " << parsec_task << std::endl;
       parsec_ttg_task_base_t *me = (parsec_ttg_task_base_t *)parsec_task;
-      me->function_template_class_ptr[static_cast<int>(ttg::ExecutionSpace::CUDA)](parsec_task);
+      auto ret = me->function_template_class_ptr[static_cast<int>(ttg::ExecutionSpace::CUDA)](parsec_task);
       parsec_ttg_es = safe_es;
-      return PARSEC_HOOK_RETURN_DONE;
+      return ret;
     }
 
     static parsec_key_fn_t parsec_tasks_hash_fcts = {.key_equal = parsec_hash_table_generic_64bits_key_equal,
@@ -872,7 +784,7 @@
     }
 
     inline void release_data_copy(ttg_data_copy_t *copy) {
-      if (copy->is_mutable()) {
+      if (copy->is_mutable() && nullptr == copy->get_next_task()) {
         /* current task mutated the data but there are no consumers so prepare
          * the copy to be freed below */
         copy->reset_readers();
@@ -882,19 +794,24 @@
       if (readers > 1) {
         /* potentially more than one reader, decrement atomically */
         readers = copy->decrement_readers();
-      }
-      /* if there was only one reader (the current task) we release the copy */
-      if (1 == readers) {
-        if (nullptr != copy->push_task) {
+      } else if (readers == 1) {
+        /* make sure readers drop to zero */
+        readers = copy->decrement_readers();
+      }
+      /* if there was only one reader (the current task) or
+       * a mutable copy and a successor, we release the copy */
+      if (1 == readers || copy->is_mutable()) {
+        if (nullptr != copy->get_next_task()) {
           /* Release the deferred task.
-           * The copy was mutable and will be mutated by the released task,
-           * so simply transfer ownership.
-           */
-          parsec_task_t *push_task = copy->push_task;
-          copy->push_task = nullptr;
-          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)push_task;
+           * The copy was mutable and will be mutated by the released task,
+           * so simply transfer ownership.
+           */
+          parsec_task_t *next_task = copy->get_next_task();
+          copy->set_next_task(nullptr);
+          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)next_task;
          deferred_op->release_task();
-        } else {
+        } else if ((1 == copy->num_ref()) || (1 == copy->drop_ref())) {
+          /* we are the last reference, delete the copy */
 #if defined(TTG_PARSEC_DEBUG_TRACK_DATA_COPIES)
           {
            const std::lock_guard<std::mutex> lock(pending_copies_mutex);
@@ -930,10 +847,10 @@
       }
 
       if (readers == copy_in->mutable_tag) {
-        if (copy_res->push_task != nullptr) {
+        if (copy_res->get_next_task() != nullptr) {
           if (readonly) {
-            parsec_ttg_task_base_t *push_task = reinterpret_cast<parsec_ttg_task_base_t *>(copy_res->push_task);
-            if (push_task->defer_writer) {
+            parsec_ttg_task_base_t *next_task = reinterpret_cast<parsec_ttg_task_base_t *>(copy_res->get_next_task());
+            if (next_task->defer_writer) {
               /* there is a writer but it signalled that it wants to wait for readers to complete */
               return copy_res;
             }
@@ -968,13 +885,12 @@
            * of the task */
           copy_in->mark_mutable();
-          assert(nullptr == copy_in->push_task);
-          assert(nullptr != task);
-          copy_in->push_task = &task->parsec_task;
+          assert(nullptr == copy_in->get_next_task());
+          copy_in->set_next_task(&task->parsec_task);
         } else {
-          if (task->defer_writer && nullptr == copy_in->push_task) {
+          if (task->defer_writer && nullptr == copy_in->get_next_task()) {
             /* we're the first writer and want to wait for all readers to complete */
-            copy_res->push_task = &task->parsec_task;
+            copy_res->set_next_task(&task->parsec_task);
           } else {
             /* there are already writers and/or waiters on this copy, make a copy that we can mutate */
             copy_res = NULL;
           }
         }
       }
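The reader/writer handshake spread across release_data_copy and register_data_copy is easiest to see end to end. The toy model below compresses it to the two calls shown above; it is illustrative only and elides the atomics and the mutable_tag encoding of the real ttg_data_copy_t:

    #include <cassert>

    // Toy model (not the real ttg_data_copy_t) of the defer-writer handshake:
    // the last reader to finish hands the copy to the parked writer.
    struct toy_copy {
      int readers = 0;
      void *next_task = nullptr;  // parked writer, set via set_next_task()
    };

    void *reader_done(toy_copy &c) {       // models release_data_copy()
      assert(c.readers > 0);
      if (--c.readers == 0 && c.next_task) {
        void *writer = c.next_task;        // transfer ownership to the writer
        c.next_task = nullptr;
        return writer;                     // caller would release_task(writer)
      }
      return nullptr;
    }

    bool writer_arrives(toy_copy &c, void *task, bool defer_writer) {
      if (c.readers == 0) return true;     // run immediately, copy is mutable
      if (defer_writer && !c.next_task) {  // park and wait for readers
        c.next_task = task;
        return false;
      }
      return true;  // would clone via create_new_datacopy() and mutate the clone
    }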
@@ -983,19 +899,19 @@
 
       if (NULL == copy_res) {
-        ttg_data_copy_t *new_copy = detail::create_new_datacopy(*static_cast<Value *>(copy_in->device_private));
-        if (replace && nullptr != copy_in->push_task) {
+        ttg_data_copy_t *new_copy = detail::create_new_datacopy(*static_cast<Value *>(copy_in->get_ptr()));
+        if (replace && nullptr != copy_in->get_next_task()) {
           /* replace the task that was deferred */
-          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)copy_in->push_task;
+          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)copy_in->get_next_task();
           new_copy->mark_mutable();
           /* replace the copy in the deferred task */
           for (int i = 0; i < deferred_op->data_count; ++i) {
-            if (deferred_op->parsec_task.data[i].data_in == copy_in) {
-              deferred_op->parsec_task.data[i].data_in = new_copy;
+            if (deferred_op->copies[i] == copy_in) {
+              deferred_op->copies[i] = new_copy;
               break;
             }
           }
-          copy_in->push_task = nullptr;
+          copy_in->set_next_task(nullptr);
           deferred_op->release_task();
           copy_in->reset_readers();      // set the copy back to being read-only
           copy_in->increment_readers();  // register as reader
@@ -1012,8 +928,6 @@
 
   }  // namespace detail
 
-  inline thread_local detail::parsec_ttg_task_base_t *parsec_ttg_caller;
-
   inline void ttg_initialize(int argc, char **argv, int num_threads, parsec_context_t *ctx) {
     if (detail::initialized_mpi()) throw std::runtime_error("ttg_parsec::ttg_initialize: can only be called once");
@@ -1041,11 +955,13 @@
     if(0 == ttg::default_execution_context().rank()) ttg::default_execution_context().impl().final_task();
     ttg::detail::set_default_world(ttg::World{});  // reset the default world
+    detail::ptr::drop_all_ptr();
     ttg::detail::destroy_worlds();
     if (detail::initialized_mpi()) MPI_Finalize();
   }
 
   inline ttg::World ttg_default_execution_context() { return ttg::get_default_world(); }
-  inline void ttg_abort() { MPI_Abort(ttg_default_execution_context().impl().comm(), 1); }
+  [[noreturn]]
+  inline void ttg_abort() { MPI_Abort(ttg_default_execution_context().impl().comm(), 1); std::abort(); }
   inline void ttg_execute(ttg::World world) { world.impl().execute(); }
   inline void ttg_fence(ttg::World world) { world.impl().fence(); }
@@ -1111,8 +1027,14 @@
       unsigned char bytes[WorldImpl::PARSEC_TTG_MAX_AM_SIZE - sizeof(msg_header_t)];
 
       msg_t() = default;
-      msg_t(uint64_t tt_id, uint32_t taskpool_id, msg_header_t::fn_id_t fn_id, int32_t param_id, int num_keys = 1)
-          : tt_id{taskpool_id, tt_id, fn_id, param_id, num_keys} {}
+      msg_t(uint64_t tt_id,
+            uint32_t taskpool_id,
+            msg_header_t::fn_id_t fn_id,
+            int32_t param_id,
+            int sender,
+            int num_keys = 1)
+          : tt_id(fn_id, taskpool_id, tt_id, param_id, sender, num_keys)
+      {}
     };
   }  // namespace detail
@@ -1135,7 +1057,7 @@
 
     // check for a non-type member named have_cuda_op
     template <typename T>
-    using have_cuda_op_non_type_t = decltype(&T::have_cuda_op);
+    using have_cuda_op_non_type_t = decltype(T::have_cuda_op);
 
     bool alive = true;
 
@@ -1144,6 +1066,7 @@
     static constexpr int numouts = std::tuple_size_v<output_terminals_type>;  // number of outputs
     static constexpr int numflows = std::max(numins, numouts);                // max number of flows
 
+   public:
     /// @return true if derivedT::have_cuda_op exists and is defined to true
     static constexpr bool derived_has_cuda_op() {
       if constexpr (ttg::meta::is_detected_v<have_cuda_op_non_type_t, derivedT>) {
         return derivedT::have_cuda_op;
       } else {
         return false;
       }
     }
 
-   public:
     using ttT = TT;
     using key_type = keyT;
     using input_terminals_type = ttg::detail::input_terminals_tuple_t<keyT, input_valueTs...>;
@@ -1230,6 +1152,13 @@
     constexpr static std::array get_from_pull_msg_fcts =
         make_get_from_pull_fcts(std::make_index_sequence<numins>{});
 
+    template <std::size_t... IS>
+    constexpr static auto make_input_is_const(std::index_sequence<IS...>) {
+      using resultT = decltype(input_is_const);
+      return resultT{{std::is_const_v<std::tuple_element_t<IS, input_args_type>>...}};
+    }
+    constexpr static std::array<bool, numins> input_is_const = make_input_is_const(std::make_index_sequence<numins>{});
+
     ttg::World world;
     ttg::meta::detail::keymap_t<keyT> keymap;
     ttg::meta::detail::keymap_t<keyT> priomap;
@@ -1246,15 +1175,36 @@
    private:
     /// dispatches a call to derivedT::op if Space == Host, otherwise to derivedT::op_cuda if Space == CUDA
+    /// @return void if called a synchronous function, or ttg::coroutine_handle<> if called a coroutine (if non-null,
+    /// points to the suspended coroutine)
     template <ttg::ExecutionSpace Space, typename... Args>
-    void op(Args &&...args) {
+    auto op(Args &&...args) {
       derivedT *derived = static_cast<derivedT *>(this);
-      if constexpr (Space == ttg::ExecutionSpace::Host)
-        derived->op(std::forward<Args>(args)...);
-      else if constexpr (Space == ttg::ExecutionSpace::CUDA)
-        derived->op_cuda(std::forward<Args>(args)...);
+      // TODO: do we still distinguish op and op_cuda? How do we handle support for multiple devices?
+      //if constexpr (Space == ttg::ExecutionSpace::Host) {
+        using return_type = decltype(derived->op(std::forward<Args>(args)...));
+        if constexpr (std::is_same_v<return_type, void>) {
+          derived->op(std::forward<Args>(args)...);
+          return;
+        }
+        else {
+          return derived->op(std::forward<Args>(args)...);
+        }
+#if 0
+      }
+      else if constexpr (Space == ttg::ExecutionSpace::CUDA) {
+        using return_type = decltype(derived->op_cuda(std::forward<Args>(args)...));
+        if constexpr (std::is_same_v<return_type, void>) {
+          derived->op_cuda(std::forward<Args>(args)...);
+          return;
+        }
+        else {
+          return derived->op_cuda(std::forward<Args>(args)...);
+        }
+      }
       else
-        abort();
+        ttg::abort();
+#endif // 0
     }
 
     template
@@ -1277,7 +1227,8 @@
         auto &world_impl = world.impl();
         parsec_taskpool_t *tp = world_impl.taskpool();
         std::unique_ptr<msg_t> msg = std::make_unique<msg_t>(get_instance_id(), tp->taskpool_id,
-                                                             msg_header_t::MSG_GET_FROM_PULL, i, 1);
+                                                             msg_header_t::MSG_GET_FROM_PULL, i,
+                                                             world.rank(), 1);
         /* pack the key */
         size_t pos = 0;
         pos = pack(key, msg->bytes, pos);
@@ -1299,60 +1250,364 @@
     template <std::size_t... IS>
     static input_refs_tuple_type make_tuple_of_ref_from_array(task_t *task, std::index_sequence<IS...>) {
       return input_refs_tuple_type{static_cast<std::tuple_element_t<IS, input_refs_tuple_type>>(
           *reinterpret_cast<std::remove_reference_t<std::tuple_element_t<IS, input_refs_tuple_type>> *>(
-              task->parsec_task.data[IS].data_in->device_private))...};
+              task->copies[IS]->get_ptr()))...};
     }
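The decltype probe in op() is the usual pattern for wrapping a callee that may return either void or a value (here, possibly a coroutine handle): void cannot be bound to a variable, so the wrapper must branch at compile time. Reduced to a standalone utility:

    #include <type_traits>
    #include <utility>

    // Standalone sketch of the void-vs-value forwarding used by op() above.
    template <typename F, typename... Args>
    auto forward_result(F &&f, Args &&...args) {
      using return_type = decltype(f(std::forward<Args>(args)...));
      if constexpr (std::is_same_v<return_type, void>) {
        f(std::forward<Args>(args)...);  // void: nothing to return
        return;
      } else {
        return f(std::forward<Args>(args)...);  // e.g. a coroutine handle
      }
    }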
+    /**
+     * Submit callback called by PaRSEC once all input transfers have completed.
+     */
     template <ttg::ExecutionSpace Space>
-    static void static_op(parsec_task_t *parsec_task) {
+    static int device_static_submit(parsec_device_gpu_module_t *gpu_device,
+                                    parsec_gpu_task_t *gpu_task,
+                                    parsec_gpu_exec_stream_t *gpu_stream) {
+
+      task_t *task = (task_t*)gpu_task->ec;
+      // get the device task from the coroutine handle
+      ttg::device_task dev_task = ttg::device_task_handle_type::from_address(task->suspended_task_address);
+
+      task->dev_ptr->stream = gpu_stream;
+
+      std::cout << "device_static_submit task " << task << std::endl;
+
+      // get the promise which contains the views
+      auto dev_data = dev_task.promise();
+
+      /* we should still be waiting for the transfer to complete */
+      assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_TRANSFER);
+
+      /* Here we call back into the coroutine again after the transfers have completed */
+      static_op<Space>(&task->parsec_task);
+
+      /* get a new handle for the promise */
+      dev_task = ttg::device_task_handle_type::from_address(task->suspended_task_address);
+      dev_data = dev_task.promise();
+
+      assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_KERNEL ||
+             dev_data.state() == ttg::TTG_DEVICE_CORO_COMPLETE);
+
+      /* we will come back into this function once the kernel and transfers are complete */
+      int rc = PARSEC_HOOK_RETURN_AGAIN;
+      if (ttg::TTG_DEVICE_CORO_COMPLETE == dev_data.state()) {
+        /* the task started sending so we won't come back here */
+        rc = PARSEC_HOOK_RETURN_DONE;
+      }
+      return rc;
+    }
+
+    template <ttg::ExecutionSpace Space>
+    static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) {
+      static_assert(derived_has_cuda_op());
+
+      int dev_index;
+      double ratio = 1.0;
 
       task_t *task = (task_t*)parsec_task;
-      ttT *baseobj = task->tt;
-      derivedT *obj = static_cast<derivedT *>(baseobj);
-      assert(parsec_ttg_caller == NULL);
-      parsec_ttg_caller = static_cast<detail::parsec_ttg_task_base_t*>(task);
-      if (obj->tracing()) {
-        if constexpr (!ttg::meta::is_void_v<keyT>)
-          ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": executing");
-        else
-          ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : executing");
-      }
-
-      if constexpr (!ttg::meta::is_void_v<keyT> && !ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
-        auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence<numins>{});
-        baseobj->template op<Space>(task->key, std::move(input), obj->output_terminals);
-      } else if constexpr (!ttg::meta::is_void_v<keyT> && ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
-        baseobj->template op<Space>(task->key, obj->output_terminals);
-      } else if constexpr (ttg::meta::is_void_v<keyT> && !ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
-        auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence<numins>{});
-        baseobj->template op<Space>(std::move(input), obj->output_terminals);
-      } else if constexpr (ttg::meta::is_void_v<keyT> && ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
-        baseobj->template op<Space>(obj->output_terminals);
-      } else {
-        abort();
+      parsec_execution_stream_s *es = task->tt->world.impl().execution_stream();
+
+      std::cout << "device_static_op: task " << parsec_task << std::endl;
+
+
+      /* set up a device task */
+      parsec_gpu_task_t *gpu_task;
+      /* PaRSEC takes ownership of the gpu_task and frees it itself, so it must come from calloc */
+      gpu_task = static_cast<parsec_gpu_task_t*>(std::calloc(1, sizeof(*gpu_task)));
+      PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
+      gpu_task->ec = parsec_task;
+      gpu_task->task_type = 0;  // user task
+      gpu_task->load = 1.0;     // TODO: can we do better?
+      gpu_task->last_data_check_epoch = -1;  // used internally
+      gpu_task->pushout = 0;
+      gpu_task->submit = &TT::device_static_submit<Space>;
+
+      /* set the gpu_task so it's available in register_device_memory */
+      task->dev_ptr->gpu_task = gpu_task;
+
+      // first invocation of the coroutine to get the coroutine handle
+      static_op<Space>(parsec_task);
+
+      /* when we come back here, the flows in gpu_task are set (see register_device_memory) */
+
+      // get the device task from the coroutine handle
+      auto dev_task = ttg::device_task_handle_type::from_address(task->suspended_task_address);
+
+      // get the promise which contains the views
+      ttg::device_task_promise_type& dev_data = dev_task.promise();
+
+      /* for now make sure we're waiting for transfers and the coro hasn't skipped this step */
+      assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_TRANSFER);
+
+      /* TODO: is this the right place to set the mask? */
+      task->parsec_task.chore_mask = PARSEC_DEV_ALL;
+      /* get a device and come back if we need another one */
+      dev_index = parsec_get_best_device(parsec_task, ratio);
+      assert(dev_index >= 0);
+      if (dev_index < 2) {
+        return PARSEC_HOOK_RETURN_NEXT;  /* Fall back */
+      }
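The dev_index < 2 early-out relies on PaRSEC's device numbering, where the first two slots are not accelerators (the host CPU and the recursive device, if memory serves; worth confirming against the PaRSEC version in use). Returning PARSEC_HOOK_RETURN_NEXT makes the scheduler try the task's next incarnation, which is how this hook falls back to the host path. A simplified model of that dispatch loop (not PaRSEC's actual code):

    // Illustrative chore-dispatch loop: a hook returning PARSEC_HOOK_RETURN_NEXT
    // makes the scheduler try the task's next incarnation, so the device path
    // can fall back to the host implementation.
    parsec_hook_return_t try_chores(parsec_execution_stream_t *es, parsec_task_t *task,
                                    parsec_hook_t *const *hooks, int nhooks) {
      for (int i = 0; i < nhooks; ++i) {
        parsec_hook_return_t rc = hooks[i](es, task);
        if (rc != PARSEC_HOOK_RETURN_NEXT) return rc;  // DONE, AGAIN, ERROR, ...
      }
      return PARSEC_HOOK_RETURN_ERROR;  // no incarnation could run the task
    }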
+
+#if 0
+      // manage the gpu_task flows
+
+      // set the input flows
+      uint8_t i = 0;
+      /* TODO: need to free flows at the end of the task lifetime */
+      parsec_flow_t* flows = new parsec_flow_t[MAX_PARAM_COUNT];
+      for (auto& view : dev_data) {
+        void *host_obj = view.host_obj();
+
+        /* iterate over all viewspans of this view (i.e., all memory ranges in this view) */
+        for (auto& view_span : view) {
+          gpu_task->flow_nb_elts[i] = view_span.size();  // size in bytes
+          gpu_task->flow[i] = &flows[i];
+          parsec_flow_t flow = {.name = nullptr,
+                                .sym_type = PARSEC_SYM_INOUT,
+                                .flow_flags = view_span.is_sync_out() ? PARSEC_FLOW_ACCESS_RW : PARSEC_FLOW_ACCESS_READ,
+                                .flow_index = i,
+                                .flow_datatype_mask = ~0 };
+          std::cout << "view_span.is_sync_out() " << view_span.is_sync_out() << std::endl;
+          *(parsec_flow_t*)gpu_task->flow[i] = flow;  // why are flows constant?!
+
+          parsec_data_copy_t* copy = nullptr;
+          ttg_parsec::detail::ttg_data_copy_t* obj_copy = nullptr;
+          int input_obj_idx = -1;
+          /* try to find the view in the task and allocate a new copy if needed */
+          for (int i = 0; nullptr == copy && i < numins; ++i) {
+            ttg_parsec::detail::ttg_data_copy_t* obj_copy = task->copies[i];
+            if (obj_copy->get_ptr() == host_obj) {
+              for (auto& dev_copy : *obj_copy) {
+                if (view_span.data() == dev_copy->device_private) {
+                  copy = dev_copy;
+                  input_obj_idx = i;
+                  break;
+                }
+              }
+            }
+          }
+
+          /* push all data back out, EXCEPT if the host object is a const input for the task
+           * TODO [JS]: if PaRSEC lets us send from the device we can avoid pushing out here
+           */
+          if (input_obj_idx == -1 || !input_is_const[input_obj_idx]) {
+            gpu_task->pushout |= 1<<i;
+          }
+
+          if (nullptr == copy) {
+            copy->device_private = view_span.data();
+            copy->version = 1;  // this version is valid
+            data->nb_elts = view_span.size();
+            data->owner_device = 0;
+            copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
+            std::cout << "copy " << copy << " device_private " << copy->device_private << std::endl;
+          }
+
+          if (obj_copy != nullptr) {
+            /* add the device copy to the object copy */
+            obj_copy->add_device_copy(copy);
+          }
+          /* register the copy with the task */
+          task->parsec_task.data[i].data_in = copy;
+          task->parsec_task.data[i].source_repo_entry = NULL;
+
+          ++i;
+        }
       }
-      parsec_ttg_caller = NULL;
-      if (obj->tracing()) {
-        if constexpr (!ttg::meta::is_void_v<keyT>)
-          ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": done executing");
-        else
-          ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : done executing");
+      /* mark all other flows as ignored */
+      for (; i < MAX_PARAM_COUNT; ++i) {
+        gpu_task->flow_nb_elts[i] = 0;
+        flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
+        flows[i].flow_index = i;
+        gpu_task->flow[i] = &flows[i];
+        task->parsec_task.data[i].data_in = nullptr;
+        task->parsec_task.data[i].source_repo_entry = NULL;
       }
+#endif // 0
+
+      parsec_device_module_t *device = parsec_mca_device_get(dev_index);
+      assert(NULL != device);
+      switch(device->type) {
+
+#if defined(PARSEC_HAVE_CUDA)
+      case PARSEC_DEV_CUDA:
+        if constexpr (Space == ttg::ExecutionSpace::CUDA) {
+          /* TODO: we need custom staging functions because PaRSEC looks at the
+           * task-class to determine the number of flows. */
+          gpu_task->stage_in = parsec_default_cuda_stage_in;
+          gpu_task->stage_out = parsec_default_cuda_stage_out;
+          return parsec_cuda_kernel_scheduler(es, gpu_task, dev_index);
+        }
+        break;
+#endif
+      default:
+        break;
+      }
+      ttg::print_error(task->tt->get_name(), " : received mismatching device type ", device->type, " from PaRSEC");
+      ttg::abort();
+      return PARSEC_HOOK_RETURN_DONE;  // will not be reached
+    }
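Between device_static_op and device_static_submit, a device task walks a three-state protocol; the state names below come from the asserts above, and the step function is a reading of how submit maps states to hook return codes (a sketch; the real states live in the ttg device_task promise):

    // Sketch of the three-state device-task protocol implied by the asserts above.
    enum class dev_state { wait_transfer, wait_kernel, complete };

    // Models device_static_submit(): it is only invoked once input transfers
    // are done, resumes the coroutine, then keeps returning AGAIN until the
    // coroutine reports completion.
    int submit_step(dev_state s) {
      switch (s) {
        case dev_state::wait_kernel:   return PARSEC_HOOK_RETURN_AGAIN;  // kernel in flight, come back
        case dev_state::complete:      return PARSEC_HOOK_RETURN_DONE;   // sends started, done here
        case dev_state::wait_transfer: break;  // submit should not see this state
      }
      return PARSEC_HOOK_RETURN_ERROR;
    }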
+
+    template <ttg::ExecutionSpace Space>
+    static parsec_hook_return_t static_op(parsec_task_t *parsec_task) {
+
+      task_t *task = (task_t*)parsec_task;
+      void* suspended_task_address =
+#ifdef TTG_HAS_COROUTINE
+        task->suspended_task_address;  // non-null = need to resume the task
+#else
+        nullptr;
+#endif
+      //std::cout << "static_op: suspended_task_address " << suspended_task_address << std::endl;
+      if (suspended_task_address == nullptr) {  // task is a coroutine that has not started or an ordinary function
+
+        ttT *baseobj = task->tt;
+        derivedT *obj = static_cast<derivedT *>(baseobj);
+        assert(detail::parsec_ttg_caller == nullptr);
+        detail::parsec_ttg_caller = static_cast<detail::parsec_ttg_task_base_t*>(task);
+        if (obj->tracing()) {
+          if constexpr (!ttg::meta::is_void_v<keyT>)
+            ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": executing");
+          else
+            ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : executing");
+        }
+
+        if constexpr (!ttg::meta::is_void_v<keyT> && !ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
+          auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence<numins>{});
+          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(task->key, std::move(input), obj->output_terminals));
+        } else if constexpr (!ttg::meta::is_void_v<keyT> && ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
+          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(task->key, obj->output_terminals));
+        } else if constexpr (ttg::meta::is_void_v<keyT> && !ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
+          auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence<numins>{});
+          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(std::move(input), obj->output_terminals));
+        } else if constexpr (ttg::meta::is_void_v<keyT> && ttg::meta::is_empty_tuple_v<input_refs_tuple_type>) {
+          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(obj->output_terminals));
+        } else {
+          ttg::abort();
+        }
+        detail::parsec_ttg_caller = nullptr;
+      }
+      else {  // resume the suspended coroutine
+        auto coro = static_cast<ttg::device_task>(ttg::device_task_handle_type::from_address(suspended_task_address));
+        assert(detail::parsec_ttg_caller == nullptr);
+        detail::parsec_ttg_caller = static_cast<detail::parsec_ttg_task_base_t*>(task);
+        // TODO: unify the outputs tls handling
+        auto old_output_tls_ptr = task->tt->outputs_tls_ptr_accessor();
+        task->tt->set_outputs_tls_ptr();
+        coro.resume();
+        if (coro.completed()) {
+          coro.destroy();
+          suspended_task_address = nullptr;
+        }
+        task->tt->set_outputs_tls_ptr(old_output_tls_ptr);
+        detail::parsec_ttg_caller = nullptr;
+#if 0
+#ifdef TTG_HAS_COROUTINE
+        auto ret = static_cast<ttg::resumable_task>(ttg::coroutine_handle<>::from_address(suspended_task_address));
+        assert(ret.ready());
+        ret.resume();
+        if (ret.completed()) {
+          ret.destroy();
+          suspended_task_address = nullptr;
+        }
+        else {  // not yet completed
+          // leave suspended_task_address as is
+        }
+        task->suspended_task_address = suspended_task_address;
+#else
+#endif // 0
+        ttg::abort();  // should not happen
+#endif
+      }
+      task->suspended_task_address = suspended_task_address;
+
+      if (suspended_task_address == nullptr) {
+        ttT *baseobj = task->tt;
+        derivedT *obj = static_cast<derivedT *>(baseobj);
+        if (obj->tracing()) {
+          if constexpr (!ttg::meta::is_void_v<keyT>)
+            ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": done executing");
+          else
+            ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : done executing");
+        }
+      }
+
+// XXX the below code is not needed, should be removed once the fib test has been changed
+#if 0
+#ifdef TTG_HAS_COROUTINE
+      if (suspended_task_address) {
+        // right now events are not properly implemented, we are only testing the workflow with dummy events
+        // so mark the events finished manually; parsec will rerun this task and it should complete the second time
+        auto events = static_cast<ttg::resumable_task>(ttg::coroutine_handle<>::from_address(suspended_task_address)).events();
+        for (auto &event_ptr : events) {
+          event_ptr->finish();
+        }
+        assert(ttg::coroutine_handle<>::from_address(suspended_task_address).promise().ready());
+
+        // TODO: shove {ptr to parsec_task, ptr to this function} to the list of tasks suspended by this thread (hence stored in TLS)
+        // thread will loop over its list (after running every task? periodically? need a dedicated queue of ready tasks?)
+        // and resume the suspended tasks whose events are ready (N.B. ptr to parsec_task is enough to get the list of pending events)
+        // event clearance will, in the device case, be handled by host callbacks run by the dedicated device runtime thread
+
+        // TODO PARSEC_HOOK_RETURN_AGAIN -> PARSEC_HOOK_RETURN_ASYNC when event tracking and task resumption (by this thread) is ready
+        return PARSEC_HOOK_RETURN_AGAIN;
      }
+      else
+#endif // TTG_HAS_COROUTINE
+#endif // 0
+      return PARSEC_HOOK_RETURN_DONE;
     }
 
     template <ttg::ExecutionSpace Space>
-    static void static_op_noarg(parsec_task_t *parsec_task) {
+    static parsec_hook_return_t static_op_noarg(parsec_task_t *parsec_task) {
       task_t *task = static_cast<task_t*>(parsec_task);
-      ttT *baseobj = (ttT *)task->object_ptr;
-      derivedT *obj = (derivedT *)task->object_ptr;
-      assert(parsec_ttg_caller == NULL);
-      parsec_ttg_caller = task;
-      if constexpr (!ttg::meta::is_void_v<keyT>) {
-        baseobj->template op<Space>(task->key, obj->output_terminals);
-      } else if constexpr (ttg::meta::is_void_v<keyT>) {
-        baseobj->template op<Space>(obj->output_terminals);
-      } else
-        abort();
-      parsec_ttg_caller = NULL;
+
+      void* suspended_task_address =
+#ifdef TTG_HAS_COROUTINE
+        task->suspended_task_address;  // non-null = need to resume the task
+#else
+        nullptr;
+#endif
+      if (suspended_task_address == nullptr) {  // task is a coroutine that has not started or an ordinary function
+        ttT *baseobj = (ttT *)task->object_ptr;
+        derivedT *obj = (derivedT *)task->object_ptr;
+        assert(detail::parsec_ttg_caller == NULL);
+        detail::parsec_ttg_caller = task;
+        if constexpr (!ttg::meta::is_void_v<keyT>) {
+          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(task->key, obj->output_terminals));
+        } else if constexpr (ttg::meta::is_void_v<keyT>) {
+          TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(obj->output_terminals));
+        } else  // unreachable
+          ttg::abort();
+        detail::parsec_ttg_caller = NULL;
+      }
+      else {
+#ifdef TTG_HAS_COROUTINE
+        auto ret = static_cast<ttg::resumable_task>(ttg::coroutine_handle<>::from_address(suspended_task_address));
+        assert(ret.ready());
+        ret.resume();
+        if (ret.completed()) {
+          ret.destroy();
+          suspended_task_address = nullptr;
+        }
+        else {  // not yet completed
+          // leave suspended_task_address as is
+        }
+#else
+        ttg::abort();  // should not happen
+#endif
+      }
+      task->suspended_task_address = suspended_task_address;
+
+      if (suspended_task_address) {
+        ttg::abort();  // not yet implemented
+        // see comments in static_op()
+        return PARSEC_HOOK_RETURN_AGAIN;
+      }
+      else
+        return PARSEC_HOOK_RETURN_DONE;
+    }
 
    protected:
@@ -1372,9 +1627,14 @@
     }
 
     template <typename T>
-    uint64_t pack(T &obj, void *bytes, uint64_t pos) {
+    uint64_t pack(T &obj, void *bytes, uint64_t pos, detail::ttg_data_copy_t *copy = nullptr) {
       const ttg_data_descriptor *dObj = ttg::get_data_descriptor<std::decay_t<T>>();
       uint64_t payload_size = dObj->payload_size(&obj);
+      if (copy) {
+        /* reset any tracked data; we only care about the iovecs discovered during
+         * the actual pack below, not those gathered while computing the payload size */
+        copy->iovec_reset();
+      }
+
       if constexpr (!ttg::default_data_descriptor<std::decay_t<T>>::serialize_size_is_const) {
         const ttg_data_descriptor *dSiz = ttg::get_data_descriptor<uint64_t>();
         dSiz->pack_payload(&payload_size, sizeof(uint64_t), pos, bytes);
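With the new copy parameter, pack() does double duty: while the serializer walks the object, the associated copy can record memory regions (iovecs) that should travel via RMA rather than inline, and callers such as set_arg_impl below count them after packing. A schematic caller using only the calls visible in this patch (type names are placeholders, and pack() is a member of the TT, so a real caller would live inside it):

    // Sketch: serialize obj into a staging buffer while letting the associated
    // ttg_data_copy_t record the RMA-able regions (iovecs) it discovers.
    template <typename TT, typename Value>
    uint64_t pack_with_iovecs(TT &tt, Value &obj, unsigned char *bytes,
                              ttg_parsec::detail::ttg_data_copy_t *copy) {
      uint64_t pos = tt.pack(obj, bytes, 0, copy);          // fills copy's iovec list
      auto num_iovecs = std::distance(copy->iovec_begin(),  // how many RMA chunks
                                      copy->iovec_end());
      // ... register each iovec and append its wire record to the message ...
      (void)num_iovecs;
      copy->iovec_reset();                                  // done with the tracked list
      return pos;
    }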
@@ -1398,7 +1658,7 @@
             (obj->*member)(data, size);
           } else {
             // there is no good reason to have negative param ids
-            abort();
+            ttg::abort();
           }
           break;
         }
@@ -1424,7 +1684,7 @@
           break;
         }
         default:
-          abort();
+          ttg::abort();
       }
     }
@@ -1447,19 +1707,19 @@
       // TODO: do we need to copy static_stream_goal in dummy?
 
       /* set the received value as the dummy's only data */
-      dummy->parsec_task.data[0].data_in = copy;
+      dummy->copies[0] = copy;
 
       /* We received the task on this world, so it's using the same taskpool */
       dummy->parsec_task.taskpool = world.impl().taskpool();
 
       /* save the current task and set the dummy task */
-      auto parsec_ttg_caller_save = parsec_ttg_caller;
-      parsec_ttg_caller = dummy;
+      auto parsec_ttg_caller_save = detail::parsec_ttg_caller;
+      detail::parsec_ttg_caller = dummy;
 
       /* iterate over the keys and have them use the copy we made */
       parsec_task_t *task_ring = nullptr;
       for (auto &&key : keylist) {
-        set_arg_local_impl(key, *reinterpret_cast<std::decay_t<valueT> *>(copy->device_private), copy, &task_ring);
+        set_arg_local_impl(key, *reinterpret_cast<std::decay_t<valueT> *>(copy->get_ptr()), copy, &task_ring);
       }
 
       if (nullptr != task_ring) {
@@ -1468,7 +1728,7 @@
       }
 
       /* restore the previous task */
-      parsec_ttg_caller = parsec_ttg_caller_save;
+      detail::parsec_ttg_caller = parsec_ttg_caller_save;
 
       /* release the dummy task */
       complete_task_and_release(es, &dummy->parsec_task);
@@ -1491,7 +1751,9 @@
       msg_t *msg = static_cast<msg_t *>(data);
       if constexpr (!ttg::meta::is_void_v<keyT>) {
         /* unpack the keys */
-        uint64_t pos = 0;
+        /* TODO: can we avoid copying all the keys? */
+        uint64_t pos = msg->tt_id.key_offset;
+        uint64_t key_end_pos;
         std::vector<keyT> keylist;
         int num_keys = msg->tt_id.num_keys;
         keylist.reserve(num_keys);
         for (int k = 0; k < num_keys; ++k) {
           keyT key;
           pos = unpack(key, msg->bytes, pos);
           assert(keymap(key) == rank);
           keylist.push_back(std::move(key));
         }
+        key_end_pos = pos;
+        /* jump back to the beginning of the message to get the value */
+        pos = 0;
         // case 1
         if constexpr (!ttg::meta::is_void_v<valueT>) {
           using decvalueT = std::decay_t<valueT>;
-          if constexpr (!ttg::has_split_metadata<decvalueT>::value) {
-            detail::ttg_data_copy_t *copy = detail::create_new_datacopy(decvalueT{});
-            unpack(*static_cast<decvalueT *>(copy->device_private), msg->bytes, pos);
-
-            set_arg_from_msg_keylist(ttg::span(&keylist[0], num_keys), copy);
-          } else {
-            /* unpack the header and start the RMA transfers */
+          int32_t num_iovecs = msg->tt_id.num_iovecs;
+          detail::ttg_data_copy_t *copy;
+          if constexpr (ttg::has_split_metadata<decvalueT>::value) {
             ttg::SplitMetadataDescriptor<decvalueT> descr;
             using metadata_t = decltype(descr.get_metadata(std::declval<decvalueT>()));
-            size_t metadata_size = sizeof(metadata_t);
 
             /* unpack the metadata */
             metadata_t metadata;
-            std::memcpy(&metadata, msg->bytes + pos, metadata_size);
-            pos += metadata_size;
+            pos = unpack(metadata, msg->bytes, pos);
+
+            copy = detail::create_new_datacopy(descr.create_from_metadata(metadata));
+          } else if constexpr (!ttg::has_split_metadata<decvalueT>::value) {
+            copy = detail::create_new_datacopy(decvalueT{});
+            /* unpack the object, potentially discovering iovecs */
+            pos = unpack(*static_cast<decvalueT *>(copy->get_ptr()), msg->bytes, pos);
+            std::cout << "num_iovecs " << num_iovecs << " distance " << std::distance(copy->iovec_begin(), copy->iovec_end()) << std::endl;
+            assert(std::distance(copy->iovec_begin(), copy->iovec_end()) == num_iovecs);
+          }
 
-            /* unpack the remote rank */
-            int remote;
-            std::memcpy(&remote, msg->bytes + pos, sizeof(remote));
-            pos += sizeof(remote);
+          if (num_iovecs == 0) {
+            set_arg_from_msg_keylist(ttg::span(&keylist[0], num_keys), copy);
+          } else {
+            /* unpack the header and start the RMA transfers */
 
+            /* get the remote rank */
+            int remote = msg->tt_id.sender;
             assert(remote < world.size());
 
-            /* extract the number of chunks */
-            int32_t num_iovecs;
-            std::memcpy(&num_iovecs, msg->bytes + pos, sizeof(num_iovecs));
-            pos += sizeof(num_iovecs);
-
-            detail::ttg_data_copy_t *copy = detail::create_new_datacopy(descr.create_from_metadata(metadata));
             /* nothing else to do if the object is empty */
-            if (0 == num_iovecs) {
-              set_arg_from_msg_keylist(keylist, copy);
-            } else {
-              /* extract the callback tag */
-              parsec_ce_tag_t cbtag;
-              std::memcpy(&cbtag, msg->bytes + pos, sizeof(cbtag));
-              pos += sizeof(cbtag);
-
-              /* create the value from the metadata */
-              auto activation = new detail::rma_delayed_activate(
-                  std::move(keylist), copy, num_iovecs, [this](std::vector<keyT> &&keylist, detail::ttg_data_copy_t *copy) {
-                    set_arg_from_msg_keylist(keylist, copy);
-                    this->world.impl().decrement_inflight_msg();
-                  });
-              auto &val = *static_cast<decvalueT *>(copy->device_private);
-
-              using ActivationT = std::decay_t<decltype(*activation)>;
-
-              int nv = 0;
-              /* process payload iovecs */
-              auto iovecs = descr.get_data(val);
-              /* start the RMA transfers */
-              for (auto &&iov : iovecs) {
-                ++nv;
-                parsec_ce_mem_reg_handle_t rreg;
-                int32_t rreg_size_i;
-                std::memcpy(&rreg_size_i, msg->bytes + pos, sizeof(rreg_size_i));
-                pos += sizeof(rreg_size_i);
-                rreg = static_cast<parsec_ce_mem_reg_handle_t>(msg->bytes + pos);
-                pos += rreg_size_i;
-                // std::intptr_t *fn_ptr = reinterpret_cast<std::intptr_t *>(msg->bytes + pos);
-                // pos += sizeof(*fn_ptr);
-                std::intptr_t fn_ptr;
-                std::memcpy(&fn_ptr, msg->bytes + pos, sizeof(fn_ptr));
-                pos += sizeof(fn_ptr);
-
-                /* register the local memory */
-                parsec_ce_mem_reg_handle_t lreg;
-                size_t lreg_size;
-                parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t,
-                                       iov.num_bytes, &lreg, &lreg_size);
-                world.impl().increment_inflight_msg();
-                /* TODO: PaRSEC should treat the remote callback as a tag, not a function pointer! */
-                parsec_ce.get(&parsec_ce, lreg, 0, rreg, 0, iov.num_bytes, remote,
-                              &detail::get_complete_cb, activation,
-                              /*world.impl().parsec_ttg_rma_tag()*/
-                              cbtag, &fn_ptr, sizeof(std::intptr_t));
-              }
-
-              assert(num_iovecs == nv);
-              assert(size == (pos + sizeof(msg_header_t)));
+            /* extract the callback tag */
+            parsec_ce_tag_t cbtag;
+            std::memcpy(&cbtag, msg->bytes + pos, sizeof(cbtag));
+            pos += sizeof(cbtag);
+
+            /* create the value from the metadata */
+            auto activation = new detail::rma_delayed_activate(
+                std::move(keylist), copy, num_iovecs, [this](std::vector<keyT> &&keylist, detail::ttg_data_copy_t *copy) {
+                  set_arg_from_msg_keylist(keylist, copy);
+                  this->world.impl().decrement_inflight_msg();
+                });
+            auto &val = *static_cast<decvalueT *>(copy->get_ptr());
+
+            using ActivationT = std::decay_t<decltype(*activation)>;
+
+            int nv = 0;
+            /* start the RMA transfers */
+            auto handle_iovecs_fn =
+              [&](auto&& iovecs) {
+                for (auto &&iov : iovecs) {
+                  ++nv;
+                  parsec_ce_mem_reg_handle_t rreg;
+                  int32_t rreg_size_i;
+                  std::memcpy(&rreg_size_i, msg->bytes + pos, sizeof(rreg_size_i));
+                  pos += sizeof(rreg_size_i);
+                  rreg = static_cast<parsec_ce_mem_reg_handle_t>(msg->bytes + pos);
+                  pos += rreg_size_i;
+                  // std::intptr_t *fn_ptr = reinterpret_cast<std::intptr_t *>(msg->bytes + pos);
+                  // pos += sizeof(*fn_ptr);
+                  std::intptr_t fn_ptr;
+                  std::memcpy(&fn_ptr, msg->bytes + pos, sizeof(fn_ptr));
+                  pos += sizeof(fn_ptr);
+
+                  /* register the local memory */
+                  parsec_ce_mem_reg_handle_t lreg;
+                  size_t lreg_size;
+                  parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t,
+                                         iov.num_bytes, &lreg, &lreg_size);
+                  world.impl().increment_inflight_msg();
+                  /* TODO: PaRSEC should treat the remote callback as a tag, not a function pointer! */
+                  std::cout << "set_arg_from_msg: get rreg " << rreg << " remote " << remote << std::endl;
+                  parsec_ce.get(&parsec_ce, lreg, 0, rreg, 0, iov.num_bytes, remote,
+                                &detail::get_complete_cb, activation,
+                                /*world.impl().parsec_ttg_rma_tag()*/
+                                cbtag, &fn_ptr, sizeof(std::intptr_t));
+                }
+              };
+            if constexpr (ttg::has_split_metadata<decvalueT>::value) {
+              ttg::SplitMetadataDescriptor<decvalueT> descr;
+              handle_iovecs_fn(descr.get_data(val));
+            } else if constexpr (!ttg::has_split_metadata<decvalueT>::value) {
+              handle_iovecs_fn(copy->iovec_span());
+              copy->iovec_reset();
             }
+
+            assert(num_iovecs == nv);
+            assert(size == (key_end_pos + sizeof(msg_header_t)));
           }
         // case 2 and 3
         } else if constexpr (!ttg::meta::is_void_v<keyT> && std::is_void_v<valueT>) {
@@ -1604,8 +1873,8 @@
       // case 5 and 6
       } else if constexpr (ttg::meta::is_void_v<keyT> && std::is_void_v<valueT>) {
         set_arg(ttg::Void{});
-      } else {
-        abort();
+      } else {  // unreachable
+        ttg::abort();
       }
     }
@@ -1717,7 +1986,7 @@
           reinterpret_cast<detail::parsec_static_op_t>(&TT::static_op);
       if constexpr (derived_has_cuda_op())
         newtask->function_template_class_ptr[static_cast<int>(ttg::ExecutionSpace::CUDA)] =
-            reinterpret_cast<detail::parsec_static_op_t>(&TT::static_op);
+            reinterpret_cast<detail::parsec_static_op_t>(&TT::device_static_op);
 
       for (int i = 0; i < static_stream_goal.size(); ++i) {
         newtask->stream[i].goal = static_stream_goal[i];
@@ -1790,8 +2059,8 @@
 
       if( world_impl.dag_profiling() ) {
 #if defined(PARSEC_PROF_GRAPHER)
-        if(NULL != parsec_ttg_caller && !parsec_ttg_caller->dummy()) {
-          int orig_index = detail::find_index_of_copy_in_task(parsec_ttg_caller, &value);
+        if(NULL != detail::parsec_ttg_caller && !detail::parsec_ttg_caller->dummy()) {
+          int orig_index = detail::find_index_of_copy_in_task(detail::parsec_ttg_caller, &value);
           char orig_str[32];
           char dest_str[32];
           if(orig_index >= 0) {
@@ -1804,7 +2073,7 @@
                             .flow_index = 0, .flow_datatype_mask = ~0 };
           parsec_flow_t dest{ .name = dest_str, .sym_type = PARSEC_SYM_INOUT, .flow_flags = PARSEC_FLOW_ACCESS_RW,
                               .flow_index = 0, .flow_datatype_mask = ~0 };
-          parsec_prof_grapher_dep(&parsec_ttg_caller->parsec_task, &task->parsec_task, discover_task ? 1 : 0, &orig, &dest);
+          parsec_prof_grapher_dep(&detail::parsec_ttg_caller->parsec_task, &task->parsec_task, discover_task ? 1 : 0, &orig, &dest);
         }
 #endif
       }
@@ -1817,14 +2086,14 @@
       if constexpr (!ttg::meta::is_void_v<valueT>) {  // for data values
         // have a value already? if not, set, otherwise reduce
         detail::ttg_data_copy_t *copy = nullptr;
-        if (nullptr == (copy = static_cast<detail::ttg_data_copy_t *>(task->parsec_task.data[i].data_in))) {
+        if (nullptr == (copy = task->copies[i])) {
           using decay_valueT = std::decay_t<valueT>;
           /* For now, we always create a copy because we cannot rely on the task_release
           * mechanism (it would release the task, not the reduction value). */
           copy = detail::create_new_datacopy(std::forward<Value>(value));
-          task->parsec_task.data[i].data_in = copy;
+          task->copies[i] = copy;
         } else {
-          reducer(*reinterpret_cast<decay_valueT *>(copy->device_private), value);
+          reducer(*reinterpret_cast<decay_valueT *>(copy->get_ptr()), value);
         }
       } else {
         reducer();  // even if this was a control input, must execute the reducer for possible side effects
@@ -1839,14 +2108,14 @@
       } else {
         /* whether the task needs to be deferred or not */
         if constexpr (!valueT_is_Void) {
-          if (nullptr != task->parsec_task.data[i].data_in) {
+          if (nullptr != task->copies[i]) {
             ttg::print_error(get_name(), " : ", key, ": error argument is already set : ", i);
             throw std::logic_error("bad set arg");
           }
 
           detail::ttg_data_copy_t *copy = copy_in;
-          if (nullptr == copy_in && nullptr != parsec_ttg_caller) {
-            copy = detail::find_copy_in_task(parsec_ttg_caller, &value);
+          if (nullptr == copy_in && nullptr != detail::parsec_ttg_caller) {
+            copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
           }
 
           if (nullptr != copy) {
@@ -1855,11 +2124,12 @@
           } else {
             copy = detail::create_new_datacopy(std::forward<Value>(value));
           }
+
           /* if we registered as a writer and were the first to register with this copy
            * we need to defer the release of this task to give other tasks a chance to
            * make a copy of the original data */
-          release = (copy->push_task != &task->parsec_task);
-          task->parsec_task.data[i].data_in = copy;
+          release = (copy->get_next_task() != &task->parsec_task);
+          task->copies[i] = copy;
         }
       }
       task->remove_from_hash = remove_from_hash;
@@ -1947,7 +2217,7 @@
 
     // Used to set the i'th argument
     template <std::size_t i, typename Key, typename Value>
-    void set_arg_impl(const Key &key, Value &&value) {
+    void set_arg_impl(const Key &key, Value &&value, detail::ttg_data_copy_t *copy_in = nullptr) {
       int owner;
 
 #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
@@ -1962,9 +2232,9 @@
        owner = keymap();
       if (owner == world.rank()) {
         if constexpr (!ttg::meta::is_void_v<Key>)
-          set_arg_local(key, std::forward<Value>(value));
+          set_arg_local_impl(key, std::forward<Value>(value), copy_in);
        else
-          set_arg_local(std::forward<Value>(value));
+          set_arg_local_impl(ttg::Void{}, std::forward<Value>(value), copy_in);
 #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
         if(world.impl().profiling()) {
           parsec_profiling_ts_trace(world.impl().parsec_ttg_profile_backend_set_arg_end, 0, 0, NULL);
@@ -1978,44 +2248,24 @@
       using msg_t = detail::msg_t;
       auto &world_impl = world.impl();
       uint64_t pos = 0;
+      int num_iovecs = 0;
       std::unique_ptr<msg_t> msg = std::make_unique<msg_t>(get_instance_id(), world_impl.taskpool()->taskpool_id,
-                                                           msg_header_t::MSG_SET_ARG, i, 1);
+                                                           msg_header_t::MSG_SET_ARG, i, world_impl.rank(), 1);
       using decvalueT = std::decay_t<Value>;
-      /* pack the key */
-      msg->tt_id.num_keys = 0;
-      if constexpr (!ttg::meta::is_void_v<Key>) {
-        pos = pack(key, msg->bytes, pos);
-        msg->tt_id.num_keys = 1;
-      }
       if constexpr (!ttg::meta::is_void_v<decvalueT>) {
-        if constexpr (!ttg::has_split_metadata<decvalueT>::value) {
-          pos = pack(value, msg->bytes, pos);
-        } else {
-          detail::ttg_data_copy_t *copy;
-          copy = detail::find_copy_in_task(parsec_ttg_caller, &value);
+
+        detail::ttg_data_copy_t *copy = copy_in;
+        /* make sure we have a data copy to register with */
+        if (nullptr == copy) {
+          copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
           if (nullptr == copy) {
             // We need to create a copy for this data, as it does not exist yet.
copy = detail::create_new_datacopy(std::forward(value)); } - copy = detail::register_data_copy(copy, nullptr, true); - - ttg::SplitMetadataDescriptor descr; - auto metadata = descr.get_metadata(value); - size_t metadata_size = sizeof(metadata); - /* pack the metadata */ - std::memcpy(msg->bytes + pos, &metadata, metadata_size); - pos += metadata_size; - /* pack the local rank */ - int rank = world.rank(); - std::memcpy(msg->bytes + pos, &rank, sizeof(rank)); - pos += sizeof(rank); - - auto iovecs = descr.get_data(*static_cast(copy->device_private)); + } - int32_t num_iovs = std::distance(std::begin(iovecs), std::end(iovecs)); - std::memcpy(msg->bytes + pos, &num_iovs, sizeof(num_iovs)); - pos += sizeof(num_iovs); + auto handle_iovec_fn = [&](auto&& iovecs){ /* TODO: at the moment, the tag argument to parsec_ce.get() is treated as a * raw function pointer instead of a preregistered AM tag, so play that game. @@ -2029,6 +2279,7 @@ namespace ttg_parsec { * memory layout: [, ...] */ for (auto &&iov : iovecs) { + copy = detail::register_data_copy(copy, nullptr, true); parsec_ce_mem_reg_handle_t lreg; size_t lreg_size; /* TODO: only register once when we can broadcast the data! */ @@ -2041,8 +2292,9 @@ namespace ttg_parsec { int32_t lreg_size_i = lreg_size; std::memcpy(msg->bytes + pos, &lreg_size_i, sizeof(lreg_size_i)); pos += sizeof(lreg_size_i); - std::memcpy(msg->bytes + pos, lreg, lreg_size_i); - pos += lreg_size_i; + std::memcpy(msg->bytes + pos, lreg, lreg_size); + pos += lreg_size; + std::cout << "set_arg_impl lreg " << lreg << std::endl; /* TODO: can we avoid the extra indirection of going through std::function? */ std::function *fn = new std::function([=]() mutable { /* shared_ptr of value and registration captured by value so resetting @@ -2054,12 +2306,44 @@ namespace ttg_parsec { std::memcpy(msg->bytes + pos, &fn_ptr, sizeof(fn_ptr)); pos += sizeof(fn_ptr); } + }; + + if constexpr (ttg::has_split_metadata>::value) { + ttg::SplitMetadataDescriptor descr; + auto iovs = descr.get_data(*const_cast(&value)); + num_iovecs = std::distance(std::begin(iovs), std::end(iovs)); + /* pack the metadata */ + auto metadata = descr.get_metadata(value); + size_t metadata_size = sizeof(metadata); + pos = pack(metadata, msg->bytes, pos); + handle_iovec_fn(iovs); + } else if constexpr (!ttg::has_split_metadata>::value) { + /* serialize the object */ + std::cout << "PRE pack num_iovecs " << std::distance(copy->iovec_begin(), copy->iovec_end()) << std::endl; + pos = pack(value, msg->bytes, pos, copy); + num_iovecs = std::distance(copy->iovec_begin(), copy->iovec_end()); + std::cout << "POST pack num_iovecs " << num_iovecs << std::endl; + /* handle any iovecs contained in it */ + handle_iovec_fn(copy->iovec_span()); + copy->iovec_reset(); } + + msg->tt_id.num_iovecs = num_iovecs; } + + /* pack the key */ + msg->tt_id.num_keys = 0; + msg->tt_id.key_offset = pos; + if constexpr (!ttg::meta::is_void_v) { + size_t tmppos = pack(key, msg->bytes, pos); + pos = tmppos; + msg->tt_id.num_keys = 1; + } + parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); - // std::cout << "Sending AM with " << msg->op_id.num_keys << " keys " << std::endl; + std::cout << "set_arg_impl send_am owner " << owner << " sender " << msg->tt_id.sender << std::endl; parsec_ce.send_am(&parsec_ce, world_impl.parsec_ttg_tag(), owner, static_cast(msg.get()), sizeof(msg_header_t) + pos); #if defined(PARSEC_PROF_TRACE) && 
defined(PARSEC_TTG_PROFILE_BACKEND) @@ -2068,8 +2352,8 @@ namespace ttg_parsec { } #endif #if defined(PARSEC_PROF_GRAPHER) - if(NULL != parsec_ttg_caller && !parsec_ttg_caller->dummy()) { - int orig_index = detail::find_index_of_copy_in_task(parsec_ttg_caller, &value); + if(NULL != detail::parsec_ttg_caller && !detail::parsec_ttg_caller->dummy()) { + int orig_index = detail::find_index_of_copy_in_task(detail::parsec_ttg_caller, &value); char orig_str[32]; char dest_str[32]; if(orig_index >= 0) { @@ -2083,7 +2367,7 @@ namespace ttg_parsec { parsec_flow_t dest{ .name = dest_str, .sym_type = PARSEC_SYM_INOUT, .flow_flags = PARSEC_FLOW_ACCESS_RW, .flow_index = 0, .flow_datatype_mask = ~0 }; task_t *task = create_new_task(key); - parsec_prof_grapher_dep(&parsec_ttg_caller->parsec_task, &task->parsec_task, 0, &orig, &dest); + parsec_prof_grapher_dep(&detail::parsec_ttg_caller->parsec_task, &task->parsec_task, 0, &orig, &dest); delete task; } #endif @@ -2098,8 +2382,8 @@ namespace ttg_parsec { #endif parsec_task_t *task_ring = nullptr; detail::ttg_data_copy_t *copy = nullptr; - if (nullptr != parsec_ttg_caller) { - copy = detail::find_copy_in_task(parsec_ttg_caller, &value); + if (nullptr != detail::parsec_ttg_caller) { + copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value); } for (auto it = begin; it != end; ++it) { @@ -2117,84 +2401,13 @@ namespace ttg_parsec { } template - std::enable_if_t && !std::is_void_v> && - !ttg::has_split_metadata>::value, + std::enable_if_t && !std::is_void_v>, void> broadcast_arg(const ttg::span &keylist, const Value &value) { - auto world = ttg_default_execution_context(); - int rank = world.rank(); - - bool have_remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(), - [&](const Key &key) { return keymap(key) != rank; }); - - if (have_remote) { - std::vector keylist_sorted(keylist.begin(), keylist.end()); - - /* Assuming there are no local keys, will be updated while processing remote keys */ - auto local_begin = keylist_sorted.end(); - auto local_end = keylist_sorted.end(); - - /* sort the input key list by owner and check whether there are remote keys */ - std::sort(keylist_sorted.begin(), keylist_sorted.end(), [&](const Key &a, const Key &b) mutable { - int rank_a = keymap(a); - int rank_b = keymap(b); - return rank_a < rank_b; - }); - - using msg_t = detail::msg_t; - local_begin = keylist_sorted.end(); - auto &world_impl = world.impl(); - std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARG, i); - - parsec_taskpool_t *tp = world_impl.taskpool(); - - for (auto it = keylist_sorted.begin(); it < keylist_sorted.end(); /* increment inline */) { - auto owner = keymap(*it); - if (owner == rank) { - /* make sure we don't lose local keys */ - local_begin = it; - local_end = - std::find_if_not(++it, keylist_sorted.end(), [&](const Key &key) { return keymap(key) == rank; }); - it = local_end; - continue; - } - - /* pack all keys for this owner */ - int num_keys = 0; - uint64_t pos = 0; - do { - ++num_keys; - pos = pack(*it, msg->bytes, pos); - ++it; - } while (it < keylist_sorted.end() && keymap(*it) == owner); - msg->tt_id.num_keys = num_keys; - - /* TODO: use RMA to transfer the value */ - pos = pack(value, msg->bytes, pos); - - /* Send the message */ - tp->tdm.module->outgoing_message_start(tp, owner, NULL); - tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); - parsec_ce.send_am(&parsec_ce, world_impl.parsec_ttg_tag(), owner, static_cast(msg.get()), - 
sizeof(msg_header_t) + pos); - } - /* handle local keys */ - broadcast_arg_local(local_begin, local_end, value); - } else { - /* only local keys */ - broadcast_arg_local(keylist.begin(), keylist.end(), value); - } - } - - template - std::enable_if_t && !std::is_void_v> && - ttg::has_split_metadata>::value, - void> - splitmd_broadcast_arg(const ttg::span &keylist, const Value &value) { using valueT = std::tuple_element_t; auto world = ttg_default_execution_context(); int rank = world.rank(); + uint64_t pos = 0; bool have_remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(), [&](const Key &key) { return keymap(key) != rank; }); @@ -2213,41 +2426,69 @@ namespace ttg_parsec { auto local_begin = keylist_sorted.end(); auto local_end = keylist_sorted.end(); - ttg::SplitMetadataDescriptor descr; - auto iovs = descr.get_data(*const_cast(&value)); - int32_t num_iovs = std::distance(std::begin(iovs), std::end(iovs)); + int32_t num_iovs = 0; + + detail::ttg_data_copy_t *copy; + copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value); + assert(nullptr != copy); + std::vector>> memregs; - memregs.reserve(num_iovs); - - /* register all iovs so the registration can be reused */ - for (auto &&iov : iovs) { - parsec_ce_mem_reg_handle_t lreg; - size_t lreg_size; - parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, - iov.num_bytes, &lreg, &lreg_size); - /* TODO: use a static function for deregistration here? */ - memregs.push_back(std::make_pair(static_cast(lreg_size), - /* TODO: this assumes that parsec_ce_mem_reg_handle_t is void* */ - std::shared_ptr{lreg, [](void *ptr) { - parsec_ce_mem_reg_handle_t memreg = - (parsec_ce_mem_reg_handle_t)ptr; - parsec_ce.mem_unregister(&memreg); - }})); - } + auto register_iovs_fn = [&memregs](auto&& iovs){ + for (auto &&iov : iovs) { + parsec_ce_mem_reg_handle_t lreg; + size_t lreg_size; + parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, + iov.num_bytes, &lreg, &lreg_size); + /* TODO: use a static function for deregistration here? 
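+             * For now each handle is wrapped in a shared_ptr whose deleter calls
+             * mem_unregister, so the registration stays alive until the last
+             * completion callback drops its reference.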
*/ + memregs.push_back(std::make_pair(static_cast(lreg_size), + /* TODO: this assumes that parsec_ce_mem_reg_handle_t is void* */ + std::shared_ptr{lreg, [](void *ptr) { + parsec_ce_mem_reg_handle_t memreg = + (parsec_ce_mem_reg_handle_t)ptr; + std::cout << "broadcast_arg memunreg lreg " << memreg << std::endl; + parsec_ce.mem_unregister(&memreg); + }})); + std::cout << "broadcast_arg memreg lreg " << lreg << std::endl; + } + }; using msg_t = detail::msg_t; auto &world_impl = world.impl(); std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARG, i); - auto metadata = descr.get_metadata(value); - size_t metadata_size = sizeof(metadata); + msg_header_t::MSG_SET_ARG, i, world_impl.rank()); - detail::ttg_data_copy_t *copy; - copy = detail::find_copy_in_task(parsec_ttg_caller, &value); - assert(nullptr != copy); + if constexpr (ttg::has_split_metadata>::value) { + ttg::SplitMetadataDescriptor descr; + auto iovs = descr.get_data(*const_cast(&value)); + num_iovs = std::distance(std::begin(iovs), std::end(iovs)); + memregs.reserve(num_iovs); + register_iovs_fn(iovs); + /* pack the metadata */ + auto metadata = descr.get_metadata(value); + size_t metadata_size = sizeof(metadata); + pos = pack(metadata, msg->bytes, pos); + } else if constexpr (!ttg::has_split_metadata>::value) { + /* serialize the object once */ + pos = pack(value, msg->bytes, pos, copy); + num_iovs = std::distance(copy->iovec_begin(), copy->iovec_end()); + register_iovs_fn(copy->iovec_span()); + copy->iovec_reset(); + } + + /* TODO: at the moment, the tag argument to parsec_ce.get() is treated as a + * raw function pointer instead of a preregistered AM tag, so play that game. + * Once this is fixed in PaRSEC we need to use parsec_ttg_rma_tag instead! */ + parsec_ce_tag_t cbtag = reinterpret_cast(&detail::get_remote_complete_cb); + std::memcpy(msg->bytes + pos, &cbtag, sizeof(cbtag)); + pos += sizeof(cbtag); + + msg->tt_id.num_iovecs = num_iovs; + + std::size_t save_pos = pos; parsec_taskpool_t *tp = world_impl.taskpool(); for (auto it = keylist_sorted.begin(); it < keylist_sorted.end(); /* increment done inline */) { + auto owner = keymap(*it); if (owner == rank) { local_begin = it; @@ -2258,41 +2499,14 @@ namespace ttg_parsec { continue; } - /* count keys and set it afterwards */ - uint64_t pos = 0; - /* pack all keys for this owner */ - int num_keys = 0; - do { - ++num_keys; - pos = pack(*it, msg->bytes, pos); - ++it; - } while (it < keylist_sorted.end() && keymap(*it) == owner); - msg->tt_id.num_keys = num_keys; - - /* pack the metadata */ - std::memcpy(msg->bytes + pos, &metadata, metadata_size); - pos += metadata_size; - /* pack the local rank */ - int rank = world.rank(); - std::memcpy(msg->bytes + pos, &rank, sizeof(rank)); - pos += sizeof(rank); - /* pack the number of iovecs */ - std::memcpy(msg->bytes + pos, &num_iovs, sizeof(num_iovs)); - pos += sizeof(num_iovs); - - /* TODO: at the moment, the tag argument to parsec_ce.get() is treated as a - * raw function pointer instead of a preregistered AM tag, so play that game. - * Once this is fixed in PaRSEC we need to use parsec_ttg_rma_tag instead! */ - parsec_ce_tag_t cbtag = reinterpret_cast(&detail::get_remote_complete_cb); - std::memcpy(msg->bytes + pos, &cbtag, sizeof(cbtag)); - pos += sizeof(cbtag); - + /* rewind the buffer and start packing a new set of memregs and keys */ + pos = save_pos; /** * pack the registration handles * memory layout: [, ...] 
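         *   where each record is packed back to back without padding:
         *     int32_t       lreg_size;        // size of the opaque registration handle
         *     char          lreg[lreg_size];  // the parsec_ce_mem_reg_handle_t bytes
         *     std::intptr_t fn_ptr;           // std::function<void()>* invoked once the RMA transfer completed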
+ * NOTE: we need to pack these for every receiver to ensure correct ref-counting of the registration */ - int idx = 0; - for (auto &&iov : iovs) { + for (int idx = 0; idx < num_iovs; ++idx) { // auto [lreg_size, lreg_ptr] = memregs[idx]; int32_t lreg_size; std::shared_ptr lreg_ptr; @@ -2301,23 +2515,36 @@ namespace ttg_parsec { pos += sizeof(lreg_size); std::memcpy(msg->bytes + pos, lreg_ptr.get(), lreg_size); pos += lreg_size; - /* create a function that will be invoked upon RMA completion at the target */ - std::shared_ptr lreg_ptr_v = lreg_ptr; + std::cout << "broadcast_arg lreg_ptr " << lreg_ptr.get() << std::endl; /* mark another reader on the copy */ copy = detail::register_data_copy(copy, nullptr, true); + /* create a function that will be invoked upon RMA completion at the target */ std::function *fn = new std::function([=]() mutable { /* shared_ptr of value and registration captured by value so resetting - * them here will eventually release the memory/registration */ + * them here will eventually release the memory/registration */ detail::release_data_copy(copy); - lreg_ptr_v.reset(); + lreg_ptr.reset(); }); std::intptr_t fn_ptr{reinterpret_cast(fn)}; std::memcpy(msg->bytes + pos, &fn_ptr, sizeof(fn_ptr)); pos += sizeof(fn_ptr); - ++idx; } + + /* mark the beginning of the keys */ + msg->tt_id.key_offset = pos; + + /* pack all keys for this owner */ + int num_keys = 0; + do { + ++num_keys; + pos = pack(*it, msg->bytes, pos); + ++it; + } while (it < keylist_sorted.end() && keymap(*it) == owner); + msg->tt_id.num_keys = num_keys; + tp->tdm.module->outgoing_message_start(tp, owner, NULL); tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); + std::cout << "broadcast_arg send_am owner " << owner << std::endl; parsec_ce.send_am(&parsec_ce, world_impl.parsec_ttg_tag(), owner, static_cast(msg.get()), sizeof(msg_header_t) + pos); } @@ -2406,10 +2633,10 @@ namespace ttg_parsec { auto &world_impl = world.impl(); uint64_t pos = 0; std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARGSTREAM_SIZE, i, 1); + msg_header_t::MSG_SET_ARGSTREAM_SIZE, i, + world_impl.rank(), 1); /* pack the key */ pos = pack(key, msg->bytes, pos); - msg->tt_id.num_keys = 1; pos = pack(size, msg->bytes, pos); parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); @@ -2461,9 +2688,8 @@ namespace ttg_parsec { auto &world_impl = world.impl(); uint64_t pos = 0; std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARGSTREAM_SIZE, i, 1); - /* pack the key */ - msg->tt_id.num_keys = 0; + msg_header_t::MSG_SET_ARGSTREAM_SIZE, i, + world_impl.rank(), 0); pos = pack(size, msg->bytes, pos); parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); @@ -2513,10 +2739,10 @@ namespace ttg_parsec { auto &world_impl = world.impl(); uint64_t pos = 0; std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE, i, 1); + msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE, i, + world_impl.rank(), 1); /* pack the key */ pos = pack(key, msg->bytes, pos); - msg->tt_id.num_keys = 1; parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); @@ -2559,8 +2785,8 @@ namespace ttg_parsec { auto &world_impl = world.impl(); uint64_t pos = 
0;
        std::unique_ptr<msg_t> msg = std::make_unique<msg_t>(get_instance_id(), world_impl.taskpool()->taskpool_id,
-                                                             msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE, i, 1);
-        msg->tt_id.num_keys = 0;
+                                                             msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE, i,
+                                                             world_impl.rank(), 0);
        parsec_taskpool_t *tp = world_impl.taskpool();
        tp->tdm.module->outgoing_message_start(tp, owner, NULL);
        tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0);
@@ -2589,6 +2815,84 @@ namespace ttg_parsec {
      }
    }

+    void copy_mark_pushout(detail::ttg_data_copy_t *copy) {
+      assert(detail::parsec_ttg_caller->dev_ptr && detail::parsec_ttg_caller->dev_ptr->gpu_task);
+      parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task;
+      for (auto data : *copy) {
+        if (data->owner_device != 0 &&
+            data->device_copies[0]->version < data->device_copies[data->owner_device]->version) {
+          /* find the flow */
+          int flowidx = 0;
+          while (flowidx < MAX_PARAM_COUNT &&
+                 gpu_task->flow[flowidx]->flow_flags != PARSEC_FLOW_ACCESS_NONE) {
+            if (detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in->original == data) {
+              /* found the right data, set the corresponding flow as pushout */
+              break;
+            }
+            ++flowidx;
+          }
+          if (flowidx == MAX_PARAM_COUNT) {
+            throw std::runtime_error("Cannot add more than MAX_PARAM_COUNT flows to a task!");
+          }
+          if (gpu_task->flow[flowidx]->flow_flags == PARSEC_FLOW_ACCESS_NONE) {
+            /* no flow found, add one and mark it pushout */
+            detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in = data->device_copies[0];
+            ((parsec_flow_t *)gpu_task->flow[flowidx])->flow_flags |= PARSEC_FLOW_ACCESS_RW;
+            gpu_task->flow_nb_elts[flowidx] = data->nb_elts;
+          }
+          gpu_task->pushout |= 1 << flowidx;
+        }
+      }
+    }
+
+    template <std::size_t i, typename Key, typename Value>
+    std::enable_if_t<!ttg::meta::is_void_v<Key> && !std::is_void_v<std::decay_t<Value>>,
+                     void>
+    prepare_send(const ttg::span<const Key> &keylist, const Value &value) {
+      using valueT = std::tuple_element_t<i, input_values_full_tuple_type>;
+
+      /* get the copy */
+      detail::ttg_data_copy_t *copy;
+      copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
+      assert(nullptr != copy);
+
+      /* a device task needs a pushout only if it has non-local successors */
+      bool have_remote = false;
+      if constexpr (derived_has_cuda_op()) {
+        auto world = ttg_default_execution_context();
+        int rank = world.rank();
+        have_remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(),
+                                                    [&](const Key &key) { return keymap(key) != rank; });
+      }
+      if (!derived_has_cuda_op() || have_remote) {
+        /* host task, or device task with remote successors: make sure the
+         * up-to-date data is pushed back out to the host copy */
+        copy_mark_pushout(copy);
+      }
+    }
+
+    template <std::size_t i, typename Key, typename Value>
+    std::enable_if_t<ttg::meta::is_void_v<Key> && !std::is_void_v<std::decay_t<Value>>,
+                     void>
+    prepare_send(const Value &value) {
+      using valueT = std::tuple_element_t<i, input_values_full_tuple_type>;
+
+      /* get the copy */
+      detail::ttg_data_copy_t *copy;
+      copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
+      assert(nullptr != copy);
+
+      /* a device task needs a pushout only if it has a non-local successor */
+      auto world = ttg_default_execution_context();
+      int rank = world.rank();
+      bool have_remote = (keymap() != rank);
+      if (!derived_has_cuda_op() || have_remote) {
+        /* host task, or device task with a remote successor: make sure the
+         * up-to-date data is pushed back out to the host copy */
+        copy_mark_pushout(copy);
+      }
+    }
+
   private:
    // Copy/assign/move forbidden ... we could make it work using
    // PIMPL for this base class.  However, this instance of the base
@@ -2622,15 +2926,15 @@ namespace ttg_parsec {
          set_arg(key, value); };
        auto broadcast_callback = [this](const ttg::span<const keyT> &keylist, const valueT &value) {
-          if constexpr (ttg::has_split_metadata<std::decay_t<valueT>>::value) {
-            splitmd_broadcast_arg(keylist, value);
-          } else {
            broadcast_arg(keylist, value);
-          }
+        };
+        auto prepare_send_callback = [this](const ttg::span<const keyT> &keylist, const valueT &value) {
+          prepare_send<i>(keylist, value);
        };
        auto setsize_callback = [this](const keyT &key, std::size_t size) { set_argstream_size(key, size); };
        auto finalize_callback = [this](const keyT &key) { finalize_argstream(key); };
-        input.set_callback(send_callback, move_callback, broadcast_callback, setsize_callback, finalize_callback);
+        input.set_callback(send_callback, move_callback, broadcast_callback,
+                           setsize_callback, finalize_callback, prepare_send_callback);
      }
      //////////////////////////////////////////////////////////////////
      // case 2: nonvoid key, void value, mixed inputs
@@ -2653,7 +2957,10 @@ namespace ttg_parsec {
        auto send_callback = [this](const valueT &value) { set_arg(value); };
        auto setsize_callback = [this](std::size_t size) { set_argstream_size(size); };
        auto finalize_callback = [this]() { finalize_argstream(); };
-        input.set_callback(send_callback, move_callback, {}, setsize_callback, finalize_callback);
+        auto prepare_send_callback = [this](const valueT &value) {
+          prepare_send<i>(value);
+        };
+        input.set_callback(send_callback, move_callback, {}, setsize_callback, finalize_callback, prepare_send_callback);
      }
      //////////////////////////////////////////////////////////////////
      // case 5: void key, void value, mixed inputs
@@ -2669,7 +2976,7 @@ namespace ttg_parsec {
      // NOTE: subsumed in case 5 above, kept for historical reasons
      //////////////////////////////////////////////////////////////////
      else
-        abort();
+        ttg::abort();
    }

    template
@@ -2775,17 +3082,62 @@ namespace ttg_parsec {
      return buffer;
    }

+#if defined(PARSEC_PROF_TRACE)
+    static void *parsec_ttg_task_info(void *dst, const void *data, size_t size)
+    {
+      const parsec_task_t *t = reinterpret_cast<const parsec_task_t *>(data);
+
+      if constexpr (ttg::meta::is_void_v<keyT>) {
+        snprintf(reinterpret_cast<char *>(dst), size, "()");
+      } else {
+        // we use the locals array as a scratchpad to store the hash of the key and its actual address
+        // locals[0] and locals[1] hold the hash, while locals[2] and locals[3] hold the key pointer
+        keyT *key = *(keyT**)&(t->locals[2]);
+        std::stringstream ss;
+        ss << *key;
+
+        std::string keystr = ss.str();
+        snprintf(reinterpret_cast<char *>(dst), size, "%s", keystr.c_str());
+      }
+      return dst;
+    }
+#endif
+
    parsec_key_fn_t tasks_hash_fcts = {key_equal, key_print, key_hash};

-    static parsec_hook_return_t complete_task_and_release(parsec_execution_stream_t *es, parsec_task_t *t) {
+    static parsec_hook_return_t complete_task_and_release(parsec_execution_stream_t *es, parsec_task_t *parsec_task) {
      parsec_execution_stream_t *safe_es = parsec_ttg_es;
      parsec_ttg_es = es;
-      auto *task = (detail::parsec_ttg_task_base_t *)t;
+
+      //std::cout << "complete_task_and_release: task " << parsec_task << std::endl;
+
+      task_t *task = (task_t*)parsec_task;
+
+      /* if we still have a coroutine handle we invoke it one more time to get the sends/broadcasts */
+      if (task->suspended_task_address) {
+        // get the device task from the coroutine handle
+        auto dev_task = ttg::device_task_handle_type::from_address(task->suspended_task_address);
+
+        // get the promise which contains the views
+        auto dev_data = dev_task.promise();
+
+        /* for now make sure we're
waiting for the kernel to complete and the coro hasn't skipped this step */ + assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_KERNEL); + + /* the kernel has completed, resume the coroutine once again to get to the send stage */ + /* TODO: how can we get the execution space here? */ + static_op(parsec_task); + + /* the coroutine should have completed and we cannot access the promise anymore */ + task->suspended_task_address = nullptr; + } + + /* release our data copies */ for (int i = 0; i < task->data_count; i++) { - detail::ttg_data_copy_t *copy = static_cast(task->parsec_task.data[i].data_in); + detail::ttg_data_copy_t *copy = task->copies[i]; if (nullptr == copy) continue; detail::release_data_copy(copy); - task->parsec_task.data[i].data_in = nullptr; + task->copies[i] = nullptr; } parsec_ttg_es = safe_es; return PARSEC_HOOK_RETURN_DONE; @@ -2828,7 +3180,9 @@ namespace ttg_parsec { self.task_class_id = get_instance_id(); self.nb_parameters = 0; self.nb_locals = 0; - self.nb_flows = numflows; + //self.nb_flows = numflows; + self.nb_flows = MAX_PARAM_COUNT; // we're not using all flows but have to + // trick the device handler into looking at all of them if( world_impl.profiling() ) { // first two ints are used to store the hash of the key. @@ -2849,9 +3203,14 @@ namespace ttg_parsec { self.key_functions = &tasks_hash_fcts; self.task_snprintf = parsec_ttg_task_snprintf; +#if defined(PARSEC_PROF_TRACE) + self.profile_info = &parsec_ttg_task_info; +#endif + world_impl.taskpool()->nb_task_classes = std::max(world_impl.taskpool()->nb_task_classes, static_castnb_task_classes)>(self.task_class_id+1)); // function_id_to_instance[self.task_class_id] = this; - + //self.incarnations = incarnations_array.data(); +//#if 0 if constexpr (derived_has_cuda_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CUDA; @@ -2872,11 +3231,12 @@ namespace ttg_parsec { ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; ((__parsec_chore_t *)self.incarnations)[1].hook = NULL; } +//#endif // 0 self.release_task = &parsec_release_task_to_mempool_update_nbtasks; self.complete_execution = complete_task_and_release; - for (i = 0; i < numins; i++) { + for (i = 0; i < MAX_PARAM_COUNT; i++) { parsec_flow_t *flow = new parsec_flow_t; flow->name = strdup((std::string("flow in") + std::to_string(i)).c_str()); flow->sym_type = PARSEC_SYM_INOUT; @@ -2885,13 +3245,13 @@ namespace ttg_parsec { flow->dep_in[0] = NULL; flow->dep_out[0] = NULL; flow->flow_index = i; - flow->flow_datatype_mask = (1 << i); + flow->flow_datatype_mask = ~0; *((parsec_flow_t **)&(self.in[i])) = flow; } - *((parsec_flow_t **)&(self.in[i])) = NULL; - initialize_flows(self.in); + //*((parsec_flow_t **)&(self.in[i])) = NULL; + //initialize_flows(self.in); - for (i = 0; i < numouts; i++) { + for (i = 0; i < MAX_PARAM_COUNT; i++) { parsec_flow_t *flow = new parsec_flow_t; flow->name = strdup((std::string("flow out") + std::to_string(i)).c_str()); flow->sym_type = PARSEC_SYM_INOUT; @@ -2902,7 +3262,7 @@ namespace ttg_parsec { flow->flow_datatype_mask = (1 << i); *((parsec_flow_t **)&(self.out[i])) = flow; } - *((parsec_flow_t **)&(self.out[i])) = NULL; + //*((parsec_flow_t **)&(self.out[i])) = NULL; self.flags = 0; self.dependencies_goal = numins; /* (~(uint32_t)0) >> (32 - numins); */ @@ -3085,6 +3445,49 @@ namespace ttg_parsec { TTBase::invoke(); } + private: + template + void invoke_arglist(std::index_sequence, const Key& key, Arg&& arg, Args&&... 
args) {
+      using arg_type = std::decay_t<Arg>;
+      if constexpr (ttg::detail::is_ptr_v<arg_type>) {
+        /* add a reference to the object */
+        auto copy = ttg_parsec::detail::get_copy(arg);
+        copy->add_ref();
+        /* reset readers so that the value can flow without copying */
+        copy->reset_readers();
+        auto& val = *arg;
+        set_arg_impl<I>(key, val, copy);
+        ttg_parsec::detail::release_data_copy(copy);
+        if constexpr (std::is_rvalue_reference_v<Arg&&>) {
+          /* if the ptr was moved in we reset it */
+          arg.reset();
+        }
+      } else if constexpr (!ttg::detail::is_ptr_v<arg_type>) {
+        set_arg<I>(key, std::forward<Arg>(arg));
+      }
+      if constexpr (sizeof...(Is) > 0) {
+        /* recurse to the next argument */
+        invoke_arglist(std::index_sequence<Is...>{}, key, std::forward<Args>(args)...);
+      }
+    }
+
+   public:
+    // Manual injection of a task with all input arguments specified as variadic arguments
+    template <typename Key = keyT, typename Arg, typename... Args>
+    std::enable_if_t<!ttg::meta::is_void_v<Key> && !ttg::meta::is_empty_tuple_v<input_values_tuple_type>, void> invoke(
+        const Key &key, Arg&& arg, Args&&... args) {
+      static_assert(sizeof...(Args)+1 == std::tuple_size_v<actual_input_tuple_type>,
+                    "Number of arguments to invoke must match the number of task inputs.");
+      TTG_OP_ASSERT_EXECUTABLE();
+      /* trigger non-void inputs */
+      invoke_arglist(ttg::meta::nonvoid_index_seq{}, key,
+                     std::forward<Arg>(arg), std::forward<Args>(args)...);
+      //set_args(ttg::meta::nonvoid_index_seq{}, key, args);
+      /* trigger void inputs */
+      using void_index_seq = ttg::meta::void_index_seq;
+      set_args(void_index_seq{}, key, ttg::detail::make_void_tuple());
+    }
+
    void set_defer_writer(bool value) {
      m_defer_writer = value;
    }
@@ -3164,6 +3567,71 @@ namespace ttg_parsec {

#include "ttg/make_tt.h"

+  namespace device {
+
+    class DeviceAllocator {
+    private:
+      int ttg_did, parsec_did;
+      struct ::zone_malloc_s *zone;
+      ::ttg::ExecutionSpace exec_space;
+    public:
+      DeviceAllocator(int did);
+      void *allocate(std::size_t size);
+      void free(void *ptr);
+      ::ttg::ExecutionSpace executionSpace();
+    };
+
+    DeviceAllocator::DeviceAllocator(int did) : ttg_did(-1), parsec_did(-1), zone(nullptr), exec_space(::ttg::ExecutionSpace::Invalid) {
+      for(int i = 0; i < parsec_nb_devices; i++) {
+        parsec_device_module_t *m = parsec_mca_device_get(i);
+        if(m->type == PARSEC_DEV_CPU || m->type == PARSEC_DEV_CUDA) {
+          if(did == 0) {
+            parsec_did = i;
+            ttg_did = did;
+            if(m->type == PARSEC_DEV_CUDA) {
+              parsec_device_gpu_module_t *gm = reinterpret_cast<parsec_device_gpu_module_t *>(m);
+              zone = gm->memory;
+              exec_space = ::ttg::ExecutionSpace::CUDA;
+            } else {
+              exec_space = ::ttg::ExecutionSpace::Host;
+            }
+            return;
+          }
+          did--;
+        }
+      }
+      throw std::out_of_range("Device identifier is out of range");
+    }
+
+    void *DeviceAllocator::allocate(std::size_t size) {
+      if(nullptr == zone) return malloc(size);
+      return zone_malloc(zone, size);
+    }
+
+    void DeviceAllocator::free(void *ptr) {
+      if(nullptr == zone) {
+        /* no zone: the memory came from plain malloc(); call the global
+         * ::free(), the unqualified name would recurse into this member */
+        ::free(ptr);
+        return;
+      }
+      zone_free(zone, ptr);
+    }
+
+    ::ttg::ExecutionSpace DeviceAllocator::executionSpace() {
+      return exec_space;
+    }
+
+    std::size_t nb_devices() {
+      std::size_t nb = 0;
+      for(int i = 0; i < parsec_nb_devices; i++) {
+        parsec_device_module_t *m = parsec_mca_device_get(i);
+        if(m->type == PARSEC_DEV_CPU || m->type == PARSEC_DEV_CUDA) {
+          nb++;
+        }
+      }
+      return nb;
+    }
+  } // namespace ttg_parsec::device
+
} // namespace ttg_parsec

/**
@@ -3177,30 +3645,49 @@ struct ttg::detail::value_copy_handler {
  ttg_parsec::detail::ttg_data_copy_t *copy_to_remove = nullptr;

 public:
+  value_copy_handler() = default;
+  value_copy_handler(const value_copy_handler& h) = delete;
+  value_copy_handler(value_copy_handler&& h)
+  : copy_to_remove(h.copy_to_remove)
+  {
+    h.copy_to_remove =
nullptr; + } + + value_copy_handler& operator=(const value_copy_handler& h) = delete; + value_copy_handler& operator=(value_copy_handler&& h) + { + std::swap(copy_to_remove, h.copy_to_remove); + return *this; + } + ~value_copy_handler() { if (nullptr != copy_to_remove) { - ttg_parsec::detail::remove_data_copy(copy_to_remove, parsec_ttg_caller); + ttg_parsec::detail::remove_data_copy(copy_to_remove, ttg_parsec::detail::parsec_ttg_caller); ttg_parsec::detail::release_data_copy(copy_to_remove); } } template inline Value &&operator()(Value &&value) { - if (nullptr == parsec_ttg_caller) { + static_assert(std::is_rvalue_reference_v || + std::is_copy_constructible_v>, + "Data sent without being moved must be copy-constructible!"); + if (nullptr == ttg_parsec::detail::parsec_ttg_caller) { ttg::print("ERROR: ttg_send or ttg_broadcast called outside of a task!\n"); } + using value_type = std::remove_reference_t; ttg_parsec::detail::ttg_data_copy_t *copy; - copy = ttg_parsec::detail::find_copy_in_task(parsec_ttg_caller, &value); - Value *value_ptr = &value; + copy = ttg_parsec::detail::find_copy_in_task(ttg_parsec::detail::parsec_ttg_caller, &value); + value_type *value_ptr = &value; if (nullptr == copy) { /** * the value is not known, create a copy that we can track * depending on Value, this uses either the copy or move constructor */ copy = ttg_parsec::detail::create_new_datacopy(std::forward(value)); - bool inserted = ttg_parsec::detail::add_copy_to_task(copy, parsec_ttg_caller); + bool inserted = ttg_parsec::detail::add_copy_to_task(copy, ttg_parsec::detail::parsec_ttg_caller); assert(inserted); - value_ptr = reinterpret_cast(copy->device_private); + value_ptr = reinterpret_cast(copy->get_ptr()); copy_to_remove = copy; } else { /* this copy won't be modified anymore so mark it as read-only */ @@ -3211,11 +3698,13 @@ struct ttg::detail::value_copy_handler { template inline const Value &operator()(const Value &value) { - if (nullptr == parsec_ttg_caller) { + static_assert(std::is_copy_constructible_v>, + "Data sent without being moved must be copy-constructible!"); + if (nullptr == ttg_parsec::detail::parsec_ttg_caller) { ttg::print("ERROR: ttg_send or ttg_broadcast called outside of a task!\n"); } ttg_parsec::detail::ttg_data_copy_t *copy; - copy = ttg_parsec::detail::find_copy_in_task(parsec_ttg_caller, &value); + copy = ttg_parsec::detail::find_copy_in_task(ttg_parsec::detail::parsec_ttg_caller, &value); const Value *value_ptr = &value; if (nullptr == copy) { /** @@ -3223,30 +3712,14 @@ struct ttg::detail::value_copy_handler { * depending on Value, this uses either the copy or move constructor */ copy = ttg_parsec::detail::create_new_datacopy(value); - bool inserted = ttg_parsec::detail::add_copy_to_task(copy, parsec_ttg_caller); + bool inserted = ttg_parsec::detail::add_copy_to_task(copy, ttg_parsec::detail::parsec_ttg_caller); assert(inserted); - value_ptr = reinterpret_cast(copy->device_private); + value_ptr = reinterpret_cast(copy->get_ptr()); copy_to_remove = copy; } return *value_ptr; } - /* we have to make a copy of non-const data as the user may modify it after - * send/broadcast */ - template >> - inline Value &operator()(Value &value) { - if (nullptr == parsec_ttg_caller) { - ttg::print("ERROR: ttg_send or ttg_broadcast called outside of a task!\n"); - } - /* the value is not known, create a copy that we can track */ - ttg_parsec::detail::ttg_data_copy_t *copy; - copy = ttg_parsec::detail::create_new_datacopy(value); - bool inserted = ttg_parsec::detail::add_copy_to_task(copy, 
parsec_ttg_caller); - assert(inserted); - Value *value_ptr = reinterpret_cast(copy->device_private); - copy_to_remove = copy; - return *value_ptr; - } }; #endif // PARSEC_TTG_H_INCLUDED diff --git a/ttg/ttg/parsec/ttg_data_copy.h b/ttg/ttg/parsec/ttg_data_copy.h index 461984e3d2..e97184b287 100644 --- a/ttg/ttg/parsec/ttg_data_copy.h +++ b/ttg/ttg/parsec/ttg_data_copy.h @@ -3,36 +3,108 @@ #include #include +#include +#include +#include +#include #include +#include "ttg/parsec/thread_local.h" +#include "ttg/util/span.h" + namespace ttg_parsec { namespace detail { - /* Extension of PaRSEC's data copy. Note that we use the readers field - * to facilitate the ref-counting of the data copy. - * TODO: create abstractions for all fields in parsec_data_copy_t that we access. - */ - struct ttg_data_copy_t : public parsec_data_copy_t { -#if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) - int64_t size; - int64_t uid; -#endif + template + struct ttg_data_copy_container_setter { + ttg_data_copy_container_setter(T* ptr) { + /* set the container ptr here, will be reset in the the ttg_data_value_copy_t ctor */ + ttg_data_copy_container() = ptr; + } + }; + + /* Non-owning copy-tracking wrapper, accounting for N readers or 1 writer. + * Also counts external references, which are not treated as + * readers or writers but merely prevent the object from being + * destroyed once no readers/writers exist. + */ + struct ttg_data_copy_t : private ttg_data_copy_container_setter { /* special value assigned to parsec_data_copy_t::readers to mark the copy as * mutable, i.e., a task will modify it */ static constexpr int mutable_tag = std::numeric_limits::min(); + ttg_data_copy_t() + : ttg_data_copy_container_setter(this) + { } + + ttg_data_copy_t(const ttg_data_copy_t& c) + : ttg_data_copy_container_setter(this) + { + /* we allow copying but do not copy any data over from the original + * device copies will have to be allocated again + * and it's a new object to reference */ + } + + ttg_data_copy_t(ttg_data_copy_t&& c) + : ttg_data_copy_container_setter(this) + , m_ptr(c.m_ptr) + , m_next_task(c.m_next_task) + , m_readers(c.m_readers) + , m_refs(c.m_refs.load(std::memory_order_relaxed)) + , m_dev_data(std::move(c.m_dev_data)) + , m_single_dev_data(c.m_single_dev_data) + , m_num_dev_data(c.m_num_dev_data) + { + c.m_num_dev_data = 0; + c.m_readers = 0; + c.m_single_dev_data = nullptr; + } + + ttg_data_copy_t& operator=(ttg_data_copy_t&& c) + { + m_ptr = c.m_ptr; + c.m_ptr = nullptr; + m_next_task = c.m_next_task; + c.m_next_task = nullptr; + m_readers = c.m_readers; + c.m_readers = 0; + m_refs.store(c.m_refs.load(std::memory_order_relaxed), std::memory_order_relaxed); + c.m_refs.store(0, std::memory_order_relaxed); + m_dev_data = std::move(c.m_dev_data); + m_single_dev_data = c.m_single_dev_data; + c.m_single_dev_data = nullptr; + m_num_dev_data = c.m_num_dev_data; + c.m_num_dev_data = 0; + /* set the container ptr here, will be reset in the the ttg_data_value_copy_t ctor */ + ttg_data_copy_container() = this; + return *this; + } + + ttg_data_copy_t& operator=(const ttg_data_copy_t& c) { + /* we allow copying but do not copy any data over from the original + * device copies will have to be allocated again + * and it's a new object to reference */ + + /* set the container ptr here, will be reset in the the ttg_data_value_copy_t ctor */ + ttg_data_copy_container() = this; + return *this; + } + + /* mark destructor as virtual */ + virtual ~ttg_data_copy_t() = default; + /* Returns true if the copy is 
mutable */
    bool is_mutable() const {
-      return this->readers == mutable_tag;
+      return m_readers == mutable_tag;
    }

    /* Mark the copy as mutable */
    void mark_mutable() {
-      this->readers = mutable_tag;
+      m_readers = mutable_tag;
    }

    /* Increment the reader counter and return previous value
@@ -41,9 +113,11 @@
    template <bool Atomic = true>
    int increment_readers() {
      if constexpr(Atomic) {
-        return parsec_atomic_fetch_inc_int32(&this->readers);
+        //return parsec_atomic_fetch_inc_int32(&m_readers);
+        std::atomic_ref<int32_t> a{m_readers};
+        return a.fetch_add(1, std::memory_order_relaxed);
      } else {
-        return this->readers++;
+        return m_readers++;
      }
    }

@@ -51,7 +125,7 @@
     * Reset the number of readers to read-only with a single reader.
     */
    void reset_readers() {
-      this->readers = 1;
+      m_readers = 1;
    }

    /* Decrement the reader counter and return previous value.
@@ -60,28 +134,150 @@
    template <bool Atomic = true>
    int decrement_readers() {
      if constexpr(Atomic) {
-        return parsec_atomic_fetch_dec_int32(&this->readers);
+        //return parsec_atomic_fetch_dec_int32(&m_readers);
+        std::atomic_ref<int32_t> a{m_readers};
+        return a.fetch_sub(1, std::memory_order_relaxed);
      } else {
-        return this->readers--;
+        return m_readers--;
      }
    }

    /* Returns the number of readers if the copy is immutable, or \c mutable_tag
     * if the copy is mutable */
    int num_readers() const {
-      return this->readers;
+      return m_readers;
    }

-    ttg_data_copy_t()
-    {
-      /* TODO: do we need this construction? */
-      PARSEC_OBJ_CONSTRUCT(this, parsec_data_copy_t);
-      this->readers = 1;
-      this->push_task = nullptr;
+    void *get_ptr() const {
+      return m_ptr;
    }

-    /* mark destructor as virtual */
-    virtual ~ttg_data_copy_t() = default;
+    parsec_task_t* get_next_task() const {
+      return m_next_task;
+    }
+
+    void set_next_task(parsec_task_t* task) {
+      m_next_task = task;
+    }
+
+    int32_t add_ref() {
+      return m_refs.fetch_add(1, std::memory_order_relaxed);
+    }
+
+    int32_t drop_ref() {
+      return m_refs.fetch_sub(1, std::memory_order_relaxed);
+    }
+
+    bool has_ref() {
+      return (m_refs.load(std::memory_order_relaxed) != 0);
+    }
+
+    int32_t num_ref() const {
+      return m_refs.load(std::memory_order_relaxed);
+    }
+
+    /* manage device copies owned by this object
+     * we only touch the vector if we have more than one copy to track
+     * and otherwise use the single-element member.
+     */
+    using iterator = parsec_data_t**;
+
+    void add_device_data(parsec_data_t* data) {
+      // TODO: properly release again!
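+      /* retain the parsec_data_t so PaRSEC cannot destroy it while this copy
+       * still tracks it; the matching PARSEC_OBJ_RELEASE is what the TODO
+       * above refers to */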
+      PARSEC_OBJ_RETAIN(data);
+      switch (m_num_dev_data) {
+      case 0:
+        m_single_dev_data = data;
+        break;
+      case 1:
+        /* move single copy into vector and add new copy below */
+        m_dev_data.push_back(m_single_dev_data);
+        /* fall-through */
+      default:
+        /* store in multi-copy vector */
+        m_dev_data.push_back(data);
+        break;
+      }
+      m_num_dev_data++;
+    }
+
+    void remove_device_data(parsec_data_t* data) {
+      if (m_num_dev_data == 1) {
+        m_single_dev_data = nullptr;
+        --m_num_dev_data;
+      } else if (m_num_dev_data > 1) {
+        auto it = std::find(m_dev_data.begin(), m_dev_data.end(), data);
+        if (it != m_dev_data.end()) {
+          m_dev_data.erase(it);
+          --m_num_dev_data;
+        }
+        /* fall back to the single-element member once only one copy remains,
+         * since begin()/end() stop looking at the vector in that case */
+        if (m_num_dev_data == 1) {
+          m_single_dev_data = m_dev_data.front();
+          m_dev_data.clear();
+        }
+      }
+    }
+
+    int num_dev_data() const {
+      return m_num_dev_data;
+    }
+
+    iterator begin() {
+      switch(m_num_dev_data) {
+      // no device copies
+      case 0: return end();
+      case 1: return &m_single_dev_data;
+      default: return m_dev_data.data();
+      }
+    }
+
+    iterator end() {
+      switch(m_num_dev_data) {
+      case 0:
+      case 1:
+        return &(m_single_dev_data) + 1;
+      default:
+        return m_dev_data.data() + m_dev_data.size();
+      }
+    }
+
+    using iovec_iterator = typename std::vector<ttg::iovec>::iterator;
+
+    iovec_iterator iovec_begin() {
+      return m_iovecs.begin();
+    }
+
+    iovec_iterator iovec_end() {
+      return m_iovecs.end();
+    }
+
+    void iovec_reset() {
+      m_iovecs.clear();
+    }
+
+    void iovec_add(const ttg::iovec& iov) {
+      m_iovecs.push_back(iov);
+    }
+
+    ttg::span<ttg::iovec> iovec_span() {
+      return ttg::span(m_iovecs.data(), m_iovecs.size());
+    }
+
+    std::size_t iovec_count() const {
+      return m_iovecs.size();
+    }
+
+#if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
+    int64_t size;
+    int64_t uid;
+#endif
+  protected:
+    void *m_ptr;
+    parsec_task_t *m_next_task = nullptr;
+    int32_t m_readers = 1;
+    std::atomic<int32_t> m_refs = 1;            //< number of entities referencing this copy (TTGs, external)
+
+    std::vector<ttg::iovec> m_iovecs;
+
+    std::vector<parsec_data_t*> m_dev_data;     //< used if there are multiple device copies
+                                                //  that belong to this object
+    parsec_data_t *m_single_dev_data = nullptr; //< used if there is a single device copy
+    int m_num_dev_data = 0;                     //< number of device copies
  };

@@ -92,14 +288,57 @@
   */
  template <typename ValueT>
  struct ttg_data_value_copy_t final : public ttg_data_copy_t {
-    using value_type = std::decay_t<ValueT>;
+    using value_type = ValueT;
    value_type m_value;

    template <typename T>
    ttg_data_value_copy_t(T&& value)
-    : ttg_data_copy_t(), m_value(std::forward<T>(value))
+    : ttg_data_copy_t()
+    , m_value(std::forward<T>(value))
+    {
+      this->m_ptr = const_cast<std::decay_t<value_type> *>(&m_value);
+      /* reset the container tracker */
+      ttg_data_copy_container() = nullptr;
+    }
+
+    ttg_data_value_copy_t(ttg_data_value_copy_t&& c)
+      noexcept(std::is_nothrow_move_constructible_v<value_type>)
+    : ttg_data_copy_t(std::move(c))
+    , m_value(std::move(c.m_value))
    {
-      this->device_private = const_cast<std::decay_t<value_type> *>(&m_value);
+      /* reset the container tracker */
+      ttg_data_copy_container() = nullptr;
+    }
+
+    ttg_data_value_copy_t(const ttg_data_value_copy_t& c)
+      noexcept(std::is_nothrow_copy_constructible_v<value_type>)
+    : ttg_data_copy_t(c)
+    , m_value(c.m_value)
+    {
+      /* reset the container tracker */
+      ttg_data_copy_container() = nullptr;
+    }
+
+    ttg_data_value_copy_t& operator=(ttg_data_value_copy_t&& c)
+      noexcept(std::is_nothrow_move_assignable_v<value_type>)
+    {
+      ttg_data_copy_t::operator=(std::move(c));
+      m_value = std::move(c.m_value);
+      /* reset the container tracker */
+      ttg_data_copy_container() = nullptr;
+      return *this;
+    }
+
+    ttg_data_value_copy_t& operator=(const ttg_data_value_copy_t& c)
+      noexcept(std::is_nothrow_copy_assignable_v<value_type>)
+    {
+      ttg_data_copy_t::operator=(c);
+      m_value = c.m_value;
+      /* reset the container tracker */
+      ttg_data_copy_container() = nullptr;
+      return *this;
+    }
+
+    value_type& operator*() {
+      return m_value;
    }

    /* will destruct the value */
@@ -110,4 +349,4 @@

} // namespace ttg_parsec

-#endif // TTG_DATA_COPY_H
+#endif // TTG_DATA_COPY_H
\ No newline at end of file
diff --git a/ttg/ttg/ptr.h b/ttg/ttg/ptr.h
new file mode 100644
index 0000000000..3cc699f606
--- /dev/null
+++ b/ttg/ttg/ptr.h
@@ -0,0 +1,118 @@
+#ifndef TTG_PTR_H
+#define TTG_PTR_H
+
+#include "ttg/impl_selector.h"
+
+namespace ttg {
+
+template <typename T>
+using Ptr = TTG_IMPL_NS::ptr<T>;
+
+template <typename T, typename... Args>
+Ptr<T> make_ptr(Args&&... args) {
+  return TTG_IMPL_NS::make_ptr(std::forward<Args>(args)...);
+}
+
+template <typename T>
+auto get_ptr(T&& obj) {
+  return TTG_IMPL_NS::get_ptr(std::forward<T>(obj));
+}
+
+namespace detail {
+  template <typename T>
+  struct is_ptr : std::false_type
+  { };
+
+  template <typename T>
+  struct is_ptr<Ptr<T>> : std::true_type
+  { };
+
+  template <typename T>
+  constexpr bool is_ptr_v = is_ptr<T>::value;
+
+} // namespace detail
+
+#if 0
+namespace detail {
+
+  /* awaiter for ttg::get_ptr with multiple arguments
+   * operator co_await will return the tuple of ttg::Ptr
+   */
+  template <typename... Ts>
+  struct get_ptr_tpl_t {
+  private:
+    std::tuple<ttg::Ptr<Ts>...> m_ptr_tuple;
+    bool m_is_ready = false;
+  public:
+    get_ptr_tpl_t(bool is_ready, std::tuple<ttg::Ptr<Ts>...>&& ptrs)
+    : m_ptr_tuple(std::forward<std::tuple<ttg::Ptr<Ts>...>>(ptrs))
+    , m_is_ready(is_ready)
+    { }
+
+    bool await_ready() const noexcept {
+      return m_is_ready;
+    }
+
+    constexpr void await_suspend( std::coroutine_handle<> ) const noexcept {
+      /* TODO: anything to be done here? */
+    }
+
+    auto await_resume() const noexcept {
+      return std::move(m_ptr_tuple);
+    }
+  };
+
+
+  /* awaiter for ttg::get_ptr for a single argument */
+  template <typename T>
+  struct get_ptr_t {
+  private:
+    ttg::Ptr<T> m_ptr;
+    bool m_is_ready = false;
+  public:
+    get_ptr_t(bool is_ready, ttg::Ptr<T>&& ptr)
+    : m_ptr(std::forward<ttg::Ptr<T>>(ptr))
+    , m_is_ready(is_ready)
+    { }
+
+    bool await_ready() const noexcept {
+      return m_is_ready;
+    }
+
+    constexpr void await_suspend( std::coroutine_handle<> ) const noexcept {
+      /* TODO: anything to be done here? */
+    }
+
+    auto await_resume() const noexcept {
+      return std::move(m_ptr);
+    }
+  };
+} // namespace detail
+
+  /**
+   * Get an awaiter that results in a ttg::Ptr to a task argument.
+   * Must only be called inside a task on a value that was passed
+   * to the task and has not yet been moved on.
+   * Should be used in conjunction with co_await, e.g.,
+   *    ttg::Ptr<T> ptr = co_await ttg::get_ptr(val);
+   *
+   * Multiple values can be passed, which results in a tuple of Ptr:
+   *    ttg::Ptr<T> ptr1, ptr2;
+   *    std::tie(ptr1, ptr2) = co_await ttg::get_ptr(val1, val2);
+   */
+  template <typename Arg, typename... Args>
+  auto get_ptr(Arg&& arg, Args&&...
args) { + bool is_ready; + using tpl_type = std::tuple, std::decay...>>; + using result_type = std::pair; + result_type p = TTG_IMPL_NS::get_ptr(std::forward(arg), std::forward(args)...); + if constexpr (sizeof...(Args) > 0) { + return detail::get_ptr_tpl_t, std::decay_t...>(p.first, std::move(p.second)); + } else if constexpr (sizeof...(Args) == 0) { + return detail::get_ptr_t>(p.first, std::move(std::get<0>(p.second))); + } + } +#endif // 0 +} // namespace ttg + +#endif // TTG_PTR_H \ No newline at end of file diff --git a/ttg/ttg/run.h b/ttg/ttg/run.h index 21ec337e87..b4dc593de1 100644 --- a/ttg/ttg/run.h +++ b/ttg/ttg/run.h @@ -57,6 +57,7 @@ namespace ttg { inline void finalize() { TTG_IMPL_NS::ttg_finalize(); } /// Aborts the TTG program using the default backend's `ttg_abort` method + [[noreturn]] inline void abort() { TTG_IMPL_NS::ttg_abort(); } /// Accesses the default backend's default execution context diff --git a/ttg/ttg/serialization/splitmd_data_descriptor.h b/ttg/ttg/serialization/splitmd_data_descriptor.h index 8edfe20d6b..46bdb7b764 100644 --- a/ttg/ttg/serialization/splitmd_data_descriptor.h +++ b/ttg/ttg/serialization/splitmd_data_descriptor.h @@ -3,20 +3,10 @@ #include #include "ttg/util/meta.h" +#include "ttg/util/iovec.h" namespace ttg { - /** - * Used to describe transfer payload in types using the \sa SplitMetadataDescriptor. - * @c data Pointer to the data to be read from / written to. - * @c num_bytes The number of bytes to read from / write to the memory location - * \sa data. - */ - struct iovec { - size_t num_bytes; - void* data; - }; - /** * SplitMetadataDescriptor is a serialization descriptor provided by the user * for a user-specified type. It should contain the following public member diff --git a/ttg/ttg/terminal.h b/ttg/ttg/terminal.h index e1e3e0c739..89b26741d4 100644 --- a/ttg/ttg/terminal.h +++ b/ttg/ttg/terminal.h @@ -155,6 +155,7 @@ namespace ttg { using broadcast_callback_type = meta::detail::broadcast_callback_t>; using setsize_callback_type = typename base_type::setsize_callback_type; using finalize_callback_type = typename base_type::finalize_callback_type; + using prepare_send_callback_type = meta::detail::prepare_send_callback_t>; static constexpr bool is_an_input_terminal = true; ttg::detail::ContainerWrapper container; @@ -162,6 +163,7 @@ namespace ttg { send_callback_type send_callback; move_callback_type move_callback; broadcast_callback_type broadcast_callback; + prepare_send_callback_type prepare_send_callback; // No moving, copying, assigning permitted In(In &&other) = delete; @@ -189,10 +191,12 @@ namespace ttg { void set_callback(const send_callback_type &send_callback, const move_callback_type &move_callback, const broadcast_callback_type &bcast_callback = broadcast_callback_type{}, const setsize_callback_type &setsize_callback = setsize_callback_type{}, - const finalize_callback_type &finalize_callback = finalize_callback_type{}) { + const finalize_callback_type &finalize_callback = finalize_callback_type{}, + const prepare_send_callback_type &prepare_send_callback = prepare_send_callback_type{}) { this->send_callback = send_callback; this->move_callback = move_callback; this->broadcast_callback = bcast_callback; + this->prepare_send_callback = prepare_send_callback; base_type::set_callback(setsize_callback, finalize_callback); } @@ -271,7 +275,7 @@ namespace ttg { for (auto &&key : keylist) send(key, v); } else { /* got something we cannot iterate over (single element?) 
so put one element in the span */ - broadcast_callback(ttg::span(&keylist, 1), v); + send(ttg::span(&keylist, 1), v); } } } @@ -291,7 +295,36 @@ namespace ttg { for (auto &&key : keylist) sendk(key); } else { /* got something we cannot iterate over (single element?) so put one element in the span */ - broadcast_callback(ttg::span(&keylist, 1)); + sendk(ttg::span(&keylist, 1)); + } + } + } + + + template + void prepare_send(const rangeT &keylist, Value &&value) { + const Value &v = value; + if (prepare_send_callback) { + if constexpr (ttg::meta::is_iterable_v) { + prepare_send_callback(ttg::span(&(*std::begin(keylist)), + std::distance(std::begin(keylist), std::end(keylist))), + v); + } else { + /* got something we cannot iterate over (single element?) so put one element in the span */ + prepare_send_callback(ttg::span(&keylist, 1), v); + } + } + } + + template + void prepare_send(Value &&value) { + const Value &v = value; + if (prepare_send_callback) { + if constexpr (ttg::meta::is_iterable_v) { + prepare_send_callback(v); + } else { + /* got something we cannot iterate over (single element?) so put one element in the span */ + prepare_send_callback(v); } } } @@ -531,6 +564,32 @@ namespace ttg { } } } + + template + std::enable_if_t && !meta::is_void_v, void> + prepare_send(const rangeT &keylist, const Value &value) { + for (auto &&successor : this->successors()) { + assert(successor->get_type() != TerminalBase::Type::Write); + if (successor->get_type() == TerminalBase::Type::Read) { + return static_cast> *>(successor)->prepare_send(keylist, value); + } else if (successor->get_type() == TerminalBase::Type::Consume) { + return static_cast *>(successor)->prepare_send(keylist, value); + } + } + } + + template + std::enable_if_t && !meta::is_void_v, void> + prepare_send(const Value &value) { + for (auto &&successor : this->successors()) { + assert(successor->get_type() != TerminalBase::Type::Write); + if (successor->get_type() == TerminalBase::Type::Read) { + return static_cast> *>(successor)->prepare_send(value); + } else if (successor->get_type() == TerminalBase::Type::Consume) { + return static_cast *>(successor)->prepare_send(value); + } + } + } }; namespace meta { diff --git a/ttg/ttg/tt.h b/ttg/ttg/tt.h index 66048906a0..a6f3ef9d4d 100644 --- a/ttg/ttg/tt.h +++ b/ttg/ttg/tt.h @@ -1,16 +1,24 @@ #ifndef TTG_TT_H #define TTG_TT_H -#include #include +#include #include "ttg/fwd.h" #include "ttg/base/tt.h" #include "ttg/edge.h" +#ifdef TTG_HAS_COROUTINE +#include "ttg/util/coroutine.h" +#endif + namespace ttg { + // TODO describe TT concept (preferably as a C++20 concept) + // N.B. TT::op returns void or ttg::coroutine_handle<> + // see TTG_PROCESS_TT_OP_RETURN below + /// @brief a template task graph implementation /// It contains (owns) one or more TT objects. 
Since it can also be viewed as a TT object itself, @@ -35,12 +43,16 @@ namespace ttg { TTG(const TTG &) = delete; TTG &operator=(const TTG &) = delete; // movable - TTG(TTG && other) : TTBase(static_cast(other)), tts(other.tts), ins(std::move(other.ins)), outs(std::move(other.outs)) { - is_ttg_ = true; - own_my_tts(); + TTG(TTG &&other) + : TTBase(static_cast(other)) + , tts(other.tts) + , ins(std::move(other.ins)) + , outs(std::move(other.outs)) { + is_ttg_ = true; + own_my_tts(); } - TTG& operator=(TTG &&other) { - static_cast(*this) = static_cast(other); + TTG &operator=(TTG &&other) { + static_cast(*this) = static_cast(other); is_ttg_ = true; tts = std::move(other.tts); ins = std::move(other.ins); @@ -91,17 +103,14 @@ namespace ttg { void own_my_tts() const { for (auto &op : tts) op->owning_ttg = this; } - }; template - auto make_ttg(ttseqT &&tts, const input_terminalsT &ins, - const output_terminalsT &outs, + auto make_ttg(ttseqT &&tts, const input_terminalsT &ins, const output_terminalsT &outs, const std::string &name = "ttg") { return std::make_unique>(std::forward(tts), ins, outs, name); } - /// A data sink for one input template class SinkTT : public TTBase { @@ -126,20 +135,20 @@ namespace ttg { using valueT = std::decay_t; auto move_callback = [](const keyT &key, valueT &&value) {}; auto send_callback = [](const keyT &key, const valueT &value) {}; - auto broadcast_callback = [](const ttg::span& key, const valueT &value) {}; + auto broadcast_callback = [](const ttg::span &key, const valueT &value) {}; auto setsize_callback = [](const keyT &key, std::size_t size) {}; auto finalize_callback = [](const keyT &key) {}; input.set_callback(send_callback, move_callback, broadcast_callback, setsize_callback, finalize_callback); } - public: - SinkTT(const std::string& inname="junk") : TTBase("sink", numins, numouts) { + public: + SinkTT(const std::string &inname = "junk") : TTBase("sink", numins, numouts) { register_input_terminals(input_terminals, std::vector{inname}); register_input_callback(std::get<0>(input_terminals)); } - SinkTT(const input_edges_type &inedges, const std::string& inname="junk") : TTBase("sink", numins, numouts) { + SinkTT(const input_edges_type &inedges, const std::string &inname = "junk") : TTBase("sink", numins, numouts) { register_input_terminals(input_terminals, std::vector{inname}); register_input_callback(std::get<0>(input_terminals)); std::get<0>(inedges).set_out(&std::get<0>(input_terminals)); @@ -154,12 +163,28 @@ namespace ttg { /// Returns pointer to input terminal i to facilitate connection --- terminal cannot be copied, moved or assigned template std::tuple_element_t *in() { - static_assert(i==0); + static_assert(i == 0); return &std::get(input_terminals); } }; +} // namespace ttg + +#ifndef TTG_PROCESS_TT_OP_RETURN +#ifdef TTG_HAS_COROUTINE +#define TTG_PROCESS_TT_OP_RETURN(result, invoke) \ + { \ + using return_type = decltype(invoke); \ + if constexpr (std::is_same_v) { \ + invoke; \ + } else { \ + auto coro_return = invoke; \ + result = coro_return.address(); \ + } \ + } +#else +#define TTG_PROCESS_TT_OP_RETURN(result, invoke) invoke +#endif +#endif // !defined(TTG_PROCESS_TT_OP_RETURN) -} // namespace ttg - -#endif // TTG_TT_H +#endif // TTG_TT_H diff --git a/ttg/ttg/util/coroutine.h b/ttg/ttg/util/coroutine.h new file mode 100644 index 0000000000..3577032f42 --- /dev/null +++ b/ttg/ttg/util/coroutine.h @@ -0,0 +1,194 @@ +// +// Created by Eduard Valeyev on 10/31/22. 
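+//
+// A minimal usage sketch (hypothetical task body; start_async_work is a
+// made-up name for whatever eventually calls done.finish()):
+//
+//   ttg::resumable_task my_op(/* task args */) {
+//     ttg::event done;
+//     start_async_work(&done);                    // arranges for done.finish() to be called later
+//     co_await ttg::resumable_task_events{done};  // suspend until the event has finished
+//     /* the runtime resumes the coroutine here once ready() is true */
+//   }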
+// + +#ifndef TTG_UTIL_COROUTINE_H +#define TTG_UTIL_COROUTINE_H + +#include "ttg/config.h" +#include TTG_CXX_COROUTINE_HEADER + +#include +#include + +namespace ttg { + + struct resumable_task_state; + + // import coroutine_handle, with default promise redefined to resumable_task_state + template + using coroutine_handle = TTG_CXX_COROUTINE_NAMESPACE::coroutine_handle; + + template + struct resumable_task_events; + + /// represents a one-time event + struct event { + void finish() { finished_ = true; } + + /// @return true if the event has occurred + bool finished() const { return finished_; } + + private: + std::atomic finished_ = false; + }; + + /// task that can be resumed after some events occur + struct resumable_task : public ttg::coroutine_handle<> { + using base_type = ttg::coroutine_handle<>; + + /// these are members mandated by the promise_type concept + ///@{ + + using promise_type = struct resumable_task_state; + + ///@} + + resumable_task(base_type base) : base_type(std::move(base)) {} + + base_type handle() { return *this; } + + /// @return true if ready to resume + inline bool ready() const; + + /// @return true if task completed and can be destroyed + inline bool completed() const; + + /// @return ttg::span of events that this task depends on + inline ttg::span events(); + }; + + /// encapsulates the state of the coroutine object visible to the outside world + /// @note this is the `promise_type` for resumable_task coroutine + struct resumable_task_state { + resumable_task_state() noexcept = default; + // these only live on coroutine frames so make noncopyable and nonmovable + resumable_task_state(const resumable_task_state&) = delete; + resumable_task_state& operator=(const resumable_task_state&) = delete; + resumable_task_state(resumable_task_state&&) = delete; + resumable_task_state& operator=(resumable_task_state&&) = delete; + + constexpr static inline std::size_t MaxNumEvents = 20; + using handle_type = coroutine_handle<>; + + /// these are members mandated by the promise_type concept + ///@{ + + resumable_task get_return_object() { return resumable_task{handle_type::from_promise(*this)}; } + + /// @note start task eagerly + TTG_CXX_COROUTINE_NAMESPACE::suspend_never initial_suspend() noexcept { return {}; } + + /// @note suspend task before destroying it so the runtime can know that the task is completed + TTG_CXX_COROUTINE_NAMESPACE::suspend_always final_suspend() noexcept { + completed_ = true; + return {}; + } + void return_void() {} + void unhandled_exception() {} + + ///@} + + /// these are optional members of the promise_type concept + ///@{ + + // these can be used to use optional storage provided by the runtime (e.g. part of the runtime's task data struct) + // N.B. the existing buffer must be passed to operator new via TLS + // void* operator new(std::size_t size) + // { + // return ::operator new(size); + // } + + // N.B. 
+    // void operator delete(void* ptr, std::size_t size)
+    // {
+    //   ::operator delete(ptr, size);
+    // }
+
+    ///@}
+
+    /// @return true if ready to resume
+    constexpr bool ready() const {
+      for (std::size_t e = 0; e != nevents_; ++e)
+        if (!events_storage_[e]->finished()) return false;
+      return true;
+    }
+
+    /// @return true if the task is completed
+    constexpr bool completed() const { return completed_; }
+
+    ttg::span<event *> events() { return ttg::span(events_storage_.data(), nevents_); }
+
+   private:
+    std::array<event *, MaxNumEvents> events_storage_;
+    std::size_t nevents_;
+    bool completed_ = false;
+
+    template <std::size_t N>
+    friend struct resumable_task_events;
+
+    void reset_events() {
+      std::fill(events_storage_.begin(), events_storage_.begin() + nevents_, nullptr);
+      nevents_ = 0;
+    }
+
+    template <std::size_t N>
+    void set_events(const std::array<event *, N> events) {
+      static_assert(N <= MaxNumEvents);
+      std::copy(events.begin(), events.end(), events_storage_.begin());
+      nevents_ = N;
+    }
+  };
+
+  bool resumable_task::ready() const { return base_type::promise().ready(); }
+  bool resumable_task::completed() const { return base_type::promise().completed(); }
+  ttg::span<event *> resumable_task::events() { return base_type::promise().events(); }
+
+  /// statically-sized sequence of events on whose completion the progress of a given task depends
+  /// @note this is the `Awaiter` for resumable_task coroutine
+  /// (the concept is not defined in the standard, see
+  /// https://lewissbaker.github.io/2017/11/17/understanding-operator-co-await instead )
+  template <std::size_t N>
+  struct resumable_task_events {
+   private:
+    template <std::size_t... I>
+    constexpr bool await_ready(std::index_sequence<I...>) const {
+      return (std::get<I>(events_)->finished() && ...);
+    }
+
+   public:
+    template <typename... Events>
+    constexpr resumable_task_events(Events&&... events) : events_{(&events)...} {}
+
+    /// these are members mandated by the Awaiter concept
+    ///@{
+
+    constexpr bool await_ready() const { return await_ready(std::make_index_sequence<N>{}); }
+
+    void await_suspend(coroutine_handle<> pending_task) {
+      pending_task_ = pending_task;
+      pending_task_.promise().set_events(events_);
+    }
+
+    void await_resume() {
+      if (pending_task_) {
+        pending_task_.promise().reset_events();
+        pending_task_ = {};
+      }
+    }
+
+    ///@}
+
+   private:
+    std::array<event *, N> events_;
+    coroutine_handle<> pending_task_;
+  };  // resumable_task_events
+
+  // deduce the number of events properly
+  template <typename... Events>
+  resumable_task_events(Events&&...) -> resumable_task_events<sizeof...(Events)>;
+
+  static_assert(resumable_task_events<0>{}.await_ready() == true);
+}  // namespace ttg
+
+#endif  // TTG_UTIL_COROUTINE_H
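
To show how the pieces of coroutine.h are meant to compose, here is a hedged usage sketch (not part of the patch): a task body written as a coroutine returning ttg::resumable_task co_awaits a resumable_task_events pack built from ttg::event objects, suspends until every event has finished, and is then resumed by a driver. The globals and the hand-rolled driver in main are illustrative assumptions; in TTG the runtime backend owns this resume/destroy logic:

#include "ttg/util/coroutine.h"

#include <cstdio>

ttg::event a_ready, b_ready;  // e.g. "input buffer arrived" notifications

ttg::resumable_task consume() {
  // suspends unless both events have already finished (see await_ready above)
  co_await ttg::resumable_task_events{a_ready, b_ready};
  std::puts("both inputs ready; task continues");
}

int main() {
  auto task = consume();  // starts eagerly: initial_suspend is suspend_never
  a_ready.finish();
  b_ready.finish();
  if (!task.completed() && task.ready()) task.resume();  // all events finished -> resume
  if (task.completed()) task.destroy();  // frame survives final_suspend until destroyed
}
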
diff --git a/ttg/ttg/util/iovec.h b/ttg/ttg/util/iovec.h
new file mode 100644
index 0000000000..2a8d672b32
--- /dev/null
+++ b/ttg/ttg/util/iovec.h
@@ -0,0 +1,21 @@
+#ifndef TTG_UTIL_IOVEC_H_
+#define TTG_UTIL_IOVEC_H_
+
+#include <cstddef>
+
+namespace ttg {
+
+  /**
+   * Used to describe transfer payload in types using the \sa SplitMetadataDescriptor.
+   * @member data Pointer to the data to be read from / written to.
+   * @member num_bytes The number of bytes to read from / write to the memory location
+   * \sa data.
+   */
+  struct iovec {
+    std::size_t num_bytes;
+    void* data;
+  };
+
+}  // ttg
+
+#endif  // TTG_UTIL_IOVEC_H_
diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h
index c197761182..389a9c8d5a 100644
--- a/ttg/ttg/util/meta.h
+++ b/ttg/ttg/util/meta.h
@@ -549,6 +549,15 @@ namespace ttg {
   template <typename T>
   constexpr bool is_tuple_v = is_tuple<T>::value;

+  template <typename T>
+  struct is_span : std::false_type {};
+
+  template <typename T, std::size_t N>
+  struct is_span<ttg::span<T, N>> : std::true_type {};
+
+  template <typename T>
+  constexpr bool is_span_v = is_span<T>::value;
+
   template
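
The new is_span trait follows the same false_type/true_type specialization pattern as is_tuple above it, so generic code can branch on whether a parameter is a span of keys (e.g. for the broadcast callbacks in tt.h) or a single key. A minimal sketch of that intended use follows; it assumes is_span_v lives in the same namespace as is_tuple_v (ttg::meta in the current tree), and maybe_broadcast and send_one are made-up names:

#include "ttg/util/meta.h"

#include <cstdio>

template <typename Key>
void send_one(const Key&) { std::puts("sending to one key"); }

// branch on the new trait: a span of keys means "broadcast", anything else is a single key
template <typename T>
void maybe_broadcast(const T& key_or_keys) {
  if constexpr (ttg::meta::is_span_v<T>) {
    for (auto&& key : key_or_keys) send_one(key);
  } else {
    send_one(key_or_keys);
  }
}

int main() {
  int key = 0;
  int keys[] = {1, 2, 3};
  maybe_broadcast(key);                            // single key
  maybe_broadcast(ttg::span<const int>(keys, 3));  // span of keys
}
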