diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index b408fac581..61115082ae 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -4,7 +4,7 @@ set(TTG_TRACKED_VG_CMAKE_KIT_TAG cda539db32be6e8171f5cbebdb1a7c38d5ab4b34) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058) -set(TTG_TRACKED_PARSEC_TAG 996dda4c0ff3120bc65385f86e999befd4b3fe7a) +set(TTG_TRACKED_PARSEC_TAG parsec-for-ttg) set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f) set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815) diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 62be4270ce..2fdeb5a839 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -72,18 +72,32 @@ namespace detail { PtrT m_ptr; // keep a reference if PtrT is a shared_ptr std::size_t m_size; - void allocate(std::size_t size) { + void do_allocate() { if constexpr (std::is_pointer_v) { - m_ptr = allocator_traits::allocate(m_allocator, size); + m_ptr = allocator_traits::allocate(m_allocator, m_size); } this->device_private = m_ptr; - m_size = size; } - void deallocate() { - allocator_traits::deallocate(m_allocator, static_cast(this->device_private), this->m_size); - this->device_private = nullptr; - this->m_size = 0; + void do_deallocate() { + if constexpr (std::is_pointer_v) { + if (this->device_private != nullptr) { + auto ptr = m_ptr; + this->device_private = nullptr; + this->m_ptr = nullptr; + allocator_traits::deallocate(m_allocator, ptr, this->m_size); + } + } + } + + static void allocate(parsec_data_copy_t *parsec_copy, int device) { + data_copy_type* copy = static_cast(parsec_copy); + copy->do_allocate(); + } + + static void deallocate(parsec_data_copy_t *parsec_copy, int device) { + data_copy_type* copy = static_cast(parsec_copy); + copy->do_deallocate(); } public: @@ -100,20 +114,37 @@ namespace detail { constexpr const bool is_empty_allocator = std::is_same_v>; assert(is_empty_allocator); m_ptr = std::move(ptr); + this->m_size = size; + this->dtt = parsec_datatype_int8_t; this->device_private = const_cast(to_address(m_ptr)); } void construct(std::size_t size, + ttg::scope scope, const allocator_type& alloc = allocator_type()) { constexpr const bool is_empty_allocator = std::is_same_v>; assert(!is_empty_allocator); m_allocator = alloc; - allocate(size); - this->device_private = m_ptr; + this->m_size = size; + this->dtt = parsec_datatype_int8_t; + if (scope == ttg::scope::Allocate) { + /* if the user only requests an allocation on the device + * we don't allocate host memory but provide PaRSEC with + * a way to request host memory from us. */ + this->alloc_cb = &allocate; + this->release_cb = &deallocate; + } else { + /* the user requested that the data be sync'ed into the device + * so we need to provide host memory for the user to fill prior */ + do_allocate(); + this->device_private = m_ptr; + } } ~data_copy_type() { - this->deallocate(); + this->alloc_cb = nullptr; + this->release_cb = nullptr; + this->do_deallocate(); } }; @@ -143,7 +174,7 @@ namespace detail { /* create the host copy and allocate host memory */ data_copy_type *copy = PARSEC_OBJ_NEW(data_copy_type); - copy->construct(size, allocator); + copy->construct(size, scope, allocator); parsec_data_copy_attach(data, copy, 0); /* adjust data flags */ diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h index 38682ca9df..82cc4ac866 100644 --- a/ttg/ttg/parsec/devicefunc.h +++ b/ttg/ttg/parsec/devicefunc.h @@ -201,11 +201,16 @@ namespace ttg_parsec { /* enqueue the transfer into the compute stream to come back once the compute and transfer are complete */ if (data->owner_device != 0) { parsec_device_gpu_module_t *device_module = detail::parsec_ttg_caller->dev_ptr->device; + if (nullptr == data->device_copies[0]->device_private) { + assert(nullptr != data->device_copies[0]->alloc_cb); + data->device_copies[0]->alloc_cb(data->device_copies[0], 0); + } + int ret = device_module->memcpy_async(device_module, stream, data->device_copies[0]->device_private, data->device_copies[data->owner_device]->device_private, data->nb_elts, parsec_device_gpu_transfer_direction_d2h); - assert(ret == PARSEC_SUCCESS); + if (ret != PARSEC_SUCCESS) throw std::runtime_error("Failed to copy data from device to host!"); } if constexpr (sizeof...(Is) > 0) { // recursion