diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index dd7347c254..5d61e82fa7 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -4,7 +4,7 @@ set(TTG_TRACKED_VG_CMAKE_KIT_TAG 878654d0cb1904049fbd2c37b37d5385ae897658) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058) -set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c) +set(TTG_TRACKED_PARSEC_TAG c97e2fc54698d3d937d7847a12c7e9084b22a6c8) set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f) set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815) diff --git a/cmake/modules/FindOrFetchPARSEC.cmake b/cmake/modules/FindOrFetchPARSEC.cmake index b3fd5faa3a..dac240402c 100644 --- a/cmake/modules/FindOrFetchPARSEC.cmake +++ b/cmake/modules/FindOrFetchPARSEC.cmake @@ -17,7 +17,7 @@ if (NOT TARGET PaRSEC::parsec) FetchContent_Declare( PARSEC - GIT_REPOSITORY https://github.com/ICLDisco/parsec.git + GIT_REPOSITORY https://github.com/bosilca/parsec.git GIT_TAG ${TTG_TRACKED_PARSEC_TAG} ) FetchContent_MakeAvailable(PARSEC) diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h index 38682ca9df..f1ef7c1e6f 100644 --- a/ttg/ttg/parsec/devicefunc.h +++ b/ttg/ttg/parsec/devicefunc.h @@ -16,7 +16,6 @@ namespace ttg_parsec { parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller; assert(nullptr != caller->dev_ptr); parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task; - parsec_flow_t *flows = caller->dev_ptr->flows; auto& view = std::get(views); bool is_current = false; @@ -38,15 +37,15 @@ namespace ttg_parsec { } /* build the flow */ - /* TODO: reuse the flows of the task class? How can we control the sync direction then? */ - flows[I] = parsec_flow_t{.name = nullptr, + *((parsec_flow_t*)gpu_task->flow_info[I].flow) = + parsec_flow_t{.name = nullptr, .sym_type = PARSEC_SYM_INOUT, .flow_flags = static_cast(access), .flow_index = I, .flow_datatype_mask = ~0 }; - gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes - gpu_task->flow[I] = &flows[I]; + gpu_task->flow_info[I].flow_span = data->span; // size in bytes + gpu_task->flow_info[I].flow_dc = nullptr; /* set the input data copy, parsec will take care of the transfer * and the buffer will look at the parsec_data_t for the current pointer */ @@ -57,13 +56,13 @@ namespace ttg_parsec { } else { /* ignore the flow */ - flows[I] = parsec_flow_t{.name = nullptr, + *((parsec_flow_t*)gpu_task->flow_info[I].flow) = + parsec_flow_t{.name = nullptr, .sym_type = PARSEC_FLOW_ACCESS_NONE, .flow_flags = 0, .flow_index = I, .flow_datatype_mask = ~0 }; - gpu_task->flow[I] = &flows[I]; - gpu_task->flow_nb_elts[I] = 0; // size in bytes + gpu_task->flow_info[I].flow_span = 0; // size in bytes caller->parsec_task.data[I].data_in = nullptr; } @@ -80,6 +79,7 @@ namespace ttg_parsec { template bool register_device_memory(std::tuple &views) { bool is_current = true; + constexpr const std::size_t num_views = sizeof...(Views); if (nullptr == detail::parsec_ttg_caller) { throw std::runtime_error("register_device_memory may only be invoked from inside a task!"); } @@ -88,19 +88,13 @@ namespace ttg_parsec { throw std::runtime_error("register_device_memory called inside a non-gpu task!"); } + auto task = detail::parsec_ttg_caller; + task->dev_ptr->gpu_task->allocate_flows(num_views); + if constexpr (sizeof...(Views) > 0) { is_current = detail::register_device_memory(views, std::index_sequence_for{}); } - /* reset all entries in the current task */ - for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) { - detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr; - detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE; - detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i; - detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i]; - detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0; - } - return is_current; } @@ -120,8 +114,8 @@ namespace ttg_parsec { uint8_t i; // only limited number of flows detail::parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller; assert(nullptr != caller->dev_ptr); + caller->dev_ptr->gpu_task->allocate_flows(span.size()); parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task; - parsec_flow_t *flows = caller->dev_ptr->flows; bool is_current = false; for (i = 0; i < span.size(); ++i) { @@ -146,14 +140,15 @@ namespace ttg_parsec { /* build the flow */ /* TODO: reuse the flows of the task class? How can we control the sync direction then? */ - flows[i] = parsec_flow_t{.name = nullptr, + *((parsec_flow_t*)gpu_task->flow_info[i].flow) = + parsec_flow_t{.name = nullptr, .sym_type = PARSEC_SYM_INOUT, .flow_flags = static_cast(access), .flow_index = i, .flow_datatype_mask = ~0 }; - gpu_task->flow_nb_elts[i] = data->nb_elts; // size in bytes - gpu_task->flow[i] = &flows[i]; + gpu_task->flow_info[i].flow_span = data->span; // size in bytes + gpu_task->flow_info[i].flow_dc = nullptr; /* set the input data copy, parsec will take care of the transfer * and the buffer will look at the parsec_data_t for the current pointer */ @@ -164,25 +159,17 @@ namespace ttg_parsec { } else { /* ignore the flow */ - flows[i] = parsec_flow_t{.name = nullptr, + *((parsec_flow_t*)gpu_task->flow_info[i].flow) = + parsec_flow_t{.name = nullptr, .sym_type = PARSEC_FLOW_ACCESS_NONE, .flow_flags = 0, .flow_index = i, .flow_datatype_mask = ~0 }; - gpu_task->flow[i] = &flows[i]; - gpu_task->flow_nb_elts[i] = 0; // size in bytes + gpu_task->flow_info[i].flow_span = 0; // size in bytes caller->parsec_task.data[i].data_in = nullptr; } } - /* reset all remaining entries in the current task */ - for (; i < MAX_PARAM_COUNT; ++i) { - detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr; - detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE; - detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i; - detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i]; - detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0; - } // we cannot allow the calling thread to submit kernels so say we're not ready return is_current; } @@ -204,7 +191,7 @@ namespace ttg_parsec { int ret = device_module->memcpy_async(device_module, stream, data->device_copies[0]->device_private, data->device_copies[data->owner_device]->device_private, - data->nb_elts, parsec_device_gpu_transfer_direction_d2h); + data->span, parsec_device_gpu_transfer_direction_d2h); assert(ret == PARSEC_SUCCESS); } if constexpr (sizeof...(Is) > 0) { diff --git a/ttg/ttg/parsec/devicescratch.h b/ttg/ttg/parsec/devicescratch.h index e2c3743aa3..60a12687c9 100644 --- a/ttg/ttg/parsec/devicescratch.h +++ b/ttg/ttg/parsec/devicescratch.h @@ -50,19 +50,6 @@ struct devicescratch { return data; } - void remove_from_flow() { - /* remove the scratch from the gpu-task flow */ - assert(nullptr != detail::parsec_ttg_caller); - parsec_task_t *parsec_task = &detail::parsec_ttg_caller->parsec_task; - parsec_flow_t *flows = detail::parsec_ttg_caller->dev_ptr->flows; - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { - if (nullptr != parsec_task->data[i].data_in && parsec_task->data[i].data_in->original == m_data) { - flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE; // disable this flow - break; - } - } - } - friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::devicescratch&); public: @@ -93,8 +80,6 @@ struct devicescratch { devicescratch& operator=(const devicescratch& db) = delete; ~devicescratch() { - /* remove data from flow */ - //remove_from_flow(); if (nullptr != m_data) { //parsec_data_destroy(m_data); //parsec_data_copy_detach(m_data, parsec_data_get_copy(m_data, 0), 0); @@ -128,7 +113,7 @@ struct devicescratch { } std::size_t size() const { - return (m_data->nb_elts / sizeof(element_type)); + return (m_data->span / sizeof(element_type)); } }; diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index f29ca8ecb5..2080aaf295 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -10,21 +10,41 @@ namespace ttg_parsec { namespace detail { + struct parsec_ttg_gpu_task_t : public parsec_gpu_task_t { + std::byte *memory = nullptr; + void allocate_flows(std::size_t size) { + if (this->memory != nullptr) free_flows(); + constexpr const auto align = std::align_val_t(std::max(alignof(parsec_flow_t), alignof(parsec_gpu_flow_info_t))); + this->memory = new(align) std::byte[size * (sizeof(parsec_flow_t) + sizeof(parsec_gpu_flow_info_s))]; + parsec_flow_t *flows = (parsec_flow_t*)this->memory; + this->flow_info = (parsec_gpu_flow_info_t*)(this->memory + size * sizeof(parsec_flow_t)); + for (std::size_t i = 0; i < size; ++i) { + this->flow_info[i].flow = &flows[i]; + flows[i].flow_index = i; + flows[i].flow_flags = 0; + flows[i].flow_datatype_mask = ~0; + } + this->nb_flows = size; + } + + void free_flows() { + if (this->memory != nullptr) { + delete[] this->memory; + this->memory = nullptr; + } + } + }; + struct device_ptr_t { - parsec_gpu_task_t* gpu_task = nullptr; - parsec_flow_t* flows = nullptr; + parsec_ttg_gpu_task_t *gpu_task = nullptr; parsec_gpu_exec_stream_t* stream = nullptr; parsec_device_gpu_module_t* device = nullptr; - parsec_task_class_t task_class; // copy of the taskclass }; - template + template struct device_state_t { static constexpr bool support_device = false; - static constexpr size_t num_flows = 0; - device_state_t() - { } static constexpr device_ptr_t* dev_ptr() { return nullptr; } @@ -32,10 +52,10 @@ namespace ttg_parsec { template<> struct device_state_t { - static constexpr bool support_device = false; - static constexpr size_t num_flows = MAX_PARAM_COUNT; - parsec_flow_t m_flows[num_flows]; - device_ptr_t m_dev_ptr = {nullptr, &m_flows[0], nullptr, nullptr}; // gpu_task will be allocated in each task + static constexpr bool support_device = true; + parsec_ttg_gpu_task_t device_task; + device_ptr_t m_dev_ptr = {&device_task, nullptr, nullptr}; + device_ptr_t* dev_ptr() { return &m_dev_ptr; } diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 2814cf3247..597d298007 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1466,10 +1466,10 @@ namespace ttg_parsec { ttg::device::detail::reset_current(); auto discard_tmp_flows = [&](){ - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { - if (gpu_task->flow[i]->flow_flags & TTG_PARSEC_FLOW_ACCESS_TMP) { + for (int i = 0; i < gpu_task->nb_flows; ++i) { + if (gpu_task->flow_info[i].flow->flow_flags & TTG_PARSEC_FLOW_ACCESS_TMP) { /* temporary flow, discard by setting it to read-only to avoid evictions */ - const_cast(gpu_task->flow[i])->flow_flags = PARSEC_FLOW_ACCESS_READ; + const_cast(gpu_task->flow_info[i].flow)->flow_flags = PARSEC_FLOW_ACCESS_READ; task->parsec_task.data[i].data_out->readers = 1; } } @@ -1503,87 +1503,59 @@ namespace ttg_parsec { return rc; } + /* callback to set in the device task structure */ + static void release_device_task(parsec_gpu_task_t *gpu_task) { + detail::parsec_ttg_gpu_task_t *task = static_cast(gpu_task); + task->free_flows(); + } + template static parsec_hook_return_t device_static_evaluate(parsec_task_t* parsec_task) { task_t *task = (task_t*)parsec_task; - if (task->dev_ptr->gpu_task == nullptr) { - - /* set up a device task */ - parsec_gpu_task_t *gpu_task; - /* PaRSEC wants to free the gpu_task, because F***K ownerships */ - gpu_task = static_cast(std::calloc(1, sizeof(*gpu_task))); - PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); - gpu_task->ec = parsec_task; - gpu_task->task_type = 0; // user task - gpu_task->last_data_check_epoch = std::numeric_limits::max(); // used internally - gpu_task->pushout = 0; - gpu_task->submit = &TT::device_static_submit; - - // one way to force the task device - // currently this will probably break all of PaRSEC if this hint - // does not match where the data is located, not really useful for us - // instead we set a hint on the data if there is no hint set yet - //parsec_task->selected_device = ...; - - /* set the gpu_task so it's available in register_device_memory */ - task->dev_ptr->gpu_task = gpu_task; - /* TODO: is this the right place to set the mask? */ - task->parsec_task.chore_mask = PARSEC_DEV_ALL; + /* set up the device task */ + parsec_gpu_task_t *gpu_task = task->dev_ptr->gpu_task; + /* construct the GPU task */ + PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_gpu_task_t); + gpu_task->ec = parsec_task; + gpu_task->submit = &TT::device_static_submit; + gpu_task->release_device_task = &release_device_task; - /* copy over the task class, because that's what we need */ - task->dev_ptr->task_class = *task->parsec_task.task_class; + /* TODO: is this the right place to set the mask? */ + task->parsec_task.chore_mask = PARSEC_DEV_ALL; - // first invocation of the coroutine to get the coroutine handle - static_op(parsec_task); - /* when we come back here, the flows in gpu_task are set (see register_device_memory) */ + // first invocation of the coroutine to get the coroutine handle + static_op(parsec_task); - parsec_task_class_t& tc = task->dev_ptr->task_class; + /* when we come back here, the flows in gpu_task are set (see register_device_memory) */ - // input flows are set up during register_device_memory as part of the first invocation above - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { - tc.in[i] = gpu_task->flow[i]; - tc.out[i] = gpu_task->flow[i]; + /* set the device hint on the data */ + TT *tt = task->tt; + if (tt->devicemap) { + int parsec_dev; + if constexpr (std::is_void_v) { + parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap()); + } else { + parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key)); } - tc.nb_flows = MAX_PARAM_COUNT; - - /* set the device hint on the data */ - TT *tt = task->tt; - if (tt->devicemap) { - int parsec_dev; - if constexpr (std::is_void_v) { - parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap()); - } else { - parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key)); - } - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { - /* only set on mutable data since we have exclusive access */ - if (tc.in[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) { - parsec_data_t *data = parsec_task->data[i].data_in->original; - /* only set the preferred device if the host has the latest copy - * as otherwise we may end up with the wrong data if there is a newer - * version on a different device. Also, keep fingers crossed. */ - if (data->owner_device == 0) { - parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); - } + for (int i = 0; i < gpu_task->nb_flows; ++i) { + /* only set on mutable data since we have exclusive access */ + if (gpu_task->flow_info[i].flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) { + parsec_data_t *data = parsec_task->data[i].data_in->original; + /* only set the preferred device if the host has the latest copy + * as otherwise we may end up with the wrong data if there is a newer + * version on a different device. Also, keep fingers crossed. */ + if (data->owner_device == 0) { + parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); } } } - - /* set the new task class that contains the flows */ - task->parsec_task.task_class = &task->dev_ptr->task_class; - - /* select this one */ - return PARSEC_HOOK_RETURN_DONE; } - std::cerr << "EVALUATE called on task with assigned GPU task!" << std::endl; - - /* not sure if this might happen*/ - return PARSEC_HOOK_RETURN_ERROR; - + /* select this one */ + return PARSEC_HOOK_RETURN_DONE; } template @@ -3432,24 +3404,21 @@ namespace ttg_parsec { if (data->owner_device != 0) { /* find the flow */ int flowidx = 0; - while (flowidx < MAX_PARAM_COUNT && - gpu_task->flow[flowidx]->flow_flags != PARSEC_FLOW_ACCESS_NONE) { + while (flowidx < gpu_task->nb_flows) { if (detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in->original == data) { /* found the right data, set the corresponding flow as pushout */ break; } ++flowidx; } - if (flowidx == MAX_PARAM_COUNT) { - throw std::runtime_error("Cannot add more than MAX_PARAM_COUNT flows to a task!"); - } - if (gpu_task->flow[flowidx]->flow_flags == PARSEC_FLOW_ACCESS_NONE) { + assert(flowidx < gpu_task->nb_flows); + if (gpu_task->flow_info[flowidx].flow->flow_flags == PARSEC_FLOW_ACCESS_NONE) { /* no flow found, add one and mark it pushout */ detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in = data->device_copies[0]; - gpu_task->flow_nb_elts[flowidx] = data->nb_elts; + gpu_task->flow_info[flowidx].flow_span = data->span; } - /* need to mark the flow WRITE to convince PaRSEC that the data changed */ - ((parsec_flow_t *)gpu_task->flow[flowidx])->flow_flags |= PARSEC_FLOW_ACCESS_WRITE; + /* need to mark the flow RW to make PaRSEC happy */ + ((parsec_flow_t *)gpu_task->flow_info[flowidx].flow)->flow_flags |= PARSEC_FLOW_ACCESS_WRITE; gpu_task->pushout |= 1<