From 3b16615966827427ca7d89c2e0341ee570860d0c Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 23 Jan 2024 12:28:54 +0100 Subject: [PATCH 1/7] changes for device mem alloc --- include/dr/mhp/algorithms/sort.hpp | 279 +++++++++++++++++++++-------- test/gtest/mhp/CMakeLists.txt | 3 +- 2 files changed, 207 insertions(+), 75 deletions(-) diff --git a/include/dr/mhp/algorithms/sort.hpp b/include/dr/mhp/algorithms/sort.hpp index 4c9b671908..825757f314 100644 --- a/include/dr/mhp/algorithms/sort.hpp +++ b/include/dr/mhp/algorithms/sort.hpp @@ -26,14 +26,103 @@ namespace dr::mhp { namespace __detail { -template -void local_sort(R &r, Compare &&comp) { +template class buffer { +public: + using value_type = T; + std::size_t size() { return size_; } + T *data() { return data_; } + T *begin() { return data_; } + T *end() { return data_ + size_; } + + T *resize(std::size_t cnt) { + assert(cnt >= size_); + if (cnt == size_) + return data_; + + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + T *newdata = sycl::malloc(cnt, sycl_queue(), sycl_mem_kind()); + assert(newdata != nullptr); + sycl_queue().copy(data_, newdata, size_); + assert(sycl_get(*data_) == + sycl_get(*newdata)); /* surprisingly is helpful in case of mem mgmt + issues */ + sycl::free(data_, sycl_queue()); + data_ = newdata; +#else + assert(false); +#endif + } else { + T *newdata = static_cast(malloc(cnt * sizeof(T))); + memcpy(newdata, data_, size_ * sizeof(T)); + free(data_); + data_ = newdata; + } + size_ = cnt; + return data_; + } + + void replace(buffer &other) { + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + if (data_ != nullptr) + sycl::free(data_, sycl_queue()); +#else + assert(false); +#endif + } else { + if (data_ != nullptr) + free(data_); + } + data_ = rng::data(other); + size_ = rng::size(other); + other.data_ = nullptr; + other.size_ = 0; + } + + ~buffer() { + if (data_ != nullptr) { + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + sycl::free(this->data_, sycl_queue()); +#else + assert(false); +#endif + } else { + free(data_); + } + } + data_ = nullptr; + size_ = 0; + } + + buffer(std::size_t cnt) : size_(cnt) { + if (cnt > 0) { + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + data_ = sycl::malloc(cnt, sycl_queue(), sycl_mem_kind()); +#else + assert(false); +#endif + } else { + data_ = static_cast(malloc(cnt * sizeof(T))); + } + assert(data_ != nullptr); + } + } + +private: + T *data_ = nullptr; + std::size_t size_ = 0; +}; // class buffer + +template void local_sort(R &r, Compare &&comp) { if (rng::size(r) >= 2) { if (mhp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION auto policy = dpl_policy(); auto &&local_segment = dr::ranges::__detail::local(r); - drlog.debug("GPU dpl::sort(), size {}\n", rng::size(r)); + DRLOG("GPU dpl::sort(), size {}\n", rng::size(r)); oneapi::dpl::sort( policy, dr::__detail::direct_iterator(rng::begin(local_segment)), dr::__detail::direct_iterator(rng::end(local_segment)), comp); @@ -41,7 +130,7 @@ void local_sort(R &r, Compare &&comp) { assert(false); #endif } else { - drlog.debug("cpu rng::sort, size {}\n", rng::size(r)); + DRLOG("cpu rng::sort, size {}\n", rng::size(r)); rng::sort(rng::begin(r), rng::end(r), comp); } } @@ -66,85 +155,145 @@ void splitters(Seg &lsegment, Compare &&comp, /* calculate splitting values and indices - find n-1 dividers splitting * each segment into equal parts */ - - for (std::size_t _i = 0; _i < rng::size(vec_lmedians); _i++) { - vec_lmedians[_i] = lsegment[_i * _step_m]; + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + for (std::size_t _i = 0; _i < rng::size(vec_lmedians) - 1; _i++) { + assert(_i * _step_m < rng::size(lsegment)); + sycl_copy(&lsegment[_i * _step_m], &vec_lmedians[_i]); + } + sycl_copy(&lsegment[rng::size(lsegment) - 1], + &vec_lmedians[rng::size(vec_lmedians) - 1]); +#else + assert(false); +#endif + } else { + for (std::size_t _i = 0; _i < rng::size(vec_lmedians) - 1; _i++) { + assert(_i * _step_m < rng::size(lsegment)); + vec_lmedians[_i] = lsegment[_i * _step_m]; + } + vec_lmedians.back() = lsegment.back(); } - vec_lmedians.back() = lsegment.back(); default_comm().all_gather(vec_lmedians, vec_gmedians); - rng::sort(rng::begin(vec_gmedians), rng::end(vec_gmedians), comp); std::vector vec_split_v(_comm_size - 1); for (std::size_t _i = 0; _i < _comm_size - 1; _i++) { + assert((_i + 1) * (_comm_size + 1) - 1 < rng::size(vec_gmedians)); vec_split_v[_i] = vec_gmedians[(_i + 1) * (_comm_size + 1) - 1]; } std::size_t segidx = 0, vidx = 1; - while (vidx < _comm_size && segidx < rng::size(lsegment)) { - if (comp(vec_split_v[vidx - 1], *(lsegment.begin() + segidx))) { - vec_split_i[vidx] = segidx; - vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; - vidx++; - } else { - segidx++; + // auto begin = std::chrono::high_resolution_clock::now(); + + /* TODO: copy and loop below takes most of time of the whole sort procedure; + * move it to the SYCL kernel */ + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + std::vector vec_lseg_tmp(rng::size(lsegment)); + sycl_copy(rng::data(lsegment), rng::data(vec_lseg_tmp), + rng::size(lsegment)); + + while (vidx < _comm_size && segidx < rng::size(lsegment)) { + if (comp(vec_split_v[vidx - 1], vec_lseg_tmp[segidx])) { + vec_split_i[vidx] = segidx; + vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; + vidx++; + } else { + segidx++; + } + } +#else + assert(false); +#endif + } else { + while (vidx < _comm_size && segidx < rng::size(lsegment)) { + if (comp(vec_split_v[vidx - 1], lsegment[segidx])) { + vec_split_i[vidx] = segidx; + vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; + vidx++; + } else { + segidx++; + } } } assert(rng::size(lsegment) > vec_split_i[vidx - 1]); vec_split_s[vidx - 1] = rng::size(lsegment) - vec_split_i[vidx - 1]; + + // auto end = std::chrono::high_resolution_clock::now(); + // fmt::print("{}: splitters 3 duration {} ms\n", default_comm().rank(), + // std::chrono::duration(end - begin).count() * 1000); } -template +template void shift_data(const int shift_left, const int shift_right, - std::vector &vec_recvdata, - std::vector &vec_left, - std::vector &vec_right) { + buffer &vec_recvdata, buffer &vec_left, + buffer &vec_right) { const std::size_t _comm_rank = default_comm().rank(); MPI_Request req_l, req_r; MPI_Status stat_l, stat_r; + assert(static_cast(rng::size(vec_left)) == std::max(0, shift_left)); + assert(static_cast(rng::size(vec_right)) == std::max(0, shift_right)); + if (static_cast(rng::size(vec_recvdata)) < -shift_left) { - // Too little data in recv buffer to shift left - first get from right, then - // send left - drlog.debug("Get from right first, recvdata size {} shl {}\n", - rng::size(vec_recvdata), -shift_left); + // Too little data in recv buffer to shift left - first get from right, + // then send left + DRLOG("Get from right first, recvdata size {} shift left {} \n", + rng::size(vec_recvdata), shift_left); // ** This will never happen, because values eq to split go left ** assert(false); } else if (static_cast(rng::size(vec_recvdata)) < -shift_right) { - // Too little data in buffer to shift right - first get from left, then send - // right + // Too little data in buffer to shift right - first get from left, then + // send right assert(shift_left > 0); - default_comm().irecv(vec_left, _comm_rank - 1, &req_l); + + default_comm().irecv(rng::data(vec_left), rng::size(vec_left), + _comm_rank - 1, &req_l); MPI_Wait(&req_l, &stat_l); - vec_left.insert(vec_left.end(), vec_recvdata.begin(), vec_recvdata.end()); - std::swap(vec_left, vec_recvdata); - vec_left.clear(); + vec_left.resize(shift_left + rng::size(vec_recvdata)); - default_comm().isend((valT *)(vec_recvdata.data()) + - rng::size(vec_recvdata) + shift_right, - -shift_right, _comm_rank + 1, &req_r); + if (mhp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + sycl_queue().copy(rng::data(vec_recvdata), + rng::data(vec_left) + shift_left, + rng::size(vec_recvdata)); +#else + assert(false); +#endif + } else { + memcpy(rng::data(vec_left) + shift_left, rng::data(vec_recvdata), + rng::size(vec_recvdata)); + } + vec_recvdata.replace(vec_left); + + default_comm().isend(vec_recvdata.end() + shift_right, -shift_right, + _comm_rank + 1, &req_r); MPI_Wait(&req_r, &stat_r); } else { // enough data in recv buffer if (shift_left < 0) { - default_comm().isend(vec_recvdata.data(), -shift_left, _comm_rank - 1, + default_comm().isend(rng::data(vec_recvdata), -shift_left, _comm_rank - 1, &req_l); } else if (shift_left > 0) { - default_comm().irecv(vec_left, _comm_rank - 1, &req_l); + assert(shift_left == static_cast(rng::size(vec_left))); + default_comm().irecv(rng::data(vec_left), rng::size(vec_left), + _comm_rank - 1, &req_l); } if (shift_right > 0) { - default_comm().irecv(vec_right, _comm_rank + 1, &req_r); + assert(shift_right == static_cast(rng::size(vec_right))); + default_comm().irecv(rng::data(vec_right), rng::size(vec_right), + _comm_rank + 1, &req_r); } else if (shift_right < 0) { - default_comm().isend((valT *)(vec_recvdata.data()) + - rng::size(vec_recvdata) + shift_right, + default_comm().isend(rng::data(vec_recvdata) + rng::size(vec_recvdata) + + shift_right, -shift_right, _comm_rank + 1, &req_r); } @@ -155,11 +304,10 @@ void shift_data(const int shift_left, const int shift_right, } } -template +template void copy_results(auto &lsegment, const int shift_left, const int shift_right, - std::vector &vec_recvdata, - std::vector &vec_left, - std::vector &vec_right) { + buffer &vec_recvdata, buffer &vec_left, + buffer &vec_right) { const std::size_t invalidate_left = std::max(-shift_left, 0); const std::size_t invalidate_right = std::max(-shift_right, 0); @@ -173,12 +321,12 @@ void copy_results(auto &lsegment, const int shift_left, const int shift_right, sycl::event e_l, e_d, e_r; if (size_l > 0) - e_l = sycl_queue().copy(vec_left.data(), lsegment.data(), size_l); + e_l = sycl_queue().copy(rng::data(vec_left), rng::data(lsegment), size_l); if (size_r > 0) - e_r = sycl_queue().copy(vec_right.data(), - lsegment.data() + size_l + size_d, size_r); - e_d = sycl_queue().copy(vec_recvdata.data() + invalidate_left, - lsegment.data() + size_l, size_d); + e_r = sycl_queue().copy(rng::data(vec_right), + rng::data(lsegment) + size_l + size_d, size_r); + e_d = sycl_queue().copy(rng::data(vec_recvdata) + invalidate_left, + rng::data(lsegment) + size_l, size_d); if (size_l > 0) e_l.wait(); if (size_r > 0) @@ -190,12 +338,14 @@ void copy_results(auto &lsegment, const int shift_left, const int shift_right, #endif } else { if (size_l > 0) - std::memcpy(lsegment.data(), vec_left.data(), size_l * sizeof(valT)); + std::memcpy(rng::data(lsegment), rng::data(vec_left), + size_l * sizeof(valT)); if (size_r > 0) - std::memcpy(lsegment.data() + size_l + size_d, vec_right.data(), + std::memcpy(rng::data(lsegment) + size_l + size_d, rng::data(vec_right), size_r * sizeof(valT)); - std::memcpy(lsegment.data() + size_l, vec_recvdata.data() + invalidate_left, + std::memcpy(rng::data(lsegment) + size_l, + rng::data(vec_recvdata) + invalidate_left, size_d * sizeof(valT)); } } @@ -208,11 +358,6 @@ void dist_sort(R &r, Compare &&comp) { const std::size_t _comm_rank = default_comm().rank(); const std::size_t _comm_size = default_comm().size(); // dr-style ignore -#ifdef SYCL_LANGUAGE_VERSION - auto policy = dpl_policy(); - sycl::usm_allocator alloc(policy.queue()); -#endif - auto &&lsegment = local_segment(r); std::vector vec_split_i(_comm_size, 0); @@ -226,13 +371,11 @@ void dist_sort(R &r, Compare &&comp) { /* find splitting values - limits of areas to send to other processes */ __detail::splitters(lsegment, comp, vec_split_i, vec_split_s); - default_comm().alltoall(vec_split_s, vec_rsizes, 1); /* prepare data to send and receive */ std::exclusive_scan(vec_rsizes.begin(), vec_rsizes.end(), vec_rindices.begin(), 0); - const std::size_t _recv_elems = vec_rindices.back() + vec_rsizes.back(); /* send and receive data belonging to each node, then redistribute @@ -242,11 +385,7 @@ void dist_sort(R &r, Compare &&comp) { default_comm().i_all_gather(_recv_elems, vec_recv_elems, &req_recvelems); /* buffer for received data */ -#ifdef SYCL_LANGUAGE_VERSION - std::vector vec_recvdata(_recv_elems, alloc); -#else - std::vector vec_recvdata(_recv_elems); -#endif + buffer vec_recvdata(_recv_elems); /* send data not belonging and receive data belonging to local processes */ @@ -275,13 +414,8 @@ void dist_sort(R &r, Compare &&comp) { const int shift_right = _comm_rank == _comm_size - 1 ? 0 : vec_shift[_comm_rank]; -#ifdef SYCL_LANGUAGE_VERSION - std::vector vec_left(std::max(shift_left, 0), alloc); - std::vector vec_right(std::max(shift_right, 0), alloc); -#else - std::vector vec_left(std::max(shift_left, 0)); - std::vector vec_right(std::max(shift_right, 0)); -#endif + buffer vec_left(std::max(shift_left, 0)); + buffer vec_right(std::max(shift_right, 0)); /* shift data if necessary, to have exactly the number of elements equal to * lsegment size */ @@ -291,6 +425,7 @@ void dist_sort(R &r, Compare &&comp) { /* copy results to distributed vector's local segment */ __detail::copy_results(lsegment, shift_left, shift_right, vec_recvdata, vec_left, vec_right); + } // __detail::dist_sort } // namespace __detail @@ -304,8 +439,6 @@ void sort(R &r, Compare &&comp = Compare()) { std::size_t _comm_size = default_comm().size(); // dr-style ignore if (_comm_size == 1) { - drlog.debug("mhp::sort() - single node\n"); - auto &&lsegment = local_segment(r); __detail::local_sort(lsegment, comp); @@ -313,7 +446,7 @@ void sort(R &r, Compare &&comp = Compare()) { /* Distributed vector of size <= (comm_size-1) * (comm_size-1) may have * 0-size local segments. It is also small enough to prefer sequential sort */ - drlog.debug("mhp::sort() - local sort\n"); + DRLOG("mhp::sort() - local sort\n"); std::vector vec_recvdata(rng::size(r)); dr::mhp::copy(0, r, rng::begin(vec_recvdata)); @@ -325,7 +458,7 @@ void sort(R &r, Compare &&comp = Compare()) { dr::mhp::copy(0, vec_recvdata, rng::begin(r)); } else { - drlog.debug("mhp::sort() - dist sort\n"); + DRLOG("mhp::sort() - distributed sort\n"); __detail::dist_sort(r, comp); dr::mhp::barrier(); } diff --git a/test/gtest/mhp/CMakeLists.txt b/test/gtest/mhp/CMakeLists.txt index 181dba0755..3dbebd7f7f 100644 --- a/test/gtest/mhp/CMakeLists.txt +++ b/test/gtest/mhp/CMakeLists.txt @@ -99,8 +99,7 @@ if(ENABLE_SYCL) add_mhp_ctest(NAME mhp-quick-test NPROC 1 SYCL) add_mhp_ctest(NAME mhp-quick-test NPROC 2 SYCL) - # DRA-17 for fixing sort, DRA-18 for fixing ExclScan - set(sycl-offload-exclusions ${sycl-exclusions}:Sort*) + set(sycl-offload-exclusions ${sycl-exclusions}) add_mhp_ctest( NAME mhp-tests NPROC 2 OFFLOAD SYCL TARGS --device-memory From 0075ef8450de8c55b432eb51be42fe4c6c508edb Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Thu, 25 Jan 2024 14:41:59 +0100 Subject: [PATCH 2/7] sycl kernel and buffers --- benchmarks/gbench/mhp/CMakeLists.txt | 8 +- include/dr/mhp/algorithms/sort.hpp | 159 ++++++++++++++++++++------- include/dr/mhp/sycl_support.hpp | 5 + test/gtest/mhp/CMakeLists.txt | 2 +- 4 files changed, 132 insertions(+), 42 deletions(-) diff --git a/benchmarks/gbench/mhp/CMakeLists.txt b/benchmarks/gbench/mhp/CMakeLists.txt index 6de22c4ca3..4d9d45e881 100644 --- a/benchmarks/gbench/mhp/CMakeLists.txt +++ b/benchmarks/gbench/mhp/CMakeLists.txt @@ -36,7 +36,7 @@ endif() # builds much faster. Change the source files to match what you need to test. It # is OK to commit changes to the source file list. add_executable(mhp-quick-bench mhp-bench.cpp - ../common/inclusive_exclusive_scan.cpp) + ../common/sort.cpp) foreach(mhp-bench-exec IN ITEMS mhp-bench mhp-quick-bench) target_compile_definitions(${mhp-bench-exec} PRIVATE BENCH_MHP) @@ -47,9 +47,9 @@ foreach(mhp-bench-exec IN ITEMS mhp-bench mhp-quick-bench) endif() endforeach() -if(ENABLE_SYCL) - target_sources(mhp-quick-bench PRIVATE fft3d.cpp) -endif() +# if(ENABLE_SYCL) +# target_sources(mhp-quick-bench PRIVATE fft3d.cpp) +# endif() cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL) diff --git a/include/dr/mhp/algorithms/sort.hpp b/include/dr/mhp/algorithms/sort.hpp index 825757f314..a86423172d 100644 --- a/include/dr/mhp/algorithms/sort.hpp +++ b/include/dr/mhp/algorithms/sort.hpp @@ -31,6 +31,7 @@ template class buffer { using value_type = T; std::size_t size() { return size_; } T *data() { return data_; } + T *cdata() const { return data_; } T *begin() { return data_; } T *end() { return data_ + size_; } @@ -177,54 +178,127 @@ void splitters(Seg &lsegment, Compare &&comp, default_comm().all_gather(vec_lmedians, vec_gmedians); rng::sort(rng::begin(vec_gmedians), rng::end(vec_gmedians), comp); - std::vector vec_split_v(_comm_size - 1); + fmt::print("{}:{} problem start\n", default_comm().rank(), __LINE__); - for (std::size_t _i = 0; _i < _comm_size - 1; _i++) { - assert((_i + 1) * (_comm_size + 1) - 1 < rng::size(vec_gmedians)); - vec_split_v[_i] = vec_gmedians[(_i + 1) * (_comm_size + 1) - 1]; - } - - std::size_t segidx = 0, vidx = 1; - - // auto begin = std::chrono::high_resolution_clock::now(); +#ifdef SYCL_LANGUAGE_VERSION + +#else + +#endif - /* TODO: copy and loop below takes most of time of the whole sort procedure; - * move it to the SYCL kernel */ +// for (std::size_t _i = 0; _i < _comm_size - 1; _i++) { +// assert((_i + 1) * (_comm_size + 1) - 1 < rng::size(vec_gmedians)); +// #ifdef SYCL_LANGUAGE_VERSION +// sycl_copy(&vec_gmedians[(_i + 1) * (_comm_size + 1) - 1], vec_split_v.data() + _i); +// #else +// vec_split_v[_i] = vec_gmedians[(_i + 1) * (_comm_size + 1) - 1]; +// #endif +// } + + + auto begin = std::chrono::high_resolution_clock::now(); + auto end = std::chrono::high_resolution_clock::now(); + + /* TODO: copy and loop below takes most of time of the whole sort + * procedure; move it to the SYCL kernel */ if (mhp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION - std::vector vec_lseg_tmp(rng::size(lsegment)); - sycl_copy(rng::data(lsegment), rng::data(vec_lseg_tmp), - rng::size(lsegment)); - - while (vidx < _comm_size && segidx < rng::size(lsegment)) { - if (comp(vec_split_v[vidx - 1], vec_lseg_tmp[segidx])) { - vec_split_i[vidx] = segidx; - vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; - vidx++; - } else { - segidx++; - } + fmt::print("{}:{} sycl start\n", default_comm().rank(), __LINE__); + + __detail::buffer dbuf_v(_comm_size - 1); + __detail::buffer dbuf_i(rng::size(vec_split_i)); + __detail::buffer dbuf_s(rng::size(vec_split_s)); + + for (std::size_t _i = 0; _i < _comm_size - 1; _i++) { + assert((_i + 1) * (_comm_size + 1) - 1 < rng::size(vec_gmedians)); + sycl_copy(&vec_gmedians[(_i + 1) * (_comm_size + 1) - 1], + dbuf_v.data() + _i); + } + + sycl_copy(vec_split_i.data(), dbuf_i.data(), + vec_split_i.size()); + sycl_copy(vec_split_s.data(), dbuf_s.data(), + vec_split_s.size()); + + sycl::buffer buf_i(dbuf_i.data(), sycl::range{dbuf_i.size()}); + sycl::buffer buf_s(dbuf_s.data(), sycl::range{dbuf_s.size()}); + sycl::buffer buf_v(dbuf_v.data(), sycl::range(rng::size(dbuf_v))); + sycl::buffer buf_lsegment(lsegment); + + // end = std::chrono::high_resolution_clock::now(); + // fmt::print("{}: splitters 0 duration {} ms\n", default_comm().rank(), + // std::chrono::duration(end - begin).count() * 1000); + begin = std::chrono::high_resolution_clock::now(); + + sycl_queue() + .submit([&](sycl::handler &h) { + sycl::accessor acc_i{buf_i, h, sycl::read_write}; + sycl::accessor acc_s{buf_s, h, sycl::write_only}; + sycl::accessor acc_v{buf_v, h, sycl::read_write}; + sycl::accessor acc_ls{buf_lsegment, h, sycl::read_only}; + + auto ls_size = rng::size(lsegment); + h.single_task([=]() { + sycl::opencl::cl_ulong vidx0 = 0, vidx1 = 1; + sycl::opencl::cl_ulong segidx = 0; + while (vidx1 < _comm_size && segidx < ls_size) { + if (comp(acc_v[vidx1 - 1], acc_ls[segidx])) { + acc_i[vidx1] = segidx; + acc_s[vidx0] = acc_i[vidx1] - acc_i[vidx0]; + vidx1++; + vidx0++; + } else { + segidx++; + } + } + acc_s[vidx0] = ls_size - acc_i[vidx0]; + }); + }) + .wait(); + + end = std::chrono::high_resolution_clock::now(); + fmt::print("{}: splitters 2 duration {} ms\n", default_comm().rank(), + std::chrono::duration(end - begin).count() * 1000); + begin = std::chrono::high_resolution_clock::now(); + + sycl::host_accessor res_i{buf_i}, res_s{buf_s}; + + for (int _i = 0; _i < rng::size(vec_split_i); _i++) { + vec_split_i[_i] = sycl_get(res_i[_i]); + vec_split_s[_i] = sycl_get(res_s[_i]); } + + // fmt::print("{}:{} after kernel s {} i {}\n", default_comm().rank(), + // __LINE__, vec_split_s, vec_split_i); #else assert(false); #endif - } else { - while (vidx < _comm_size && segidx < rng::size(lsegment)) { - if (comp(vec_split_v[vidx - 1], lsegment[segidx])) { - vec_split_i[vidx] = segidx; - vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; - vidx++; - } else { - segidx++; + } + else { + std::vector vec_split_v(_comm_size - 1); + for (std::size_t _i = 0; _i < _comm_size - 1; _i++) { + assert((_i + 1) * (_comm_size + 1) - 1 < rng::size(vec_gmedians)); + vec_split_v[_i] = vec_gmedians[(_i + 1) * (_comm_size + 1) - 1]; + } + + std::size_t segidx = 0, vidx = 1; + while (vidx < _comm_size && segidx < rng::size(lsegment)) { + if (comp(vec_split_v.data()[vidx - 1], lsegment[segidx])) { + vec_split_i[vidx] = segidx; + vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; + vidx++; + } else { + segidx++; + } } + assert(rng::size(lsegment) > vec_split_i[vidx - 1]); + vec_split_s[vidx - 1] = rng::size(lsegment) - vec_split_i[vidx - 1]; } - } - assert(rng::size(lsegment) > vec_split_i[vidx - 1]); - vec_split_s[vidx - 1] = rng::size(lsegment) - vec_split_i[vidx - 1]; - // auto end = std::chrono::high_resolution_clock::now(); - // fmt::print("{}: splitters 3 duration {} ms\n", default_comm().rank(), - // std::chrono::duration(end - begin).count() * 1000); + end = std::chrono::high_resolution_clock::now(); + fmt::print("{}: splitters 3 duration {} ms\n", default_comm().rank(), + std::chrono::duration(end - begin).count() * 1000); + fmt::print("{}:{} splitters done\n", default_comm().rank(), __LINE__); } template @@ -419,10 +493,14 @@ void dist_sort(R &r, Compare &&comp) { /* shift data if necessary, to have exactly the number of elements equal to * lsegment size */ + // fmt::print("{}:{} before shift\n", default_comm().rank(), __LINE__); + __detail::shift_data(shift_left, shift_right, vec_recvdata, vec_left, vec_right); /* copy results to distributed vector's local segment */ + // fmt::print("{}:{} before copy\n", default_comm().rank(), __LINE__); + __detail::copy_results(lsegment, shift_left, shift_right, vec_recvdata, vec_left, vec_right); @@ -470,3 +548,10 @@ void sort(RandomIt first, RandomIt last, Compare comp = Compare()) { } } // namespace dr::mhp + +template +struct sycl::is_device_copyable> : sycl::is_device_copyable {}; + +template <> +struct sycl::is_device_copyable> + : sycl::is_device_copyable {}; diff --git a/include/dr/mhp/sycl_support.hpp b/include/dr/mhp/sycl_support.hpp index 528f64ccb8..21406e6432 100644 --- a/include/dr/mhp/sycl_support.hpp +++ b/include/dr/mhp/sycl_support.hpp @@ -94,4 +94,9 @@ template void sycl_copy(T *src, T *dst, std::size_t size = 1) { sycl_copy(src, src + size, dst); } +template void sycl_copy(const T *src, const T *dst, std::size_t size = 1) { + sycl_copy(src, src + size, dst); +} + + } // namespace dr::mhp::__detail diff --git a/test/gtest/mhp/CMakeLists.txt b/test/gtest/mhp/CMakeLists.txt index 3dbebd7f7f..bd270a6973 100644 --- a/test/gtest/mhp/CMakeLists.txt +++ b/test/gtest/mhp/CMakeLists.txt @@ -56,7 +56,7 @@ add_executable( add_executable(mhp-quick-test mhp-tests.cpp - ../common/inclusive_scan.cpp + ../common/sort.cpp ) # cmake-format: on From 4bd4da80123825a5e2ec74cb08a67461185ad88f Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Thu, 25 Jan 2024 16:49:40 +0100 Subject: [PATCH 3/7] messages --- benchmarks/gbench/mhp/CMakeLists.txt | 7 +- include/dr/mhp/algorithms/sort.hpp | 116 +++++++++++++-------------- include/dr/mhp/sycl_support.hpp | 4 +- 3 files changed, 61 insertions(+), 66 deletions(-) diff --git a/benchmarks/gbench/mhp/CMakeLists.txt b/benchmarks/gbench/mhp/CMakeLists.txt index 4d9d45e881..294a862eaf 100644 --- a/benchmarks/gbench/mhp/CMakeLists.txt +++ b/benchmarks/gbench/mhp/CMakeLists.txt @@ -35,8 +35,7 @@ endif() # mhp-quick-bench is for development. By reducing the number of source files, it # builds much faster. Change the source files to match what you need to test. It # is OK to commit changes to the source file list. -add_executable(mhp-quick-bench mhp-bench.cpp - ../common/sort.cpp) +add_executable(mhp-quick-bench mhp-bench.cpp ../common/sort.cpp) foreach(mhp-bench-exec IN ITEMS mhp-bench mhp-quick-bench) target_compile_definitions(${mhp-bench-exec} PRIVATE BENCH_MHP) @@ -47,9 +46,7 @@ foreach(mhp-bench-exec IN ITEMS mhp-bench mhp-quick-bench) endif() endforeach() -# if(ENABLE_SYCL) -# target_sources(mhp-quick-bench PRIVATE fft3d.cpp) -# endif() +# if(ENABLE_SYCL) target_sources(mhp-quick-bench PRIVATE fft3d.cpp) endif() cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL) diff --git a/include/dr/mhp/algorithms/sort.hpp b/include/dr/mhp/algorithms/sort.hpp index a86423172d..71c91cd011 100644 --- a/include/dr/mhp/algorithms/sort.hpp +++ b/include/dr/mhp/algorithms/sort.hpp @@ -142,7 +142,7 @@ template void splitters(Seg &lsegment, Compare &&comp, std::vector &vec_split_i, std::vector &vec_split_s) { - + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); const std::size_t _comm_size = default_comm().size(); // dr-style ignore assert(rng::size(vec_split_i) == _comm_size); @@ -158,12 +158,14 @@ void splitters(Seg &lsegment, Compare &&comp, * each segment into equal parts */ if (mhp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); for (std::size_t _i = 0; _i < rng::size(vec_lmedians) - 1; _i++) { assert(_i * _step_m < rng::size(lsegment)); sycl_copy(&lsegment[_i * _step_m], &vec_lmedians[_i]); } sycl_copy(&lsegment[rng::size(lsegment) - 1], &vec_lmedians[rng::size(vec_lmedians) - 1]); + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); #else assert(false); #endif @@ -180,27 +182,11 @@ void splitters(Seg &lsegment, Compare &&comp, fmt::print("{}:{} problem start\n", default_comm().rank(), __LINE__); -#ifdef SYCL_LANGUAGE_VERSION - -#else - -#endif - -// for (std::size_t _i = 0; _i < _comm_size - 1; _i++) { -// assert((_i + 1) * (_comm_size + 1) - 1 < rng::size(vec_gmedians)); -// #ifdef SYCL_LANGUAGE_VERSION -// sycl_copy(&vec_gmedians[(_i + 1) * (_comm_size + 1) - 1], vec_split_v.data() + _i); -// #else -// vec_split_v[_i] = vec_gmedians[(_i + 1) * (_comm_size + 1) - 1]; -// #endif -// } - - auto begin = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now(); - /* TODO: copy and loop below takes most of time of the whole sort - * procedure; move it to the SYCL kernel */ + /* TODO: the loop below takes most of time of the whole sort + * procedure; is optimisation possible? */ if (mhp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION fmt::print("{}:{} sycl start\n", default_comm().rank(), __LINE__); @@ -216,20 +202,17 @@ void splitters(Seg &lsegment, Compare &&comp, } sycl_copy(vec_split_i.data(), dbuf_i.data(), - vec_split_i.size()); + vec_split_i.size()); sycl_copy(vec_split_s.data(), dbuf_s.data(), - vec_split_s.size()); + vec_split_s.size()); sycl::buffer buf_i(dbuf_i.data(), sycl::range{dbuf_i.size()}); sycl::buffer buf_s(dbuf_s.data(), sycl::range{dbuf_s.size()}); sycl::buffer buf_v(dbuf_v.data(), sycl::range(rng::size(dbuf_v))); sycl::buffer buf_lsegment(lsegment); - // end = std::chrono::high_resolution_clock::now(); - // fmt::print("{}: splitters 0 duration {} ms\n", default_comm().rank(), - // std::chrono::duration(end - begin).count() * 1000); begin = std::chrono::high_resolution_clock::now(); - + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); sycl_queue() .submit([&](sycl::handler &h) { sycl::accessor acc_i{buf_i, h, sycl::read_write}; @@ -239,8 +222,8 @@ void splitters(Seg &lsegment, Compare &&comp, auto ls_size = rng::size(lsegment); h.single_task([=]() { - sycl::opencl::cl_ulong vidx0 = 0, vidx1 = 1; - sycl::opencl::cl_ulong segidx = 0; + sycl::opencl::cl_ulong vidx0 = 0, vidx1 = 1, segidx = 0; + while (vidx1 < _comm_size && segidx < ls_size) { if (comp(acc_v[vidx1 - 1], acc_ls[segidx])) { acc_i[vidx1] = segidx; @@ -258,7 +241,7 @@ void splitters(Seg &lsegment, Compare &&comp, end = std::chrono::high_resolution_clock::now(); fmt::print("{}: splitters 2 duration {} ms\n", default_comm().rank(), - std::chrono::duration(end - begin).count() * 1000); + std::chrono::duration(end - begin).count() * 1000); begin = std::chrono::high_resolution_clock::now(); sycl::host_accessor res_i{buf_i}, res_s{buf_s}; @@ -273,27 +256,45 @@ void splitters(Seg &lsegment, Compare &&comp, #else assert(false); #endif + } else { + fmt::print("{}:{} NO SYCL splitters\n", default_comm().rank(),__LINE__); + + std::vector vec_split_v(_comm_size - 1); + for (std::size_t _i = 0; _i < _comm_size - 1; _i++) { + assert((_i + 1) * (_comm_size + 1) - 1 < rng::size(vec_gmedians)); + vec_split_v[_i] = vec_gmedians[(_i + 1) * (_comm_size + 1) - 1]; } - else { - std::vector vec_split_v(_comm_size - 1); - for (std::size_t _i = 0; _i < _comm_size - 1; _i++) { - assert((_i + 1) * (_comm_size + 1) - 1 < rng::size(vec_gmedians)); - vec_split_v[_i] = vec_gmedians[(_i + 1) * (_comm_size + 1) - 1]; - } - std::size_t segidx = 0, vidx = 1; - while (vidx < _comm_size && segidx < rng::size(lsegment)) { - if (comp(vec_split_v.data()[vidx - 1], lsegment[segidx])) { - vec_split_i[vidx] = segidx; - vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; - vidx++; - } else { - segidx++; - } + /* + std::size_t segidx = 0, vidx = 1; + while (vidx < _comm_size && segidx < rng::size(lsegment)) { + if (comp(vec_split_v.data()[vidx - 1], lsegment[segidx])) { + vec_split_i[vidx] = segidx; + vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; + vidx++; + } else { + segidx++; + } + } + */ + std::size_t vidx = 1; + for (std::size_t sidx = 0; sidx < rng::size(lsegment); sidx++) { + while (comp(vec_split_v[vidx - 1], lsegment[sidx])) { + vec_split_i[vidx] = sidx; + vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; + vidx++; + if (vidx == _comm_size) + break; } - assert(rng::size(lsegment) > vec_split_i[vidx - 1]); - vec_split_s[vidx - 1] = rng::size(lsegment) - vec_split_i[vidx - 1]; + if (vidx == _comm_size) + break; } + assert(rng::size(lsegment) > vec_split_i[vidx - 1]); + vec_split_s[vidx - 1] = rng::size(lsegment) - vec_split_i[vidx - 1]; + } + + fmt::print("{}:{} after split s {} i {}\n", default_comm().rank(), + __LINE__, vec_split_s, vec_split_i); end = std::chrono::high_resolution_clock::now(); fmt::print("{}: splitters 3 duration {} ms\n", default_comm().rank(), @@ -444,7 +445,9 @@ void dist_sort(R &r, Compare &&comp) { __detail::local_sort(lsegment, comp); /* find splitting values - limits of areas to send to other processes */ + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); __detail::splitters(lsegment, comp, vec_split_i, vec_split_s); + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); default_comm().alltoall(vec_split_s, vec_rsizes, 1); /* prepare data to send and receive */ @@ -456,8 +459,10 @@ void dist_sort(R &r, Compare &&comp) { * data to achieve size of data equal to size of local segment */ MPI_Request req_recvelems; + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); default_comm().i_all_gather(_recv_elems, vec_recv_elems, &req_recvelems); + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); /* buffer for received data */ buffer vec_recvdata(_recv_elems); @@ -465,20 +470,20 @@ void dist_sort(R &r, Compare &&comp) { */ default_comm().alltoallv(lsegment, vec_split_s, vec_split_i, vec_recvdata, vec_rsizes, vec_rindices); - + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); /* TODO: vec recvdata is partially sorted, implementation of merge on GPU is * desirable */ __detail::local_sort(vec_recvdata, comp); MPI_Wait(&req_recvelems, MPI_STATUS_IGNORE); - + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); _total_elems = std::reduce(vec_recv_elems.begin(), vec_recv_elems.end()); /* prepare data for shift to neighboring processes */ std::vector vec_shift(_comm_size - 1); const auto desired_elems_num = (_total_elems + _comm_size - 1) / _comm_size; - + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); vec_shift[0] = desired_elems_num - vec_recv_elems[0]; for (std::size_t _i = 1; _i < _comm_size - 1; _i++) { vec_shift[_i] = vec_shift[_i - 1] + desired_elems_num - vec_recv_elems[_i]; @@ -493,17 +498,17 @@ void dist_sort(R &r, Compare &&comp) { /* shift data if necessary, to have exactly the number of elements equal to * lsegment size */ - // fmt::print("{}:{} before shift\n", default_comm().rank(), __LINE__); + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); __detail::shift_data(shift_left, shift_right, vec_recvdata, vec_left, vec_right); /* copy results to distributed vector's local segment */ - // fmt::print("{}:{} before copy\n", default_comm().rank(), __LINE__); + fmt::print("{}:{} before copy\n", default_comm().rank()fmt::print("{}:{}\n", default_comm().rank(), __LINE__); __detail::copy_results(lsegment, shift_left, shift_right, vec_recvdata, vec_left, vec_right); - + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); } // __detail::dist_sort } // namespace __detail @@ -547,11 +552,4 @@ void sort(RandomIt first, RandomIt last, Compare comp = Compare()) { sort(rng::subrange(first, last), comp); } -} // namespace dr::mhp - -template -struct sycl::is_device_copyable> : sycl::is_device_copyable {}; - -template <> -struct sycl::is_device_copyable> - : sycl::is_device_copyable {}; +} // namespace dr::mhp \ No newline at end of file diff --git a/include/dr/mhp/sycl_support.hpp b/include/dr/mhp/sycl_support.hpp index 21406e6432..0eef117a68 100644 --- a/include/dr/mhp/sycl_support.hpp +++ b/include/dr/mhp/sycl_support.hpp @@ -94,9 +94,9 @@ template void sycl_copy(T *src, T *dst, std::size_t size = 1) { sycl_copy(src, src + size, dst); } -template void sycl_copy(const T *src, const T *dst, std::size_t size = 1) { +template +void sycl_copy(const T *src, const T *dst, std::size_t size = 1) { sycl_copy(src, src + size, dst); } - } // namespace dr::mhp::__detail From cd0fca6fa1bad2316878749189c2bbc912d4d5a7 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Fri, 26 Jan 2024 12:49:08 +0100 Subject: [PATCH 4/7] update --- include/dr/mhp/algorithms/sort.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/dr/mhp/algorithms/sort.hpp b/include/dr/mhp/algorithms/sort.hpp index 71c91cd011..89d91a55bc 100644 --- a/include/dr/mhp/algorithms/sort.hpp +++ b/include/dr/mhp/algorithms/sort.hpp @@ -124,9 +124,11 @@ template void local_sort(R &r, Compare &&comp) { auto policy = dpl_policy(); auto &&local_segment = dr::ranges::__detail::local(r); DRLOG("GPU dpl::sort(), size {}\n", rng::size(r)); + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); oneapi::dpl::sort( policy, dr::__detail::direct_iterator(rng::begin(local_segment)), dr::__detail::direct_iterator(rng::end(local_segment)), comp); + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); #else assert(false); #endif @@ -135,6 +137,7 @@ template void local_sort(R &r, Compare &&comp) { rng::sort(rng::begin(r), rng::end(r), comp); } } + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); } /* elements of dist_sort */ @@ -474,7 +477,7 @@ void dist_sort(R &r, Compare &&comp) { /* TODO: vec recvdata is partially sorted, implementation of merge on GPU is * desirable */ __detail::local_sort(vec_recvdata, comp); - + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); MPI_Wait(&req_recvelems, MPI_STATUS_IGNORE); fmt::print("{}:{}\n", default_comm().rank(), __LINE__); _total_elems = std::reduce(vec_recv_elems.begin(), vec_recv_elems.end()); @@ -504,7 +507,7 @@ void dist_sort(R &r, Compare &&comp) { vec_right); /* copy results to distributed vector's local segment */ - fmt::print("{}:{} before copy\n", default_comm().rank()fmt::print("{}:{}\n", default_comm().rank(), __LINE__); + fmt::print("{}:{}\n", default_comm().rank(), __LINE__); __detail::copy_results(lsegment, shift_left, shift_right, vec_recvdata, vec_left, vec_right); From f597071678b976624562f81ceada19f6cabe7ba0 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 6 Feb 2024 14:30:36 +0100 Subject: [PATCH 5/7] cleanup --- include/dr/mhp/algorithms/sort.hpp | 65 +++++++++--------------------- 1 file changed, 18 insertions(+), 47 deletions(-) diff --git a/include/dr/mhp/algorithms/sort.hpp b/include/dr/mhp/algorithms/sort.hpp index 89d91a55bc..536a8b78d7 100644 --- a/include/dr/mhp/algorithms/sort.hpp +++ b/include/dr/mhp/algorithms/sort.hpp @@ -124,11 +124,9 @@ template void local_sort(R &r, Compare &&comp) { auto policy = dpl_policy(); auto &&local_segment = dr::ranges::__detail::local(r); DRLOG("GPU dpl::sort(), size {}\n", rng::size(r)); - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); oneapi::dpl::sort( policy, dr::__detail::direct_iterator(rng::begin(local_segment)), dr::__detail::direct_iterator(rng::end(local_segment)), comp); - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); #else assert(false); #endif @@ -137,7 +135,6 @@ template void local_sort(R &r, Compare &&comp) { rng::sort(rng::begin(r), rng::end(r), comp); } } - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); } /* elements of dist_sort */ @@ -145,7 +142,6 @@ template void splitters(Seg &lsegment, Compare &&comp, std::vector &vec_split_i, std::vector &vec_split_s) { - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); const std::size_t _comm_size = default_comm().size(); // dr-style ignore assert(rng::size(vec_split_i) == _comm_size); @@ -157,18 +153,18 @@ void splitters(Seg &lsegment, Compare &&comp, const double _step_m = static_cast(rng::size(lsegment)) / static_cast(_comm_size); + fmt::print("{}:{} splitters start\n", default_comm().rank(), __LINE__); + /* calculate splitting values and indices - find n-1 dividers splitting * each segment into equal parts */ if (mhp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); for (std::size_t _i = 0; _i < rng::size(vec_lmedians) - 1; _i++) { assert(_i * _step_m < rng::size(lsegment)); sycl_copy(&lsegment[_i * _step_m], &vec_lmedians[_i]); } sycl_copy(&lsegment[rng::size(lsegment) - 1], &vec_lmedians[rng::size(vec_lmedians) - 1]); - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); #else assert(false); #endif @@ -183,8 +179,6 @@ void splitters(Seg &lsegment, Compare &&comp, default_comm().all_gather(vec_lmedians, vec_gmedians); rng::sort(rng::begin(vec_gmedians), rng::end(vec_gmedians), comp); - fmt::print("{}:{} problem start\n", default_comm().rank(), __LINE__); - auto begin = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now(); @@ -192,8 +186,6 @@ void splitters(Seg &lsegment, Compare &&comp, * procedure; is optimisation possible? */ if (mhp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION - fmt::print("{}:{} sycl start\n", default_comm().rank(), __LINE__); - __detail::buffer dbuf_v(_comm_size - 1); __detail::buffer dbuf_i(rng::size(vec_split_i)); __detail::buffer dbuf_s(rng::size(vec_split_s)); @@ -215,7 +207,6 @@ void splitters(Seg &lsegment, Compare &&comp, sycl::buffer buf_lsegment(lsegment); begin = std::chrono::high_resolution_clock::now(); - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); sycl_queue() .submit([&](sycl::handler &h) { sycl::accessor acc_i{buf_i, h, sycl::read_write}; @@ -243,9 +234,8 @@ void splitters(Seg &lsegment, Compare &&comp, .wait(); end = std::chrono::high_resolution_clock::now(); - fmt::print("{}: splitters 2 duration {} ms\n", default_comm().rank(), + fmt::print("{}: splitters loop duration {} ms\n", default_comm().rank(), std::chrono::duration(end - begin).count() * 1000); - begin = std::chrono::high_resolution_clock::now(); sycl::host_accessor res_i{buf_i}, res_s{buf_s}; @@ -253,14 +243,11 @@ void splitters(Seg &lsegment, Compare &&comp, vec_split_i[_i] = sycl_get(res_i[_i]); vec_split_s[_i] = sycl_get(res_s[_i]); } - - // fmt::print("{}:{} after kernel s {} i {}\n", default_comm().rank(), - // __LINE__, vec_split_s, vec_split_i); #else assert(false); #endif } else { - fmt::print("{}:{} NO SYCL splitters\n", default_comm().rank(),__LINE__); + fmt::print("{}:{} NO SYCL splitters\n", default_comm().rank(), __LINE__); std::vector vec_split_v(_comm_size - 1); for (std::size_t _i = 0; _i < _comm_size - 1; _i++) { @@ -271,7 +258,7 @@ void splitters(Seg &lsegment, Compare &&comp, /* std::size_t segidx = 0, vidx = 1; while (vidx < _comm_size && segidx < rng::size(lsegment)) { - if (comp(vec_split_v.data()[vidx - 1], lsegment[segidx])) { + if (comp(vec_split_v[vidx - 1], lsegment[segidx])) { vec_split_i[vidx] = segidx; vec_split_s[vidx - 1] = vec_split_i[vidx] - vec_split_i[vidx - 1]; vidx++; @@ -295,15 +282,7 @@ void splitters(Seg &lsegment, Compare &&comp, assert(rng::size(lsegment) > vec_split_i[vidx - 1]); vec_split_s[vidx - 1] = rng::size(lsegment) - vec_split_i[vidx - 1]; } - - fmt::print("{}:{} after split s {} i {}\n", default_comm().rank(), - __LINE__, vec_split_s, vec_split_i); - - end = std::chrono::high_resolution_clock::now(); - fmt::print("{}: splitters 3 duration {} ms\n", default_comm().rank(), - std::chrono::duration(end - begin).count() * 1000); - fmt::print("{}:{} splitters done\n", default_comm().rank(), __LINE__); -} +} // splitters() template void shift_data(const int shift_left, const int shift_right, @@ -380,7 +359,7 @@ void shift_data(const int shift_left, const int shift_right, if (shift_right != 0) MPI_Wait(&req_r, &stat_r); } -} +} // shift_data() template void copy_results(auto &lsegment, const int shift_left, const int shift_right, @@ -426,7 +405,7 @@ void copy_results(auto &lsegment, const int shift_left, const int shift_right, rng::data(vec_recvdata) + invalidate_left, size_d * sizeof(valT)); } -} +} // copy_results template void dist_sort(R &r, Compare &&comp) { @@ -448,9 +427,7 @@ void dist_sort(R &r, Compare &&comp) { __detail::local_sort(lsegment, comp); /* find splitting values - limits of areas to send to other processes */ - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); __detail::splitters(lsegment, comp, vec_split_i, vec_split_s); - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); default_comm().alltoall(vec_split_s, vec_rsizes, 1); /* prepare data to send and receive */ @@ -461,11 +438,12 @@ void dist_sort(R &r, Compare &&comp) { /* send and receive data belonging to each node, then redistribute * data to achieve size of data equal to size of local segment */ - MPI_Request req_recvelems; - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); - default_comm().i_all_gather(_recv_elems, vec_recv_elems, &req_recvelems); + /* async version to consider and test (have issues with MPI_Wait() in the + * current CI)*/ + // MPI_Request req_recvelems; + // default_comm().i_all_gather(_recv_elems, vec_recv_elems, &req_recvelems); + default_comm().all_gather(_recv_elems, vec_recv_elems); - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); /* buffer for received data */ buffer vec_recvdata(_recv_elems); @@ -473,20 +451,18 @@ void dist_sort(R &r, Compare &&comp) { */ default_comm().alltoallv(lsegment, vec_split_s, vec_split_i, vec_recvdata, vec_rsizes, vec_rindices); - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); /* TODO: vec recvdata is partially sorted, implementation of merge on GPU is * desirable */ __detail::local_sort(vec_recvdata, comp); - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); - MPI_Wait(&req_recvelems, MPI_STATUS_IGNORE); - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); + + // MPI_Wait(&req_recvelems, MPI_STATUS_IGNORE); + _total_elems = std::reduce(vec_recv_elems.begin(), vec_recv_elems.end()); /* prepare data for shift to neighboring processes */ std::vector vec_shift(_comm_size - 1); const auto desired_elems_num = (_total_elems + _comm_size - 1) / _comm_size; - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); vec_shift[0] = desired_elems_num - vec_recv_elems[0]; for (std::size_t _i = 1; _i < _comm_size - 1; _i++) { vec_shift[_i] = vec_shift[_i - 1] + desired_elems_num - vec_recv_elems[_i]; @@ -501,17 +477,12 @@ void dist_sort(R &r, Compare &&comp) { /* shift data if necessary, to have exactly the number of elements equal to * lsegment size */ - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); - - __detail::shift_data(shift_left, shift_right, vec_recvdata, vec_left, + __detail::shift_data(shift_left, shift_right, vec_recvdata, vec_left, vec_right); /* copy results to distributed vector's local segment */ - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); - __detail::copy_results(lsegment, shift_left, shift_right, vec_recvdata, vec_left, vec_right); - fmt::print("{}:{}\n", default_comm().rank(), __LINE__); } // __detail::dist_sort } // namespace __detail @@ -555,4 +526,4 @@ void sort(RandomIt first, RandomIt last, Compare comp = Compare()) { sort(rng::subrange(first, last), comp); } -} // namespace dr::mhp \ No newline at end of file +} // namespace dr::mhp From 44074727991efd5f27def2dd14b85f96e5411d64 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 6 Feb 2024 16:10:45 +0100 Subject: [PATCH 6/7] update --- include/dr/mhp/algorithms/sort.hpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/dr/mhp/algorithms/sort.hpp b/include/dr/mhp/algorithms/sort.hpp index 536a8b78d7..92d88b29e7 100644 --- a/include/dr/mhp/algorithms/sort.hpp +++ b/include/dr/mhp/algorithms/sort.hpp @@ -179,8 +179,8 @@ void splitters(Seg &lsegment, Compare &&comp, default_comm().all_gather(vec_lmedians, vec_gmedians); rng::sort(rng::begin(vec_gmedians), rng::end(vec_gmedians), comp); - auto begin = std::chrono::high_resolution_clock::now(); - auto end = std::chrono::high_resolution_clock::now(); + // auto begin = std::chrono::high_resolution_clock::now(); + // auto end = std::chrono::high_resolution_clock::now(); /* TODO: the loop below takes most of time of the whole sort * procedure; is optimisation possible? */ @@ -206,7 +206,6 @@ void splitters(Seg &lsegment, Compare &&comp, sycl::buffer buf_v(dbuf_v.data(), sycl::range(rng::size(dbuf_v))); sycl::buffer buf_lsegment(lsegment); - begin = std::chrono::high_resolution_clock::now(); sycl_queue() .submit([&](sycl::handler &h) { sycl::accessor acc_i{buf_i, h, sycl::read_write}; @@ -233,9 +232,9 @@ void splitters(Seg &lsegment, Compare &&comp, }) .wait(); - end = std::chrono::high_resolution_clock::now(); - fmt::print("{}: splitters loop duration {} ms\n", default_comm().rank(), - std::chrono::duration(end - begin).count() * 1000); + // end = std::chrono::high_resolution_clock::now(); + // fmt::print("{}: splitters loop duration {} ms\n", default_comm().rank(), + // std::chrono::duration(end - begin).count() * 1000); sycl::host_accessor res_i{buf_i}, res_s{buf_s}; From 55948d296ce8a9117696e0361b9d47a2c81f9db8 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Wed, 7 Feb 2024 12:04:26 +0100 Subject: [PATCH 7/7] format fix --- include/dr/mhp/algorithms/sort.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/dr/mhp/algorithms/sort.hpp b/include/dr/mhp/algorithms/sort.hpp index 92d88b29e7..0d1e91c3c3 100644 --- a/include/dr/mhp/algorithms/sort.hpp +++ b/include/dr/mhp/algorithms/sort.hpp @@ -476,7 +476,7 @@ void dist_sort(R &r, Compare &&comp) { /* shift data if necessary, to have exactly the number of elements equal to * lsegment size */ - __detail::shift_data(shift_left, shift_right, vec_recvdata, vec_left, + __detail::shift_data(shift_left, shift_right, vec_recvdata, vec_left, vec_right); /* copy results to distributed vector's local segment */