Commit 0a5fdf2
threaded stream launcher group barrier sync points (#22)
1 parent: a77be09

8 files changed: +138 -26 lines

.github/workflows/build.yml: +4

@@ -1,5 +1,9 @@
 name: ubuntu-latest
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 on:
   push:
     branches: [ master ]

.gitignore: +1

@@ -1,3 +1,4 @@
 bin
 .mkn
 *.cui
+.clangd

README.noformat: +19 -3

@@ -4,24 +4,40 @@ CUDA/HIP C++17 convenience wrappers
 
 ======
 
+Whether you are using CUDA or ROCM, we attempt to deduce from available headers.
+If automatic detection fails, specify appropriate define like `-DMKN_GPU_CUDA=1`
+See: inc/mkn/gpu/defines.hpp
+
 Compile argument switches
 
 Key MKN_GPU_CUDA
 Type bool
 Default 0
-Example mkn cuda profile
 Description activate CUDA as impl of mkn::gpu::*
 
 Key MKN_GPU_ROCM
 Type bool
 Default 0
-Example mkn rocm profile
 Description activate ROCM as impl of mkn::gpu::*
 
 Key MKN_GPU_FN_PER_NS
 Type bool
 Default 0
-Example test/hip/add.cpp or test/cuda/add.cpp
 Description expose functions explicitly via
 mkn::gpu::hip::*
 mkn::gpu::cuda::*
+
+Key _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_
+Type uint
+Default 1
+Description Initial wait time in milliseconds for polling active jobs for completion
+
+Key _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_
+Type uint
+Default 10
+Description Additional wait time in milliseconds for polling active jobs for completion when no job is finished.
+
+Key _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_
+Type uint
+Default 100
+Description Max wait time in milliseconds for polling active jobs for completion when no job is finished.
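
These three keys map to the #ifndef guards added in inc/mkn/gpu/def.hpp below, so they may be overridden at compile time. A minimal sketch of overriding them; the values shown are illustrative, not recommendations:

// via compile flags, e.g. -D_MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_=2
// or in code, before any mkn/gpu header is included:
#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ 2      // poll sooner initially
#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_ 5  // back off more gently
#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_ 50 // cap the poll interval
#include "mkn/gpu/multi_launch.hpp"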

inc/mkn/gpu/def.hpp: +12

@@ -18,6 +18,18 @@ static constexpr bool is_floating_point_v =
 
 #endif
 
+#ifndef _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_
+#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ 1
+#endif /*_MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ */
+
+#ifndef _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_
+#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_ 10
+#endif /*_MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_ */
+
+#ifndef _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_
+#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_ 100
+#endif /*_MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_ */
+
 } /* namespace mkn::gpu */
 
 #endif /*_MKN_GPU_DEF_HPP_*/

inc/mkn/gpu/defines.hpp: -4

@@ -1,10 +1,6 @@
-
-
 #ifndef _MKN_GPU_DEFINES_HPP_
 #define _MKN_GPU_DEFINES_HPP_
 
-#include <type_traits>
-
 #if !defined(MKN_GPU_FN_PER_NS)
 #define MKN_GPU_FN_PER_NS 0
 #endif

inc/mkn/gpu/multi_launch.hpp: +59 -5

@@ -38,8 +38,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <barrier>
 #include <cassert>
 #include <cstdint>
-#include <iostream>
 #include <algorithm>
+#include <stdexcept>
 
 #include "mkn/gpu.hpp"
 
@@ -135,7 +135,6 @@ struct StreamLauncher {
    assert(step < fns.size());
    assert(i < events.size());
 
-   // if (fns[step]->mode == StreamFunctionMode::HOST_WAIT) events[i].stream.sync();
    fns[step]->run(i);
    if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record().wait();
  }
@@ -203,6 +202,55 @@ struct StreamBarrierFunction : StreamFunction<Strat> {
  std::barrier<decltype(on_completion)> sync_point;
};
 
+template <typename Strat>
+struct StreamGroupBarrierFunction : StreamFunction<Strat> {
+  using This = StreamGroupBarrierFunction<Strat>;
+  using Super = StreamFunction<Strat>;
+  using Super::strat;
+
+  std::string_view constexpr static MOD_GROUP_ERROR =
+      "mkn.gpu error: StreamGroupBarrierFunction Group size must be a divisor of datas";
+
+  struct GroupBarrier {
+    This* self;
+    std::uint16_t group_id;
+
+    std::function<void()> on_completion = [this]() {
+      std::size_t const offset = self->group_size * group_id;
+      for (std::size_t i = offset; i < offset + self->group_size; ++i)
+        self->strat.status[i] = SFS::WAIT;
+    };
+
+    std::barrier<decltype(on_completion)> sync_point{static_cast<std::int64_t>(self->group_size),
+                                                     on_completion};
+
+    GroupBarrier(This& slf, std::uint16_t const gid) : self{&slf}, group_id{gid} {}
+    void arrive() { [[maybe_unused]] auto ret = sync_point.arrive(); }
+  };
+
+  static auto make_sync_points(This& self, Strat const& strat, std::size_t const& group_size) {
+    if (strat.datas.size() % group_size > 0) throw std::runtime_error(std::string{MOD_GROUP_ERROR});
+    std::vector<std::unique_ptr<GroupBarrier>> v;
+    std::uint16_t const groups = strat.datas.size() / group_size;
+    v.reserve(groups);
+    for (std::size_t i = 0; i < groups; ++i)
+      v.emplace_back(std::make_unique<GroupBarrier>(self, i));
+    return std::move(v);
+  }
+
+  StreamGroupBarrierFunction(std::size_t const& gs, Strat& strat)
+      : Super{strat, StreamFunctionMode::BARRIER},
+        group_size{gs},
+        sync_points{make_sync_points(*this, strat, group_size)} {}
+
+  void run(std::uint32_t const i) override {
+    sync_points[((i - (i % group_size)) / group_size)]->arrive();
+  }
+
+  std::size_t const group_size;
+  std::vector<std::unique_ptr<GroupBarrier>> sync_points;
+};
+
 template <typename Datas>
 struct ThreadedStreamLauncher : public StreamLauncher<Datas, ThreadedStreamLauncher<Datas>> {
   using This = ThreadedStreamLauncher<Datas>;
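
Each GroupBarrier above wraps a std::barrier whose completion function resets every stream status in its contiguous group to SFS::WAIT, so the streams of one group advance together while other groups run independently; the index expression in run(), (i - (i % group_size)) / group_size, reduces to i / group_size for unsigned i. A self-contained sketch of the same pattern, with plain threads standing in for streams (counts are illustrative, not from the commit):

#include <barrier>
#include <cstddef>
#include <cstdio>
#include <memory>
#include <thread>
#include <vector>

int main() {
  std::size_t const group_size = 3, n = 6;  // 6 "streams": groups {0,1,2} and {3,4,5}
  auto on_completion = []() noexcept { std::puts("group released"); };
  using Barrier = std::barrier<decltype(on_completion)>;
  std::vector<std::unique_ptr<Barrier>> barriers;  // one barrier per group
  for (std::size_t g = 0; g < n / group_size; ++g)
    barriers.emplace_back(std::make_unique<Barrier>(group_size, on_completion));
  std::vector<std::jthread> threads;
  for (std::size_t i = 0; i < n; ++i)  // same mapping as run(): group = i / group_size
    threads.emplace_back([&, i] { barriers[i / group_size]->arrive_and_wait(); });
}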
@@ -211,8 +259,9 @@ struct ThreadedStreamLauncher : public StreamLauncher<Datas, ThreadedStreamLaunc
   using Super::events;
   using Super::fns;
 
-  constexpr static std::size_t wait_ms = 1;
-  constexpr static std::size_t wait_max_ms = 100;
+  constexpr static std::size_t wait_ms = _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_;
+  constexpr static std::size_t wait_add_ms = _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_;
+  constexpr static std::size_t wait_max_ms = _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_;
 
   ThreadedStreamLauncher(Datas& datas, std::size_t const _n_threads = 1,
                          std::size_t const device = 0)
@@ -235,6 +284,11 @@ struct ThreadedStreamLauncher : public StreamLauncher<Datas, ThreadedStreamLaunc
     return *this;
   }
 
+  This& group_barrier(std::size_t const& group_size) {
+    fns.emplace_back(std::make_shared<StreamGroupBarrierFunction<This>>(group_size, *this));
+    return *this;
+  }
+
   void operator()() { join(); }
   Super& super() { return *this; }
   void super(std::size_t const& idx) { return super()(idx); }
@@ -264,7 +318,7 @@ struct ThreadedStreamLauncher : public StreamLauncher<Datas, ThreadedStreamLaunc
     }
 
     std::this_thread::sleep_for(std::chrono::milliseconds(waitms));
-    waitms = waitms >= wait_max_ms ? wait_max_ms : waitms + 10;
+    waitms = waitms >= wait_max_ms ? wait_max_ms : waitms + wait_add_ms;
   }
 }
 
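The polling loop now backs off linearly instead of by a hard-coded 10ms: each idle round sleeps waitms, then grows it by wait_add_ms until wait_max_ms is reached. A minimal sketch of the progression under the default macro values (1, 10, 100); note the committed expression overshoots the cap once before clamping:

#include <cstddef>
#include <cstdio>

int main() {
  std::size_t waitms = 1;                 // _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_
  std::size_t const add = 10, max = 100;  // ..._ADD_ and ..._MAX_ defaults
  for (int round = 0; round < 14; ++round) {
    std::printf("round %d: sleep %zu ms\n", round, waitms);
    // yields 1, 11, 21, ..., 91, 101, then 100 thereafter
    waitms = waitms >= max ? max : waitms + add;
  }
}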

mkn.yaml: -2

@@ -10,13 +10,11 @@ profile:
 
 - name: rocm
   parent: headers
-  # arg: -DMKN_GPU_ROCM=1
   test: test/any/(\w).cpp
         test/hip/(\w).cpp
 
 - name: cuda
   parent: headers
-  # arg: -DMKN_GPU_CUDA
   test: test/any/(\w).cpp
         test/cuda/(\w).cpp
 

test/any/async_streaming.cpp: +43 -12

@@ -1,13 +1,12 @@
 
-#include <cassert>
-#include <chrono>
-#include <iostream>
 #include <thread>
 #include <algorithm>
 
 #include "mkn/kul/dbg.hpp"
+#include "mkn/kul/time.hpp"
 #include "mkn/gpu/multi_launch.hpp"
 
+using namespace mkn::gpu;
 using namespace std::chrono_literals;
 
 std::uint32_t static constexpr NUM = 128 * 1024;  // ~ 1MB of doubles
@@ -21,10 +20,8 @@ struct A {
 };
 
 std::uint32_t test() {
-  using namespace mkn::gpu;
-  using T = double;
-
   KUL_DBG_FUNC_ENTER;
+  using T = double;
 
   std::vector<ManagedVector<T>> vecs(C, ManagedVector<T>(NUM, 0));
   for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i);
@@ -33,13 +30,17 @@ std::uint32_t test() {
   for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data();
   auto views = datas.data();
 
+  auto const start = mkn::kul::Now::MILLIS();
   StreamLauncher{vecs}
       .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; })
       .host([&](auto i) mutable {
        std::this_thread::sleep_for(200ms);
        for (auto& e : vecs[i]) e += 1;
      })
       .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; })();
+  auto const end = mkn::kul::Now::MILLIS();
+
+  if (end - start > 1.5e3) return 1;
 
   std::size_t val = 5;
   for (auto const& vec : vecs) {
@@ -52,10 +53,8 @@ std::uint32_t test() {
 }
 
 std::uint32_t test_threaded(std::size_t const& nthreads = 2) {
-  using namespace mkn::gpu;
-  using T = double;
-
   KUL_DBG_FUNC_ENTER;
+  using T = double;
 
   std::vector<ManagedVector<T>> vecs(C, ManagedVector<T>(NUM, 0));
   for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i);
@@ -64,8 +63,6 @@ std::uint32_t test_threaded(std::size_t const& nthreads = 2) {
   for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data();
   auto views = datas.data();
 
-  using namespace std::chrono_literals;
-
   ThreadedStreamLauncher{vecs, nthreads}
       .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; })
       .host([&](auto i) mutable {
@@ -85,7 +82,41 @@ std::uint32_t test_threaded(std::size_t const& nthreads = 2) {
   return 0;
 }
 
+std::uint32_t test_threaded_group_barrier(std::size_t const& nthreads = 2) {
+  using T = double;
+  KUL_DBG_FUNC_ENTER;
+
+  std::vector<ManagedVector<T>> vecs(C + 1, ManagedVector<T>(NUM, 0));
+  for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i);
+
+  ManagedVector<T*> datas(C + 1);
+  for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data();
+  auto views = datas.data();
+
+  auto const start = mkn::kul::Now::MILLIS();
+  ThreadedStreamLauncher{vecs, nthreads}
+      .dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 1; })
+      .host([&](auto i) mutable {
+        std::this_thread::sleep_for(200ms);
+        for (auto& e : vecs[i]) e += 1;
+      })
+      .group_barrier(3)
+      .dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 3; })();
+  auto const end = mkn::kul::Now::MILLIS();
+
+  if (end - start > 1e3) return 1;
+
+  std::size_t val = 5;
+  for (auto const& vec : vecs) {
+    for (auto const& e : vec)
+      if (e != val) return 1;
+    ++val;
+  };
+
+  return 0;
+}
+
 int main() {
   KOUT(NON) << __FILE__;
-  return test() + test_threaded() + test_threaded(6);
+  return test() + test_threaded() + test_threaded(6) + test_threaded_group_barrier();
 }
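
For reference, the val = 5 expectation in these tests follows from the pipeline arithmetic: vector i starts filled with i, then gains +1 (device), +1 (host), and +3 (device), so every element should finish at i + 5. A host-only sketch of that invariant, with plain std::vector standing in for ManagedVector and illustrative sizes:

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::size_t const n_vecs = 6, n_elems = 8;  // illustrative; the test uses C + 1 and NUM
  std::vector<std::vector<double>> vecs(n_vecs);
  for (std::size_t i = 0; i < n_vecs; ++i)
    vecs[i].assign(n_elems, double(i) + 1 + 1 + 3);  // start at i, then +1, +1, +3
  std::size_t val = 5;  // same check as the tests: vector i holds i + 5
  for (auto const& vec : vecs) {
    for (auto const& e : vec) assert(e == double(val));
    ++val;
  }
  return 0;
}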
