Skip to content

Commit 950a7b9

Browse files
committed
remove redundant overloads by using is_span_like; switch to exposing each GPU impl explicitly
1 parent 8747d99 commit 950a7b9

File tree

14 files changed

+159
-77
lines changed

14 files changed

+159
-77
lines changed

README.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11

2-
# mkn.gpu
2+
3+
# mkn.gpu

README.noformat

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
mkn.gpu
2+
3+
CUDA/HIP C++17 convenience wrappers
4+
5+
======
6+
7+
Compile argument switches
8+
9+
Key KUL_GPU_CUDA
10+
Type bool
11+
Default 0
12+
Example mkn cuda profile
13+
Description activate CUDA as impl of kul::gpu::*
14+
15+
Key KUL_GPU_ROCM
16+
Type bool
17+
Default 0
18+
Example mkn rocm profile
19+
Description activate ROCM as impl of kul::gpu::*
20+
21+
Key KUL_GPU_FN_PER_NS
22+
Type bool
23+
Default 0
24+
Example test/hip/add.cpp or test/cuda/add.cpp
25+
Description expose functions explicitly via
26+
kul::gpu::hip::*
27+
kul::gpu::cuda::*

inc/kul/gpu.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3535
#include "kul/gpu/rocm.hpp"
3636
#elif defined(KUL_GPU_CUDA)
3737
#include "kul/gpu/cuda.hpp"
38-
#else
38+
#elif !defined(KUL_GPU_FN_PER_NS) || KUL_GPU_FN_PER_NS == 0
3939
#error "UNKNOWN GPU / define KUL_GPU_ROCM or KUL_GPU_CUDA"
4040
#endif
4141

inc/kul/gpu/cuda.hpp

+21-26
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3737
#include "kul/assert.hpp"
3838
#include "kul/tuple.hpp"
3939

40-
#include "kul/gpu/cuda/def.hpp"
40+
#include "kul/gpu/def.hpp"
4141

4242
#define KUL_GPU_ASSERT(x) (KASSERT((x) == cudaSuccess))
4343

4444
namespace kul::gpu {
45+
#if defined(KUL_GPU_FN_PER_NS) && KUL_GPU_FN_PER_NS
46+
namespace cuda {
47+
#endif // KUL_GPU_FN_PER_NS
4548

4649
//
4750
void prinfo(size_t dev = 0) {
@@ -59,8 +62,6 @@ void prinfo(size_t dev = 0) {
5962

6063
template <typename T, typename SIZE = uint32_t>
6164
struct DeviceMem {
62-
using Span = kul::Span<T, SIZE>;
63-
using Span_ct = kul::Span<T const, SIZE>;
6465

6566
DeviceMem() {}
6667
DeviceMem(SIZE _s) : s{_s}, owned{true} {
@@ -70,12 +71,8 @@ struct DeviceMem {
7071
}
7172

7273
DeviceMem(T const* const t, SIZE _s) : DeviceMem{_s} { send(t, _s); }
73-
DeviceMem(Span const& s) : DeviceMem{s.data(), s.size()} {}
74-
DeviceMem(Span&& s) : DeviceMem{s} {}
75-
DeviceMem(Span_ct const& s) : DeviceMem{s.data(), s.size()} {}
76-
DeviceMem(Span_ct&& s) : DeviceMem{s} {}
77-
DeviceMem(std::vector<T> const& v) : DeviceMem{&v[0], static_cast<SIZE>(v.size())} {}
78-
DeviceMem(std::vector<T>&& v) : DeviceMem{v} {}
74+
template <typename C, std::enable_if_t<kul::is_span_like_v<C>, bool> = 0>
75+
DeviceMem(C c) : DeviceMem{c.data(), static_cast<SIZE>(c.size())} {}
7976

8077
~DeviceMem() {
8178
if (p && s && owned) KUL_GPU_ASSERT(cudaFree(p));
@@ -84,23 +81,18 @@ struct DeviceMem {
8481
void send(T const* const t, SIZE _size = 1, SIZE start = 0) {
8582
KUL_GPU_ASSERT(cudaMemcpy(p + start, t, _size * sizeof(T), cudaMemcpyHostToDevice));
8683
}
87-
88-
void send(Span const& s, SIZE start = 0) { send(s.data(), s.size(), start); }
89-
void send(Span&& s, SIZE start = 0) { send(s, start); }
90-
91-
void send(Span_ct const& s, SIZE start = 0) { send(s.data(), s.size(), start); }
92-
void send(Span_ct&& s, SIZE start = 0) { send(s, start); }
93-
94-
void send(std::vector<T> const& v, SIZE start = 0) { send(&v[0], v.size(), start); }
95-
void send(std::vector<T>&& v, SIZE start = 0) { send(v, start); }
84+
template <typename C, std::enable_if_t<kul::is_span_like_v<C>, bool> = 0>
85+
void send(C c, SIZE start = 0) {
86+
send(c.data(), c.size(), start);
87+
}
9688

9789
void fill_n(T t, SIZE _size, SIZE start = 0) {
9890
// TODO - improve with memSet style
9991
assert(_size + start <= s);
10092
send(std::vector<T>(_size, t), start);
10193
}
10294

103-
decltype(auto) operator+(size_t size) {
95+
DeviceMem<T> operator+(size_t size) {
10496
DeviceMem<T> view;
10597
view.p = this->p + size;
10698
view.s = this->s - size;
@@ -175,7 +167,7 @@ struct ADeviceClass<false> {
175167
template <bool GPU>
176168
struct DeviceClass : ADeviceClass<GPU> {
177169
template <typename T, typename SIZE = uint32_t>
178-
using container_t = std::conditional_t<GPU, T*, kul::gpu::DeviceMem<T, SIZE>>;
170+
using container_t = std::conditional_t<GPU, T*, DeviceMem<T, SIZE>>;
179171
};
180172

181173
namespace {
@@ -207,26 +199,29 @@ struct Launcher {
207199

208200
template <typename F, typename... Args>
209201
void operator()(F f, Args&&... args) {
210-
kul::gpu::sync();
211-
std::apply([&](auto&&... params) {
212-
f<<<g, b, ds, s>>>(params...);
213-
}, devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
202+
sync();
203+
std::apply([&](auto&&... params) { f<<<g, b, ds, s>>>(params...); },
204+
devmem_replace(std::forward_as_tuple(args...),
205+
std::make_index_sequence<sizeof...(Args)>()));
214206
}
215207
size_t ds = 0 /*dynamicShared*/;
216208
dim3 g /*gridDim*/, b /*blockDim*/;
217209
cudaStream_t s = 0;
218210
};
219211

220212
template <typename T, typename V>
221-
void fill_n(kul::gpu::DeviceMem<T>& p, size_t size, V val) {
213+
void fill_n(DeviceMem<T>& p, size_t size, V val) {
222214
p.fill_n(val, size);
223215
}
224216

225217
template <typename T, typename V>
226-
void fill_n(kul::gpu::DeviceMem<T>&& p, size_t size, V val) {
218+
void fill_n(DeviceMem<T>&& p, size_t size, V val) {
227219
fill_n(p, size, val);
228220
}
229221

222+
#if defined(KUL_GPU_FN_PER_NS) && KUL_GPU_FN_PER_NS
223+
} /* namespace cuda */
224+
#endif // KUL_GPU_FN_PER_NS
230225
} /* namespace kul::gpu */
231226

232227
#undef KUL_GPU_ASSERT

inc/kul/gpu/cuda/def.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#ifndef _KUL_GPU_CUDA_DEF_HPP_
44
#define _KUL_GPU_CUDA_DEF_HPP_
55

6+
#include <cuda_runtime.h>
7+
68
namespace kul::gpu::cuda {
79

810
__device__ uint32_t idx() {

inc/kul/gpu/def.hpp

+4-14
Original file line numberDiff line numberDiff line change
@@ -3,32 +3,22 @@
33
#ifndef _KUL_GPU_DEF_HPP_
44
#define _KUL_GPU_DEF_HPP_
55

6+
#include <type_traits>
67

78
#if defined(KUL_GPU_ROCM)
8-
#include "kul/gpu/rocm.hpp"
9+
#include "kul/gpu/rocm/def.hpp"
910
#elif defined(KUL_GPU_CUDA)
10-
#include "kul/gpu/cuda.hpp"
11-
#else
11+
#include "kul/gpu/cuda/def.hpp"
12+
#elif !defined(KUL_GPU_FN_PER_NS) || KUL_GPU_FN_PER_NS == 0
1213
#error "UNKNOWN GPU / define KUL_GPU_ROCM or KUL_GPU_CUDA"
1314
#endif
1415

15-
1616
namespace kul::gpu {
1717

1818
template <typename T>
1919
static constexpr bool is_floating_point_v =
2020
std::is_floating_point_v<T> or std::is_same_v<_Float16, T>;
2121

22-
__device__ uint32_t idx() {
23-
#if defined(KUL_GPU_ROCM)
24-
return kul::gpu::hip::idx();
25-
#elif defined(KUL_GPU_CUDA)
26-
return kul::gpu::cuda::idx();
27-
#else
28-
#error "UNKNOWN GPU / define KUL_GPU_ROCM or KUL_GPU_CUDA"
29-
#endif
30-
}
31-
3222
} /* namespace kul::gpu */
3323

3424
#endif /*_KUL_GPU_DEF_HPP_*/

inc/kul/gpu/rocm.hpp

+22-27
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3737
#include "kul/assert.hpp"
3838
#include "kul/tuple.hpp"
3939

40-
#include "kul/gpu/rocm/def.hpp"
40+
#include "kul/gpu/def.hpp"
4141

4242
#define KUL_GPU_ASSERT(x) (KASSERT((x) == hipSuccess))
4343

4444
namespace kul::gpu {
45+
#if defined(KUL_GPU_FN_PER_NS) && KUL_GPU_FN_PER_NS
46+
namespace hip {
47+
#endif // KUL_GPU_FN_PER_NS
4548

4649
// https://rocm-developer-tools.github.io/HIP/group__Device.html
4750
void prinfo(size_t dev = 0) {
@@ -59,23 +62,17 @@ void prinfo(size_t dev = 0) {
5962

6063
template <typename T, typename SIZE = uint32_t>
6164
struct DeviceMem {
62-
using Span = kul::Span<T, SIZE>;
63-
using Span_ct = kul::Span<T const, SIZE>;
6465

6566
DeviceMem() {}
6667
DeviceMem(SIZE _s) : s{_s}, owned{true} {
6768
SIZE alloc_bytes = s * sizeof(T);
6869
KLOG(OTH) << "GPU alloced: " << alloc_bytes;
69-
KUL_GPU_ASSERT(hipMalloc((void**)&p, alloc_bytes));
70+
if (s) KUL_GPU_ASSERT(hipMalloc((void**)&p, alloc_bytes));
7071
}
7172

7273
DeviceMem(T const* const t, SIZE _s) : DeviceMem{_s} { send(t, _s); }
73-
DeviceMem(Span const& s) : DeviceMem{s.data(), s.size()} {}
74-
DeviceMem(Span&& s) : DeviceMem{s} {}
75-
DeviceMem(Span_ct const& s) : DeviceMem{s.data(), s.size()} {}
76-
DeviceMem(Span_ct&& s) : DeviceMem{s} {}
77-
DeviceMem(std::vector<T> const& v) : DeviceMem{&v[0], static_cast<SIZE>(v.size())} {}
78-
DeviceMem(std::vector<T>&& v) : DeviceMem{v} {}
74+
template <typename C, std::enable_if_t<kul::is_span_like_v<C>, bool> = 0>
75+
DeviceMem(C c) : DeviceMem{c.data(), static_cast<SIZE>(c.size())} {}
7976

8077
~DeviceMem() {
8178
if (p && s && owned) KUL_GPU_ASSERT(hipFree(p));
@@ -84,23 +81,18 @@ struct DeviceMem {
8481
void send(T const* const t, SIZE _size = 1, SIZE start = 0) {
8582
KUL_GPU_ASSERT(hipMemcpy(p + start, t, _size * sizeof(T), hipMemcpyHostToDevice));
8683
}
87-
88-
void send(Span const& s, SIZE start = 0) { send(s.data(), s.size(), start); }
89-
void send(Span&& s, SIZE start = 0) { send(s, start); }
90-
91-
void send(Span_ct const& s, SIZE start = 0) { send(s.data(), s.size(), start); }
92-
void send(Span_ct&& s, SIZE start = 0) { send(s, start); }
93-
94-
void send(std::vector<T> const& v, SIZE start = 0) { send(&v[0], v.size(), start); }
95-
void send(std::vector<T>&& v, SIZE start = 0) { send(v, start); }
84+
template <typename C, std::enable_if_t<kul::is_span_like_v<C>, bool> = 0>
85+
void send(C c, SIZE start = 0) {
86+
send(c.data(), c.size(), start);
87+
}
9688

9789
void fill_n(T t, SIZE _size, SIZE start = 0) {
9890
// TODO - improve with memSet style
9991
assert(_size + start <= s);
10092
send(std::vector<T>(_size, t), start);
10193
}
10294

103-
decltype(auto) operator+(size_t size) {
95+
DeviceMem<T> operator+(size_t size) {
10496
DeviceMem<T> view;
10597
view.p = this->p + size;
10698
view.s = this->s - size;
@@ -175,7 +167,7 @@ struct ADeviceClass<false> {
175167
template <bool GPU>
176168
struct DeviceClass : ADeviceClass<GPU> {
177169
template <typename T, typename SIZE = uint32_t>
178-
using container_t = std::conditional_t<GPU, T*, kul::gpu::DeviceMem<T, SIZE>>;
170+
using container_t = std::conditional_t<GPU, T*, DeviceMem<T, SIZE>>;
179171
};
180172

181173
namespace {
@@ -207,26 +199,29 @@ struct Launcher {
207199

208200
template <typename F, typename... Args>
209201
void operator()(F f, Args&&... args) {
210-
kul::gpu::sync();
211-
std::apply([&](auto&&... params) {
212-
hipLaunchKernelGGL(f, g, b, ds, s, params...);
213-
}, devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
202+
sync();
203+
std::apply([&](auto&&... params) { hipLaunchKernelGGL(f, g, b, ds, s, params...); },
204+
devmem_replace(std::forward_as_tuple(args...),
205+
std::make_index_sequence<sizeof...(Args)>()));
214206
}
215207
size_t ds = 0 /*dynamicShared*/;
216208
dim3 g /*gridDim*/, b /*blockDim*/;
217209
hipStream_t s = 0;
218210
};
219211

220212
template <typename T, typename V>
221-
void fill_n(kul::gpu::DeviceMem<T>& p, size_t size, V val) {
213+
void fill_n(DeviceMem<T>& p, size_t size, V val) {
222214
p.fill_n(val, size);
223215
}
224216

225217
template <typename T, typename V>
226-
void fill_n(kul::gpu::DeviceMem<T>&& p, size_t size, V val) {
218+
void fill_n(DeviceMem<T>&& p, size_t size, V val) {
227219
fill_n(p, size, val);
228220
}
229221

222+
#if defined(KUL_GPU_FN_PER_NS) && KUL_GPU_FN_PER_NS
223+
} /* namespace hip */
224+
#endif // KUL_GPU_FN_PER_NS
230225
} /* namespace kul::gpu */
231226

232227
#undef KUL_GPU_ASSERT

inc/kul/gpu/rocm/def.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#ifndef _KUL_GPU_ROCM_DEF_HPP_
44
#define _KUL_GPU_ROCM_DEF_HPP_
55

6+
#include "hip/hip_runtime.h"
7+
68
namespace kul::gpu::hip {
79

810
__device__ uint32_t idx() {

mkn.yaml

+6-6
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,14 @@ profile:
88
inc: inc
99
dep: mkn.kul
1010

11-
- name: test
12-
parent: headers
13-
test: test/(\w).cpp
14-
1511
- name: rocm
16-
parent: test
12+
parent: headers
1713
arg: -DKUL_GPU_ROCM
14+
test: test/any/(\w).cpp
15+
test/hip/(\w).cpp
1816

1917
- name: cuda
20-
parent: test
18+
parent: headers
2119
arg: -DKUL_GPU_CUDA
20+
test: test/any/(\w).cpp
21+
test/cuda/(\w).cpp

test/add.cpp renamed to test/any/add.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,6 @@ size_t test(){
2626
}
2727

2828
int main() {
29-
kul::gpu::prinfo();
29+
KOUT(NON) << __FILE__;
3030
return test<float>() + test<double>();
3131
}

test/class.cpp renamed to test/any/class.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,6 @@ size_t test(){
6464
}
6565

6666
int main() {
67-
kul::gpu::prinfo();
67+
KOUT(NON) << __FILE__;
6868
return test<float>() + test<double>();
6969
}

test/any/info.cpp

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
2+
#include "kul/gpu.hpp"
3+
4+
int main() {
5+
KOUT(NON) << __FILE__;
6+
kul::gpu::prinfo();
7+
return 0;
8+
}

0 commit comments

Comments
 (0)