Skip to content

Commit 950a7b9

Browse files
committed
remove redundant overloads by using is_span_like; switch to exposing each GPU impl explicitly
1 parent 8747d99 commit 950a7b9

File tree

14 files changed

+159
-77
lines changed

14 files changed

+159
-77
lines changed

README.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11

2-
# mkn.gpu
2+
3+
# mkn.gpu

README.noformat

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
mkn.gpu
2+
3+
CUDA/HIP C++17 convenience wrappers
4+
5+
======
6+
7+
Compile argument switches
8+
9+
Key KUL_GPU_CUDA
10+
Type bool
11+
Default 0
12+
Example mkn cuda profile
13+
Description activate CUDA as impl of kul::gpu::*
14+
15+
Key KUL_GPU_ROCM
16+
Type bool
17+
Default 0
18+
Example mkn rocm profile
19+
Description activate ROCM as impl of kul::gpu::*
20+
21+
Key KUL_GPU_FN_PER_NS
22+
Type bool
23+
Default 0
24+
Example test/hip/add.cpp or test/cuda/add.cpp
25+
Description expose functions explicitly via
26+
kul::gpu::hip::*
27+
kul::gpu::cuda::*

inc/kul/gpu.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3535
#include "kul/gpu/rocm.hpp"
3636
#elif defined(KUL_GPU_CUDA)
3737
#include "kul/gpu/cuda.hpp"
38-
#else
38+
#elif !defined(KUL_GPU_FN_PER_NS) || KUL_GPU_FN_PER_NS == 0
3939
#error "UNKNOWN GPU / define KUL_GPU_ROCM or KUL_GPU_CUDA"
4040
#endif
4141

inc/kul/gpu/cuda.hpp

+21-26
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3737
#include "kul/assert.hpp"
3838
#include "kul/tuple.hpp"
3939

40-
#include "kul/gpu/cuda/def.hpp"
40+
#include "kul/gpu/def.hpp"
4141

4242
#define KUL_GPU_ASSERT(x) (KASSERT((x) == cudaSuccess))
4343

4444
namespace kul::gpu {
45+
#if defined(KUL_GPU_FN_PER_NS) && KUL_GPU_FN_PER_NS
46+
namespace cuda {
47+
#endif // KUL_GPU_FN_PER_NS
4548

4649
//
4750
void prinfo(size_t dev = 0) {
@@ -59,8 +62,6 @@ void prinfo(size_t dev = 0) {
5962

6063
template <typename T, typename SIZE = uint32_t>
6164
struct DeviceMem {
62-
using Span = kul::Span<T, SIZE>;
63-
using Span_ct = kul::Span<T const, SIZE>;
6465

6566
DeviceMem() {}
6667
DeviceMem(SIZE _s) : s{_s}, owned{true} {
@@ -70,12 +71,8 @@ struct DeviceMem {
7071
}
7172

7273
DeviceMem(T const* const t, SIZE _s) : DeviceMem{_s} { send(t, _s); }
73-
DeviceMem(Span const& s) : DeviceMem{s.data(), s.size()} {}
74-
DeviceMem(Span&& s) : DeviceMem{s} {}
75-
DeviceMem(Span_ct const& s) : DeviceMem{s.data(), s.size()} {}
76-
DeviceMem(Span_ct&& s) : DeviceMem{s} {}
77-
DeviceMem(std::vector<T> const& v) : DeviceMem{&v[0], static_cast<SIZE>(v.size())} {}
78-
DeviceMem(std::vector<T>&& v) : DeviceMem{v} {}
74+
template <typename C, std::enable_if_t<kul::is_span_like_v<C>, bool> = 0>
75+
DeviceMem(C c) : DeviceMem{c.data(), static_cast<SIZE>(c.size())} {}
7976

8077
~DeviceMem() {
8178
if (p && s && owned) KUL_GPU_ASSERT(cudaFree(p));
@@ -84,23 +81,18 @@ struct DeviceMem {
8481
void send(T const* const t, SIZE _size = 1, SIZE start = 0) {
8582
KUL_GPU_ASSERT(cudaMemcpy(p + start, t, _size * sizeof(T), cudaMemcpyHostToDevice));
8683
}
87-
88-
void send(Span const& s, SIZE start = 0) { send(s.data(), s.size(), start); }
89-
void send(Span&& s, SIZE start = 0) { send(s, start); }
90-
91-
void send(Span_ct const& s, SIZE start = 0) { send(s.data(), s.size(), start); }
92-
void send(Span_ct&& s, SIZE start = 0) { send(s, start); }
93-
94-
void send(std::vector<T> const& v, SIZE start = 0) { send(&v[0], v.size(), start); }
95-
void send(std::vector<T>&& v, SIZE start = 0) { send(v, start); }
84+
template <typename C, std::enable_if_t<kul::is_span_like_v<C>, bool> = 0>
85+
void send(C c, SIZE start = 0) {
86+
send(c.data(), c.size(), start);
87+
}
9688

9789
void fill_n(T t, SIZE _size, SIZE start = 0) {
9890
// TODO - improve with memSet style
9991
assert(_size + start <= s);
10092
send(std::vector<T>(_size, t), start);
10193
}
10294

103-
decltype(auto) operator+(size_t size) {
95+
DeviceMem<T> operator+(size_t size) {
10496
DeviceMem<T> view;
10597
view.p = this->p + size;
10698
view.s = this->s - size;
@@ -175,7 +167,7 @@ struct ADeviceClass<false> {
175167
template <bool GPU>
176168
struct DeviceClass : ADeviceClass<GPU> {
177169
template <typename T, typename SIZE = uint32_t>
178-
using container_t = std::conditional_t<GPU, T*, kul::gpu::DeviceMem<T, SIZE>>;
170+
using container_t = std::conditional_t<GPU, T*, DeviceMem<T, SIZE>>;
179171
};
180172

181173
namespace {
@@ -207,26 +199,29 @@ struct Launcher {
207199

208200
template <typename F, typename... Args>
209201
void operator()(F f, Args&&... args) {
210-
kul::gpu::sync();
211-
std::apply([&](auto&&... params) {
212-
f<<<g, b, ds, s>>>(params...);
213-
}, devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
202+
sync();
203+
std::apply([&](auto&&... params) { f<<<g, b, ds, s>>>(params...); },
204+
devmem_replace(std::forward_as_tuple(args...),
205+
std::make_index_sequence<sizeof...(Args)>()));
214206
}
215207
size_t ds = 0 /*dynamicShared*/;
216208
dim3 g /*gridDim*/, b /*blockDim*/;
217209
cudaStream_t s = 0;
218210
};
219211

220212
template <typename T, typename V>
221-
void fill_n(kul::gpu::DeviceMem<T>& p, size_t size, V val) {
213+
void fill_n(DeviceMem<T>& p, size_t size, V val) {
222214
p.fill_n(val, size);
223215
}
224216

225217
template <typename T, typename V>
226-
void fill_n(kul::gpu::DeviceMem<T>&& p, size_t size, V val) {
218+
void fill_n(DeviceMem<T>&& p, size_t size, V val) {
227219
fill_n(p, size, val);
228220
}
229221

222+
#if defined(KUL_GPU_FN_PER_NS) && KUL_GPU_FN_PER_NS
223+
} /* namespace cuda */
224+
#endif // KUL_GPU_FN_PER_NS
230225
} /* namespace kul::gpu */
231226

232227
#undef KUL_GPU_ASSERT

inc/kul/gpu/cuda/def.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#ifndef _KUL_GPU_CUDA_DEF_HPP_
44
#define _KUL_GPU_CUDA_DEF_HPP_
55

6+
#include <cuda_runtime.h>
7+
68
namespace kul::gpu::cuda {
79

810
__device__ uint32_t idx() {

inc/kul/gpu/def.hpp

+4-14
Original file line numberDiff line numberDiff line change
@@ -3,32 +3,22 @@
33
#ifndef _KUL_GPU_DEF_HPP_
44
#define _KUL_GPU_DEF_HPP_
55

6+
#include <type_traits>
67

78
#if defined(KUL_GPU_ROCM)
8-
#include "kul/gpu/rocm.hpp"
9+
#include "kul/gpu/rocm/def.hpp"
910
#elif defined(KUL_GPU_CUDA)
10-
#include "kul/gpu/cuda.hpp"
11-
#else
11+
#include "kul/gpu/cuda/def.hpp"
12+
#elif !defined(KUL_GPU_FN_PER_NS) || KUL_GPU_FN_PER_NS == 0
1213
#error "UNKNOWN GPU / define KUL_GPU_ROCM or KUL_GPU_CUDA"
1314
#endif
1415

15-
1616
namespace kul::gpu {
1717

1818
template <typename T>
1919
static constexpr bool is_floating_point_v =
2020
std::is_floating_point_v<T> or std::is_same_v<_Float16, T>;
2121

22-
__device__ uint32_t idx() {
23-
#if defined(KUL_GPU_ROCM)
24-
return kul::gpu::hip::idx();
25-
#elif defined(KUL_GPU_CUDA)
26-
return kul::gpu::cuda::idx();
27-
#else
28-
#error "UNKNOWN GPU / define KUL_GPU_ROCM or KUL_GPU_CUDA"
29-
#endif
30-
}
31-
3222
} /* namespace kul::gpu */
3323

3424
#endif /*_KUL_GPU_DEF_HPP_*/

inc/kul/gpu/rocm.hpp

+22-27
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3737
#include "kul/assert.hpp"
3838
#include "kul/tuple.hpp"
3939

40-
#include "kul/gpu/rocm/def.hpp"
40+
#include "kul/gpu/def.hpp"
4141

4242
#define KUL_GPU_ASSERT(x) (KASSERT((x) == hipSuccess))
4343

4444
namespace kul::gpu {
45+
#if defined(KUL_GPU_FN_PER_NS) && KUL_GPU_FN_PER_NS
46+
namespace hip {
47+
#endif // KUL_GPU_FN_PER_NS
4548

4649
// https://rocm-developer-tools.github.io/HIP/group__Device.html
4750
void prinfo(size_t dev = 0) {
@@ -59,23 +62,17 @@ void prinfo(size_t dev = 0) {
5962

6063
template <typename T, typename SIZE = uint32_t>
6164
struct DeviceMem {
62-
using Span = kul::Span<T, SIZE>;
63-
using Span_ct = kul::Span<T const, SIZE>;
6465

6566
DeviceMem() {}
6667
DeviceMem(SIZE _s) : s{_s}, owned{true} {
6768
SIZE alloc_bytes = s * sizeof(T);
6869
KLOG(OTH) << "GPU alloced: " << alloc_bytes;
69-
KUL_GPU_ASSERT(hipMalloc((void**)&p, alloc_bytes));
70+
if (s) KUL_GPU_ASSERT(hipMalloc((void**)&p, alloc_bytes));
7071
}
7172

7273
DeviceMem(T const* const t, SIZE _s) : DeviceMem{_s} { send(t, _s); }
73-
DeviceMem(Span const& s) : DeviceMem{s.data(), s.size()} {}
74-
DeviceMem(Span&& s) : DeviceMem{s} {}
75-
DeviceMem(Span_ct const& s) : DeviceMem{s.data(), s.size()} {}
76-
DeviceMem(Span_ct&& s) : DeviceMem{s} {}
77-
DeviceMem(std::vector<T> const& v) : DeviceMem{&v[0], static_cast<SIZE>(v.size())} {}
78-
DeviceMem(std::vector<T>&& v) : DeviceMem{v} {}
74+
template <typename C, std::enable_if_t<kul::is_span_like_v<C>, bool> = 0>
75+
DeviceMem(C c) : DeviceMem{c.data(), static_cast<SIZE>(c.size())} {}
7976

8077
~DeviceMem() {
8178
if (p && s && owned) KUL_GPU_ASSERT(hipFree(p));
@@ -84,23 +81,18 @@ struct DeviceMem {
8481
void send(T const* const t, SIZE _size = 1, SIZE start = 0) {
8582
KUL_GPU_ASSERT(hipMemcpy(p + start, t, _size * sizeof(T), hipMemcpyHostToDevice));
8683
}
87-
88-
void send(Span const& s, SIZE start = 0) { send(s.data(), s.size(), start); }
89-
void send(Span&& s, SIZE start = 0) { send(s, start); }
90-
91-
void send(Span_ct const& s, SIZE start = 0) { send(s.data(), s.size(), start); }
92-
void send(Span_ct&& s, SIZE start = 0) { send(s, start); }
93-
94-
void send(std::vector<T> const& v, SIZE start = 0) { send(&v[0], v.size(), start); }
95-
void send(std::vector<T>&& v, SIZE start = 0) { send(v, start); }
84+
template <typename C, std::enable_if_t<kul::is_span_like_v<C>, bool> = 0>
85+
void send(C c, SIZE start = 0) {
86+
send(c.data(), c.size(), start);
87+
}
9688

9789
void fill_n(T t, SIZE _size, SIZE start = 0) {
9890
// TODO - improve with memSet style
9991
assert(_size + start <= s);
10092
send(std::vector<T>(_size, t), start);
10193
}
10294

103-
decltype(auto) operator+(size_t size) {
95+
DeviceMem<T> operator+(size_t size) {
10496
DeviceMem<T> view;
10597
view.p = this->p + size;
10698
view.s = this->s - size;
@@ -175,7 +167,7 @@ struct ADeviceClass<false> {
175167
template <bool GPU>
176168
struct DeviceClass : ADeviceClass<GPU> {
177169
template <typename T, typename SIZE = uint32_t>
178-
using container_t = std::conditional_t<GPU, T*, kul::gpu::DeviceMem<T, SIZE>>;
170+
using container_t = std::conditional_t<GPU, T*, DeviceMem<T, SIZE>>;
179171
};
180172

181173
namespace {
@@ -207,26 +199,29 @@ struct Launcher {
207199

208200
template <typename F, typename... Args>
209201
void operator()(F f, Args&&... args) {
210-
kul::gpu::sync();
211-
std::apply([&](auto&&... params) {
212-
hipLaunchKernelGGL(f, g, b, ds, s, params...);
213-
}, devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
202+
sync();
203+
std::apply([&](auto&&... params) { hipLaunchKernelGGL(f, g, b, ds, s, params...); },
204+
devmem_replace(std::forward_as_tuple(args...),
205+
std::make_index_sequence<sizeof...(Args)>()));
214206
}
215207
size_t ds = 0 /*dynamicShared*/;
216208
dim3 g /*gridDim*/, b /*blockDim*/;
217209
hipStream_t s = 0;
218210
};
219211

220212
template <typename T, typename V>
221-
void fill_n(kul::gpu::DeviceMem<T>& p, size_t size, V val) {
213+
void fill_n(DeviceMem<T>& p, size_t size, V val) {
222214
p.fill_n(val, size);
223215
}
224216

225217
template <typename T, typename V>
226-
void fill_n(kul::gpu::DeviceMem<T>&& p, size_t size, V val) {
218+
void fill_n(DeviceMem<T>&& p, size_t size, V val) {
227219
fill_n(p, size, val);
228220
}
229221

222+
#if defined(KUL_GPU_FN_PER_NS) && KUL_GPU_FN_PER_NS
223+
} /* namespace hip */
224+
#endif // KUL_GPU_FN_PER_NS
230225
} /* namespace kul::gpu */
231226

232227
#undef KUL_GPU_ASSERT

inc/kul/gpu/rocm/def.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#ifndef _KUL_GPU_ROCM_DEF_HPP_
44
#define _KUL_GPU_ROCM_DEF_HPP_
55

6+
#include "hip/hip_runtime.h"
7+
68
namespace kul::gpu::hip {
79

810
__device__ uint32_t idx() {

mkn.yaml

+6-6
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,14 @@ profile:
88
inc: inc
99
dep: mkn.kul
1010

11-
- name: test
12-
parent: headers
13-
test: test/(\w).cpp
14-
1511
- name: rocm
16-
parent: test
12+
parent: headers
1713
arg: -DKUL_GPU_ROCM
14+
test: test/any/(\w).cpp
15+
test/hip/(\w).cpp
1816

1917
- name: cuda
20-
parent: test
18+
parent: headers
2119
arg: -DKUL_GPU_CUDA
20+
test: test/any/(\w).cpp
21+
test/cuda/(\w).cpp

test/add.cpp renamed to test/any/add.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,6 @@ size_t test(){
2626
}
2727

2828
int main() {
29-
kul::gpu::prinfo();
29+
KOUT(NON) << __FILE__;
3030
return test<float>() + test<double>();
3131
}

test/class.cpp renamed to test/any/class.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,6 @@ size_t test(){
6464
}
6565

6666
int main() {
67-
kul::gpu::prinfo();
67+
KOUT(NON) << __FILE__;
6868
return test<float>() + test<double>();
6969
}

test/any/info.cpp

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
2+
#include "kul/gpu.hpp"
3+
4+
int main() {
5+
KOUT(NON) << __FILE__;
6+
kul::gpu::prinfo();
7+
return 0;
8+
}

0 commit comments

Comments
 (0)