@@ -37,11 +37,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "kul/assert.hpp"
 #include "kul/tuple.hpp"
 
-#include "kul/gpu/rocm/def.hpp"
+#include "kul/gpu/def.hpp"
 
 #define KUL_GPU_ASSERT(x) (KASSERT((x) == hipSuccess))
 
 namespace kul::gpu {
+#if defined(KUL_GPU_FN_PER_NS) && KUL_GPU_FN_PER_NS
+namespace hip {
+#endif // KUL_GPU_FN_PER_NS
 
 // https://rocm-developer-tools.github.io/HIP/group__Device.html
 void prinfo(size_t dev = 0) {
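Note on the new `KUL_GPU_FN_PER_NS` toggle introduced by this hunk: when defined non-zero, everything in this header moves from `kul::gpu` into the nested `kul::gpu::hip` namespace (closed again at the bottom of the file). A minimal sketch of what that means for callers; the header path below is an assumption about where this file lives, only the macro name comes from the diff:

```cpp
// Sketch only: the include path "kul/gpu/rocm.hpp" is assumed, not shown here.
#define KUL_GPU_FN_PER_NS 1
#include "kul/gpu/rocm.hpp"

int main() {
  kul::gpu::hip::prinfo();  // toggle on: symbols live in kul::gpu::hip
  // kul::gpu::prinfo();    // toggle off/undefined: symbols stay in kul::gpu
}
```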
@@ -59,23 +62,17 @@ void prinfo(size_t dev = 0) {
 
 template <typename T, typename SIZE = uint32_t>
 struct DeviceMem {
-  using Span = kul::Span<T, SIZE>;
-  using Span_ct = kul::Span<T const, SIZE>;
 
   DeviceMem() {}
   DeviceMem(SIZE _s) : s{_s}, owned{true} {
     SIZE alloc_bytes = s * sizeof(T);
     KLOG(OTH) << "GPU alloced: " << alloc_bytes;
-    KUL_GPU_ASSERT(hipMalloc((void**)&p, alloc_bytes));
+    if (s) KUL_GPU_ASSERT(hipMalloc((void**)&p, alloc_bytes));
   }
 
   DeviceMem(T const* const t, SIZE _s) : DeviceMem{_s} { send(t, _s); }
-  DeviceMem(Span const& s) : DeviceMem{s.data(), s.size()} {}
-  DeviceMem(Span&& s) : DeviceMem{s} {}
-  DeviceMem(Span_ct const& s) : DeviceMem{s.data(), s.size()} {}
-  DeviceMem(Span_ct&& s) : DeviceMem{s} {}
-  DeviceMem(std::vector<T> const& v) : DeviceMem{&v[0], static_cast<SIZE>(v.size())} {}
-  DeviceMem(std::vector<T>&& v) : DeviceMem{v} {}
+  template <typename C, std::enable_if_t<kul::is_span_like_v<C>, bool> = 0>
+  DeviceMem(C c) : DeviceMem{c.data(), static_cast<SIZE>(c.size())} {}
 
   ~DeviceMem() {
     if (p && s && owned) KUL_GPU_ASSERT(hipFree(p));
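This hunk collapses six copy-in constructors into one SFINAE-constrained template, and guards `hipMalloc` against zero-size allocations. A sketch of the new constructor in use, assuming `kul::is_span_like_v<C>` is satisfied by anything exposing `data()`/`size()` (e.g. `std::vector` and `kul::Span`) and that `KUL_GPU_FN_PER_NS` is off so `DeviceMem` sits directly in `kul::gpu`:

```cpp
#include <vector>

void upload_example() {
  std::vector<float> host(1024, 1.0f);
  kul::gpu::DeviceMem<float> dev{host};  // one templated ctor replaces six overloads
  kul::gpu::DeviceMem<float> empty{0};   // size 0: the new guard skips hipMalloc
}
```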
@@ -84,23 +81,18 @@ struct DeviceMem {
   void send(T const* const t, SIZE _size = 1, SIZE start = 0) {
     KUL_GPU_ASSERT(hipMemcpy(p + start, t, _size * sizeof(T), hipMemcpyHostToDevice));
   }
-
-  void send(Span const& s, SIZE start = 0) { send(s.data(), s.size(), start); }
-  void send(Span&& s, SIZE start = 0) { send(s, start); }
-
-  void send(Span_ct const& s, SIZE start = 0) { send(s.data(), s.size(), start); }
-  void send(Span_ct&& s, SIZE start = 0) { send(s, start); }
-
-  void send(std::vector<T> const& v, SIZE start = 0) { send(&v[0], v.size(), start); }
-  void send(std::vector<T>&& v, SIZE start = 0) { send(v, start); }
+  template <typename C, std::enable_if_t<kul::is_span_like_v<C>, bool> = 0>
+  void send(C c, SIZE start = 0) {
+    send(c.data(), c.size(), start);
+  }
 
   void fill_n(T t, SIZE _size, SIZE start = 0) {
     // TODO - improve with memSet style
     assert(_size + start <= s);
     send(std::vector<T>(_size, t), start);
   }
 
-  decltype(auto) operator+(size_t size) {
+  DeviceMem<T> operator+(size_t size) {
     DeviceMem<T> view;
     view.p = this->p + size;
     view.s = this->s - size;
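Besides the same six-into-one `send` collapse, this hunk changes `operator+` to return a concrete `DeviceMem<T>` rather than `decltype(auto)`. The returned object is a default-constructed view aliasing the parent allocation, so `owned` stays false and its destructor never calls `hipFree` (the destructor above frees only when `p && s && owned`). A sketch of those view semantics; the names are illustrative:

```cpp
#include <vector>

void view_example() {
  kul::gpu::DeviceMem<int> buf{100};  // owning allocation of 100 ints
  auto tail = buf + 50;               // non-owning view over elements 50..99
  std::vector<int> half(50, 7);
  tail.send(half);                    // goes through the span-like send()
}  // only buf's destructor frees the device memory
```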
@@ -175,7 +167,7 @@ struct ADeviceClass<false> {
 template <bool GPU>
 struct DeviceClass : ADeviceClass<GPU> {
   template <typename T, typename SIZE = uint32_t>
-  using container_t = std::conditional_t<GPU, T*, kul::gpu::DeviceMem<T, SIZE>>;
+  using container_t = std::conditional_t<GPU, T*, DeviceMem<T, SIZE>>;
 };
 
 namespace {
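Dropping the `kul::gpu::` qualifier keeps `container_t` resolving to the local `DeviceMem` even when this code is wrapped in the optional `hip` namespace. A hypothetical sketch of the mirrored-struct pattern the alias enables: the same struct holds `DeviceMem` handles when compiled for the host (`GPU=false`) and raw pointers when instantiated for the device (`GPU=true`). `Particles` and its member are illustrative, not part of the library:

```cpp
template <bool GPU>
struct Particles : kul::gpu::DeviceClass<GPU> {
  template <typename T>
  using container_t = typename kul::gpu::DeviceClass<GPU>::template container_t<T>;

  container_t<float> weights;  // DeviceMem<float> on host, float* on device
};
```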
@@ -207,26 +199,29 @@ struct Launcher {
 
   template <typename F, typename... Args>
   void operator()(F f, Args&&... args) {
-    kul::gpu::sync();
-    std::apply([&](auto&&... params) {
-      hipLaunchKernelGGL(f, g, b, ds, s, params...);
-    }, devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
+    sync();
+    std::apply([&](auto&&... params) { hipLaunchKernelGGL(f, g, b, ds, s, params...); },
+               devmem_replace(std::forward_as_tuple(args...),
+                              std::make_index_sequence<sizeof...(Args)>()));
   }
   size_t ds = 0 /*dynamicShared*/;
   dim3 g /*gridDim*/, b /*blockDim*/;
   hipStream_t s = 0;
 };
 
 template <typename T, typename V>
-void fill_n(kul::gpu::DeviceMem<T>& p, size_t size, V val) {
+void fill_n(DeviceMem<T>& p, size_t size, V val) {
   p.fill_n(val, size);
 }
 
 template <typename T, typename V>
-void fill_n(kul::gpu::DeviceMem<T>&& p, size_t size, V val) {
+void fill_n(DeviceMem<T>&& p, size_t size, V val) {
   fill_n(p, size, val);
 }
 
+#if defined(KUL_GPU_FN_PER_NS) && KUL_GPU_FN_PER_NS
+} /* namespace hip */
+#endif // KUL_GPU_FN_PER_NS
 } /* namespace kul::gpu */
 
 #undef KUL_GPU_ASSERT
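For context, a sketch of driving `Launcher::operator()` end to end: it syncs, swaps any `DeviceMem` arguments for raw device pointers via `devmem_replace`, and forwards to `hipLaunchKernelGGL`. The kernel, the grid/block setup, and the assumption that `Launcher` is default-constructible with its public members settable are all illustrative; the diff shows only `operator()` and the members:

```cpp
#include <hip/hip_runtime.h>

__global__ void add_one(float* x) { x[threadIdx.x] += 1.0f; }

void launch_example(kul::gpu::DeviceMem<float>& dev, uint32_t n) {
  kul::gpu::Launcher l;  // if Launcher takes (grid, block) in its ctor, adapt accordingly
  l.g = dim3{1};         // one block
  l.b = dim3{n};         // one thread per element
  l(add_one, dev);       // syncs, replaces DeviceMem with its raw pointer, launches
}
```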