Skip to content

Commit 8747d99

Browse files
authored
Merge pull request #1 from mkn/pr
GPU function-object test/example
2 parents b64b902 + 6472485 commit 8747d99

File tree

7 files changed

+128
-30
lines changed

7 files changed

+128
-30
lines changed

inc/kul/gpu/cuda.hpp

+7-11
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4343

4444
namespace kul::gpu {
4545

46-
template <typename T>
47-
static constexpr bool is_floating_point_v =
48-
std::is_floating_point_v<T> or std::is_same_v<_Float16, T>;
49-
5046
//
5147
void prinfo(size_t dev = 0) {
5248
cudaDeviceProp devProp;
@@ -185,16 +181,16 @@ struct DeviceClass : ADeviceClass<GPU> {
185181
namespace {
186182

187183
template <typename T>
188-
decltype(auto) get(T const& t) {
184+
decltype(auto) replace(T& t) {
189185
if constexpr (is_device_mem_v<T>)
190186
return t.p;
191187
else
192188
return t;
193189
}
194190

195191
template <std::size_t... I, typename... Args>
196-
decltype(auto) devmem_replace(std::tuple<Args const&...>&& tup, std::index_sequence<I...>) {
197-
return std::make_tuple(get(std::get<I>(tup))...);
192+
decltype(auto) devmem_replace(std::tuple<Args&...>&& tup, std::index_sequence<I...>) {
193+
return std::make_tuple(replace(std::get<I>(tup))...);
198194
}
199195

200196
} /* namespace */
@@ -210,11 +206,11 @@ struct Launcher {
210206
: Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {}
211207

212208
template <typename F, typename... Args>
213-
void operator()(F f, Args const&... args) {
209+
void operator()(F f, Args&&... args) {
214210
kul::gpu::sync();
215-
auto tup =
216-
devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>());
217-
std::apply([&](auto&... params) { f<<<g, b, ds, s>>>(params...); }, tup);
211+
std::apply([&](auto&&... params) {
212+
f<<<g, b, ds, s>>>(params...);
213+
}, devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
218214
}
219215
size_t ds = 0 /*dynamicShared*/;
220216
dim3 g /*gridDim*/, b /*blockDim*/;

inc/kul/gpu/def.hpp

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
2+
3+
#ifndef _KUL_GPU_DEF_HPP_
4+
#define _KUL_GPU_DEF_HPP_
5+
6+
7+
#if defined(KUL_GPU_ROCM)
8+
#include "kul/gpu/rocm.hpp"
9+
#elif defined(KUL_GPU_CUDA)
10+
#include "kul/gpu/cuda.hpp"
11+
#else
12+
#error "UNKNOWN GPU / define KUL_GPU_ROCM or KUL_GPU_CUDA"
13+
#endif
14+
15+
16+
namespace kul::gpu {
17+
18+
template <typename T>
19+
static constexpr bool is_floating_point_v =
20+
std::is_floating_point_v<T> or std::is_same_v<_Float16, T>;
21+
22+
__device__ uint32_t idx() {
23+
#if defined(KUL_GPU_ROCM)
24+
return kul::gpu::hip::idx();
25+
#elif defined(KUL_GPU_CUDA)
26+
return kul::gpu::cuda::idx();
27+
#else
28+
#error "UNKNOWN GPU / define KUL_GPU_ROCM or KUL_GPU_CUDA"
29+
#endif
30+
}
31+
32+
} /* namespace kul::gpu */
33+
34+
#endif /*_KUL_GPU_DEF_HPP_*/

inc/kul/gpu/rocm.hpp

+6-10
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4343

4444
namespace kul::gpu {
4545

46-
template <typename T>
47-
static constexpr bool is_floating_point_v =
48-
std::is_floating_point_v<T> or std::is_same_v<_Float16, T>;
49-
5046
// https://rocm-developer-tools.github.io/HIP/group__Device.html
5147
void prinfo(size_t dev = 0) {
5248
hipDeviceProp_t devProp;
@@ -185,15 +181,15 @@ struct DeviceClass : ADeviceClass<GPU> {
185181
namespace {
186182

187183
template <typename T>
188-
decltype(auto) replace(T const& t) {
184+
decltype(auto) replace(T& t) {
189185
if constexpr (is_device_mem_v<T>)
190186
return t.p;
191187
else
192188
return t;
193189
}
194190

195191
template <std::size_t... IS, typename... Args>
196-
decltype(auto) devmem_replace(std::tuple<Args const&...>&& tup, std::index_sequence<IS...>) {
192+
decltype(auto) devmem_replace(std::tuple<Args&...>&& tup, std::index_sequence<IS...>) {
197193
return std::make_tuple(replace(std::get<IS>(tup))...);
198194
}
199195

@@ -210,11 +206,11 @@ struct Launcher {
210206
: Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {}
211207

212208
template <typename F, typename... Args>
213-
void operator()(F f, Args const&... args) {
209+
void operator()(F f, Args&&... args) {
214210
kul::gpu::sync();
215-
auto tup =
216-
devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>());
217-
std::apply([&](auto&... params) { hipLaunchKernelGGL(f, g, b, ds, s, params...); }, tup);
211+
std::apply([&](auto&&... params) {
212+
hipLaunchKernelGGL(f, g, b, ds, s, params...);
213+
}, devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
218214
}
219215
size_t ds = 0 /*dynamicShared*/;
220216
dim3 g /*gridDim*/, b /*blockDim*/;

inc/kul/gpu/tuple.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ struct SpanSet : ASpanSet<T, SIZE, GPU> {
9797
}
9898

9999
template <bool gpu = GPU, std::enable_if_t<!gpu, bool> = 0>
100-
decltype(auto) take() {
100+
kul::SpanSet<T, SIZE>& take() {
101101
Super::base.vec = std::move(vec.take());
102102
return Super::base;
103103
}

mkn.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ profile:
1010

1111
- name: test
1212
parent: headers
13-
test: test/gpu.cpp
13+
test: test/(\w).cpp
1414

1515
- name: rocm
1616
parent: test
+10-7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11

2-
32
#include "kul/gpu.hpp"
43

54
static constexpr size_t WIDTH = 1024, HEIGHT = 1024;
@@ -12,17 +11,21 @@ __global__ void vectoradd(T* a, const T* b, const T* c) {
1211
a[i] = b[i] + c[i];
1312
}
1413

15-
int main() {
16-
kul::gpu::prinfo();
17-
std::vector<float> hostB(NUM), hostC(NUM);
14+
template<typename Float>
15+
size_t test(){
16+
std::vector<Float> hostB(NUM), hostC(NUM);
1817
for (size_t i = 0; i < NUM; i++) hostB[i] = i;
1918
for (size_t i = 0; i < NUM; i++) hostC[i] = i * 100.0f;
20-
kul::gpu::DeviceMem<float> devA(NUM), devB(hostB), devC(hostC);
19+
kul::gpu::DeviceMem<Float> devA(NUM), devB(hostB), devC(hostC);
2120
kul::gpu::Launcher{WIDTH, HEIGHT, THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y}(
22-
vectoradd<float>, devA, devB, devC);
21+
vectoradd<Float>, devA, devB, devC);
2322
auto hostA = devA();
2423
for (size_t i = 0; i < NUM; i++)
2524
if (hostA[i] != (hostB[i] + hostC[i])) return 1;
26-
printf("PASSED!\n");
2725
return 0;
2826
}
27+
28+
int main() {
29+
kul::gpu::prinfo();
30+
return test<float>() + test<double>();
31+
}

test/class.cpp

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
2+
#include "kul/gpu.hpp"
3+
4+
static constexpr size_t WIDTH = 1024, HEIGHT = 1024;
5+
static constexpr size_t NUM = WIDTH * HEIGHT;
6+
static constexpr size_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16;
7+
8+
// Host/device mirror of a simple data-owning class.
// Instantiated with GPU=false on the host (owning constructors and the
// device-allocation call operator are enabled) and with GPU=true on the
// device (only the element accessors are enabled). The host/device split
// is done with enable_if on a defaulted `gpu` template parameter so that
// only the members valid for the current side participate in overload
// resolution.
template<typename Float, bool GPU = false>
struct DevClass : kul::gpu::DeviceClass<GPU>
{
  using Super = kul::gpu::DeviceClass<GPU>;
  // The device-side counterpart of this type (same Float, GPU=true).
  using gpu_t = DevClass<Float, true>;

  // Storage type selected by the base class for the current side
  // (host container vs device memory) — see kul::gpu::DeviceClass.
  template<typename T>
  using container_t = typename Super::template container_t<T>;

  // Host-only: allocate `nbr` elements (uninitialized on device).
  template<bool gpu = GPU, std::enable_if_t<!gpu, bool> = 0>
  DevClass(std::uint32_t nbr)
    : data{nbr}
  {
  }

  // Host-only: allocate and copy from an existing host vector.
  template<bool gpu = GPU, std::enable_if_t<!gpu, bool> = 0>
  DevClass(std::vector<Float> const& in)
    : data{in}
  {
  }

  // Host-only: materialize the device-side representation (gpu_t)
  // from this object's data, suitable for passing to a kernel.
  template<bool gpu = GPU, std::enable_if_t<!gpu, bool> = 0>
  auto operator()()
  {
    return Super::template alloc<gpu_t>(data);
  }

  // Device-only element access. NOTE(review): the trailing `__device__`
  // qualifier placement appears to rely on compiler/macro tolerance —
  // confirm it expands correctly under both ROCm and CUDA toolchains.
  template<bool gpu = GPU, std::enable_if_t<gpu, bool> = 0>
  auto& operator[](int i) __device__ { return data[i]; }
  template<bool gpu = GPU, std::enable_if_t<gpu, bool> = 0>
  auto const& operator[](int i) const __device__ { return data[i]; }

  // Element storage; concrete type depends on which side (GPU) this is.
  container_t<Float> data;
};
42+
43+
// Convenience alias for the device-side counterpart of DevClass<T>.
template <typename T>
using GPUClass = typename ::DevClass<T>::gpu_t;

// Element-wise vector addition through the device-class accessors:
// a[i] = b[i] + c[i], one element per thread.
// NOTE(review): no bounds check on `i` — this assumes the launch grid
// covers exactly the allocated element count (as the tests below do);
// confirm before reusing with other launch shapes.
template <typename T>
__global__ void vectoradd(GPUClass<T>* a, GPUClass<T> const* b, GPUClass<T> const* c) {
  int i = kul::gpu::idx();
  (*a)[i] = (*b)[i] + (*c)[i];
}
51+
52+
template<typename Float>
53+
size_t test(){
54+
std::vector<Float> hostB(NUM), hostC(NUM);
55+
for (size_t i = 0; i < NUM; i++) hostB[i] = i;
56+
for (size_t i = 0; i < NUM; i++) hostC[i] = i * 100.0f;
57+
DevClass<Float> devA(NUM), devB(hostB), devC(hostC);
58+
kul::gpu::Launcher{WIDTH, HEIGHT, THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y}(
59+
vectoradd<Float>, devA(), devB(), devC());
60+
auto hostA = devA.data();
61+
for (size_t i = 0; i < NUM; i++)
62+
if (hostA[i] != (hostB[i] + hostC[i])) return 1;
63+
return 0;
64+
}
65+
66+
int main() {
67+
kul::gpu::prinfo();
68+
return test<float>() + test<double>();
69+
}

0 commit comments

Comments
 (0)