Skip to content

Commit e13cf8b

Browse files
authored
Merge pull request #2 from NetroScript/development
Merge the current development branch into the master to have a more up to date master for the UI branch also.
2 parents 2e3fa15 + 8273eb1 commit e13cf8b

File tree

11 files changed

+1268
-303
lines changed

11 files changed

+1268
-303
lines changed

examples/basic/CMakeLists.txt

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,24 @@ endif()
1818
# add include path to the cuda_mav.cuh header
1919
target_link_libraries(${_TARGET_NORMAL} PRIVATE CUDA_MAV_HEADER)
2020

21-
set(_TARGET_ANNOTATED annotated)
21+
set(_TARGET_RANDOM_2D_MATRIX random2dMatrix)
2222

23-
add_executable(${_TARGET_ANNOTATED})
24-
target_sources(${_TARGET_ANNOTATED}
23+
add_executable(${_TARGET_RANDOM_2D_MATRIX})
24+
target_sources(${_TARGET_RANDOM_2D_MATRIX}
2525
PRIVATE
26-
annotated.cu)
27-
set_target_properties(${_TARGET_ANNOTATED} PROPERTIES
26+
random2dMatrix.cu)
27+
set_target_properties(${_TARGET_RANDOM_2D_MATRIX} PROPERTIES
28+
CUDA_CXX_STANDARD 17
29+
)
30+
31+
set(_TARGET_REDUCE reduce)
32+
33+
add_executable(${_TARGET_REDUCE})
34+
target_sources(${_TARGET_REDUCE}
35+
PRIVATE
36+
reduce.cu)
37+
set_target_properties(${_TARGET_REDUCE} PROPERTIES
38+
CXX_STANDARD 17
2839
CUDA_CXX_STANDARD 17
2940
)
3041
if(ENABLE_CUDA_DEVICE_DEBUG)

examples/basic/normal.cu

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ inline void checkCudaError(cudaError_t err) {
1212
}
1313

1414

15-
__global__ void kernel(int prob_size, CudaMav<int> * input, CudaMav<int> * output){
15+
__global__ void kernel(int prob_size, CudaMemAccessLogger<int> * input, CudaMemAccessLogger<int> * output){
1616
int id = threadIdx.x + blockIdx.x * blockDim.x;
1717
if (id < prob_size) {
1818
(*output)[id] = (*input)[id];
@@ -21,7 +21,7 @@ __global__ void kernel(int prob_size, CudaMav<int> * input, CudaMav<int> * outpu
2121
}
2222

2323
int main(){
24-
constexpr int prob_size = 100;
24+
constexpr int prob_size = 1000;
2525

2626
std::vector<int> h_input(prob_size);
2727
std::iota(h_input.begin(), h_input.end(), 0);
@@ -34,20 +34,23 @@ int main(){
3434

3535
checkCudaError(cudaMemcpy(d_input, h_input.data(), sizeof(int)* prob_size, cudaMemcpyHostToDevice));
3636

37-
CudaMav<int> input(d_input);
38-
CudaMav<int> output(d_output);
37+
// The overloaded new operator generates a managed memory object
38+
CudaMemAccessStorage<int>* memAccessStorage = new CudaMemAccessStorage<int>(10000);
39+
40+
// The overloaded new operator generates a managed memory object
41+
CudaMemAccessLogger<int>* input = new CudaMemAccessLogger<int>(d_input, prob_size, "Input Datastructure", memAccessStorage);
42+
CudaMemAccessLogger<int>* output = new CudaMemAccessLogger<int>(d_output, prob_size, "Output Datastructure", memAccessStorage);
3943

4044
constexpr int threads = 32;
4145
constexpr int blocks = (prob_size/threads)+1;
4246

43-
kernel<<<blocks, threads>>>(prob_size, input.getDevicePointer(), output.getDevicePointer());
47+
//kernel<<<blocks, threads>>>(prob_size, input.getDevicePointer(), output.getDevicePointer());
48+
kernel << <blocks, threads >> > (prob_size, input, output);
4449
checkCudaError(cudaGetLastError());
4550
cudaDeviceSynchronize();
4651

47-
auto data = input.getGlobalSettings();
48-
49-
input.analyze("../../../html/basic_template.html", "../../../out/basic_input.html");
50-
output.analyze("../../../html/basic_template.html", "../../../out/basic_output.html");
52+
memAccessStorage->generateTemplatedOutput("../../../templates/basic_template.html", "../../../out/basic_html.html",
53+
CudaMemAccessStorage<int>::parseDataForStaticHTML);
5154

5255
checkCudaError(cudaMemcpy(h_output.data(), d_output, sizeof(int)*prob_size, cudaMemcpyDeviceToHost));
5356

@@ -61,6 +64,11 @@ int main(){
6164
checkCudaError(cudaFree(d_input));
6265
checkCudaError(cudaFree(d_output));
6366

67+
// Free up the managed memory objects
68+
delete memAccessStorage;
69+
delete input;
70+
delete output;
71+
6472
std::cout << "kernel finished successful" << std::endl;
6573
return 0;
6674
}

examples/basic/normal_original.cu

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#include <vector>
2+
#include <numeric>
3+
#include <iostream>
4+
5+
inline void checkCudaError(cudaError_t err) {
6+
if (err != cudaSuccess) {
7+
std::cerr << "\rCuda Error " << err << ": " << cudaGetErrorString(err) << std::endl;
8+
std::cerr << "Aborting..." << std::endl;
9+
exit(1);
10+
}
11+
}
12+
13+
14+
__global__ void kernel(int prob_size, int * const input, int * output){
15+
int id = threadIdx.x + blockIdx.x * blockDim.x;
16+
if(id < prob_size){
17+
output[id] = input[id];
18+
}
19+
}
20+
21+
int main(){
22+
constexpr int prob_size = 100;
23+
24+
std::vector<int> h_input(prob_size);
25+
std::iota(h_input.begin(), h_input.end(), 0);
26+
int * d_input = nullptr;
27+
checkCudaError(cudaMalloc((void**) &d_input, sizeof(int)*prob_size));
28+
29+
std::vector<int> h_output(prob_size, 0);
30+
int * d_output = nullptr;
31+
checkCudaError(cudaMalloc((void**) &d_output, sizeof(int)*prob_size));
32+
33+
checkCudaError(cudaMemcpy(d_input, h_input.data(), sizeof(int)* prob_size, cudaMemcpyHostToDevice));
34+
35+
constexpr int threads = 32;
36+
constexpr int blocks = (prob_size/threads)+1;
37+
38+
kernel<<<blocks, threads>>>(prob_size, d_input, d_output);
39+
checkCudaError(cudaGetLastError());
40+
41+
checkCudaError(cudaMemcpy(h_output.data(), d_output, sizeof(int)*prob_size, cudaMemcpyDeviceToHost));
42+
43+
for(auto i = 0; i < h_input.size(); ++i){
44+
if(h_input[i] != h_output[i]){
45+
std::cerr << "Element at position " << i << "is not equal (input - output): " << h_input[i] << " != " << h_output[i] << std::endl;
46+
std::exit(1);
47+
}
48+
}
49+
50+
checkCudaError(cudaFree(d_input));
51+
checkCudaError(cudaFree(d_output));
52+
53+
std::cout << "kernel finished successful" << std::endl;
54+
return 0;
55+
}

examples/basic/random2dMatrix.cu

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
// The application creates a 2D matrix and initializes each element randomly with a value between 0 and 10.
2+
// The kernel simply decrements each element to 0 in a very inefficient way.
3+
4+
#include <iostream>
5+
#include <vector>
6+
#include <random>
7+
#include <algorithm>
8+
#include "../../include/cuda_mav.cuh"
9+
10+
// The wrapper macro is required so that __LINE__ correctly points to the line where the check fails
11+
#define checkCudaError(ans) \
12+
{ \
13+
checkCudaErrorFunc((ans), __FILE__, __LINE__); \
14+
}
15+
16+
inline void checkCudaErrorFunc(cudaError_t err, const char *file, int line)
17+
{
18+
if (err != cudaSuccess)
19+
{
20+
std::cout << "\r" << file << ":" << line << " -> Cuda Error " << err << ": " << cudaGetErrorString(err) << std::endl;
21+
std::cout << "Aborting..." << std::endl;
22+
exit(0);
23+
}
24+
}
25+
26+
__global__ void decrement(unsigned int const size, CudaMemAccessLogger<unsigned int> *data, CudaMemAccessLogger<unsigned int> *control)
27+
{
28+
int index = threadIdx.x + blockIdx.x * blockDim.x;
29+
int stride = blockDim.x * gridDim.x;
30+
31+
for (int i = index; i < size; i += stride)
32+
{
33+
while ((*data)[i] > 0)
34+
{
35+
(*data)[i] = (*data)[i] - 1;
36+
(*control)[i] = (*control)[i] + 1;
37+
}
38+
}
39+
}
40+
41+
/// @brief Increment all values in a specific area by the value of increment. The maximum value of an entry is clamped to 10.
42+
/// @param data Data to increment.
43+
/// @param dim Dimensions of the 2D matrix.
44+
/// @param y_start Y start coordinate of the area to increment.
45+
/// @param x_start X start coordinate of the area to increment.
46+
/// @param size Size of the Y and X direction of the area to increment.
47+
/// @param increment Value to increment.
48+
void hot_spot(std::vector<unsigned int> &data, unsigned int const dim, unsigned int const y_start, unsigned int const x_start, unsigned int const size, unsigned int const increment)
49+
{
50+
for (auto y = y_start; y < y_start + size; ++y)
51+
{
52+
for (auto x = x_start; x < x_start + size; ++x)
53+
{
54+
if (data[y * dim + x] + increment > 10)
55+
{
56+
data[y * dim + x] = 10;
57+
}
58+
else
59+
{
60+
data[y * dim + x] += increment;
61+
}
62+
}
63+
}
64+
}
65+
66+
int main(int argc, char **argv)
67+
{
68+
unsigned int dim = 100;
69+
70+
std::vector<unsigned int> h_data(dim * dim);
71+
// create a 2D matrix where all elements are 0
72+
std::vector<unsigned int> h_control(dim * dim, 0);
73+
74+
// initialize data matrix with random numbers between 0 and 10
75+
std::uniform_int_distribution<unsigned int> distribution(
76+
0,
77+
10);
78+
std::default_random_engine generator;
79+
std::generate(
80+
h_data.begin(),
81+
h_data.end(),
82+
[&distribution, &generator]()
83+
{ return distribution(generator); });
84+
85+
// enable me, to create a hot spot area
86+
// the hot spot area should look interesting in the memory access visualization
87+
if (true)
88+
{
89+
hot_spot(h_data, dim, 8, 10, 10, 20);
90+
}
91+
92+
// enable me, to print the matrix
93+
if (false)
94+
{
95+
for (auto y = 0; y < dim; ++y)
96+
{
97+
for (auto x = 0; x < dim; ++x)
98+
{
99+
if (h_data[y * dim + x] < 10)
100+
{
101+
std::cout << " " << h_data[y * dim + x] << " ";
102+
}
103+
else
104+
{
105+
std::cout << h_data[y * dim + x] << " ";
106+
}
107+
}
108+
std::cout << std::endl;
109+
}
110+
}
111+
112+
unsigned int *d_data = nullptr;
113+
unsigned int *d_control = nullptr;
114+
115+
size_t const buffer_size_byte = sizeof(unsigned int) * dim * dim;
116+
117+
checkCudaError(cudaMalloc((void **)&d_data, buffer_size_byte));
118+
checkCudaError(cudaMalloc((void **)&d_control, buffer_size_byte));
119+
120+
checkCudaError(cudaMemcpy(d_data, h_data.data(), buffer_size_byte, cudaMemcpyHostToDevice));
121+
// copy h_control to initialize all values to 0 on the GPU
122+
checkCudaError(cudaMemcpy(d_control, h_control.data(), buffer_size_byte, cudaMemcpyHostToDevice));
123+
124+
// Define amount of accesses you want to log and create a memory object which stores them
125+
auto* memAccessStorage = new CudaMemAccessStorage<unsigned int>(dim * dim * 50);
126+
127+
// Wrap the data classes with the custom logging class
128+
auto* data = new CudaMemAccessLogger<unsigned int>(d_data, dim*dim, "Decremented Data", memAccessStorage);
129+
auto* control = new CudaMemAccessLogger<unsigned int>(d_control, dim*dim, "Control Data", memAccessStorage);
130+
131+
// change me and see how the visualization changes
132+
int const blockSize = 32;
133+
int const numBlocks = ((dim * dim) + blockSize - 1) / blockSize;
134+
135+
decrement<<<numBlocks, blockSize>>>(dim * dim, data, control);
136+
137+
checkCudaError(cudaGetLastError());
138+
139+
checkCudaError(cudaMemcpy(h_control.data(), d_control, buffer_size_byte, cudaMemcpyDeviceToHost));
140+
141+
// Generate the output json file for visualization
142+
// It needs to be below the memory copy or alternatively after a synchronize to make sure the kernel is finished
143+
memAccessStorage->generateJSONOutput("../../../out/random2DMatrix.json");
144+
145+
bool success = true;
146+
147+
for (auto y = 0; y < dim; ++y)
148+
{
149+
for (auto x = 0; x < dim; ++x)
150+
{
151+
if (h_control[y * dim + x] != h_data[y * dim + x])
152+
{
153+
std::cout << "h_control[" << y << ", " << x << "] != h_data[" << y << ", " << x << "]" << std::endl;
154+
std::cout << h_control[y * dim + x] << " != " << h_data[y * dim + x] << std::endl;
155+
success = false;
156+
}
157+
}
158+
}
159+
160+
checkCudaError(cudaMemcpy(h_data.data(), d_data, buffer_size_byte, cudaMemcpyDeviceToHost));
161+
162+
for (auto y = 0; y < dim; ++y)
163+
{
164+
for (auto x = 0; x < dim; ++x)
165+
{
166+
if (h_data[y * dim + x] != 0)
167+
{
168+
std::cout << "h_data[" << y << ", " << x << "] != 0" << std::endl;
169+
std::cout << "value is: " << h_data[y * dim + x] << std::endl;
170+
success = false;
171+
}
172+
}
173+
}
174+
175+
if (success)
176+
{
177+
std::cout << "The kernel worked correctly" << std::endl;
178+
}
179+
180+
checkCudaError(cudaFree(d_data));
181+
checkCudaError(cudaFree(d_control));
182+
183+
// Free the memory of the custom logging classes
184+
delete memAccessStorage;
185+
delete data;
186+
delete control;
187+
188+
return 0;
189+
}

0 commit comments

Comments
 (0)