Skip to content

Commit 8273eb1

Browse files
committed
Annotate all examples, function to generate just JSON, small improvements
For every example there is now an annotated version (which is also in the makefile) and the original version of that code. (It is mostly sufficient to change about 12 lines of code). The cuda_mav class has been improved, by being able to directly produce JSON files and strings as output. The currentSize and originalSize have been added to the output, so you know if it was not possible to store all data which was actually accessed. Instead of a tuple, a dedicated struct is now used for memory regions. Templates have been moved to their own dedicated directory. To reduce git load times a correct gitignore was already added to the html folder. This will be solved later by the merge into master anyways, but eases local development.
1 parent 60afcc9 commit 8273eb1

File tree

11 files changed

+618
-111
lines changed

11 files changed

+618
-111
lines changed

examples/basic/CMakeLists.txt

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,6 @@ set_target_properties(${_TARGET_NORMAL} PROPERTIES
1313
# add include path to the cuda_mav.cuh header
1414
target_link_libraries(${_TARGET_NORMAL} PRIVATE CUDA_MAV_HEADER)
1515

16-
set(_TARGET_ANNOTATED annotated)
17-
18-
add_executable(${_TARGET_ANNOTATED})
19-
target_sources(${_TARGET_ANNOTATED}
20-
PRIVATE
21-
annotated.cu)
22-
set_target_properties(${_TARGET_ANNOTATED} PROPERTIES
23-
CUDA_CXX_STANDARD 17
24-
)
25-
2616
set(_TARGET_RANDOM_2D_MATRIX random2dMatrix)
2717

2818
add_executable(${_TARGET_RANDOM_2D_MATRIX})

examples/basic/normal.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ int main(){
4949
checkCudaError(cudaGetLastError());
5050
cudaDeviceSynchronize();
5151

52-
memAccessStorage->generateOutput("../../../html/template.json", "../../../out/basic.json", CudaMemAccessStorage<int>::parseDataForJSPage);
52+
memAccessStorage->generateTemplatedOutput("../../../templates/basic_template.html", "../../../out/basic_html.html",
53+
CudaMemAccessStorage<int>::parseDataForStaticHTML);
5354

5455
checkCudaError(cudaMemcpy(h_output.data(), d_output, sizeof(int)*prob_size, cudaMemcpyDeviceToHost));
5556

examples/basic/normal_original.cu

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#include <vector>
2+
#include <numeric>
3+
#include <iostream>
4+
5+
inline void checkCudaError(cudaError_t err) {
6+
if (err != cudaSuccess) {
7+
std::cerr << "\rCuda Error " << err << ": " << cudaGetErrorString(err) << std::endl;
8+
std::cerr << "Aborting..." << std::endl;
9+
exit(1);
10+
}
11+
}
12+
13+
14+
__global__ void kernel(int prob_size, int * const input, int * output){
15+
int id = threadIdx.x + blockIdx.x * blockDim.x;
16+
if(id < prob_size){
17+
output[id] = input[id];
18+
}
19+
}
20+
21+
int main(){
22+
constexpr int prob_size = 100;
23+
24+
std::vector<int> h_input(prob_size);
25+
std::iota(h_input.begin(), h_input.end(), 0);
26+
int * d_input = nullptr;
27+
checkCudaError(cudaMalloc((void**) &d_input, sizeof(int)*prob_size));
28+
29+
std::vector<int> h_output(prob_size, 0);
30+
int * d_output = nullptr;
31+
checkCudaError(cudaMalloc((void**) &d_output, sizeof(int)*prob_size));
32+
33+
checkCudaError(cudaMemcpy(d_input, h_input.data(), sizeof(int)* prob_size, cudaMemcpyHostToDevice));
34+
35+
constexpr int threads = 32;
36+
constexpr int blocks = (prob_size/threads)+1;
37+
38+
kernel<<<blocks, threads>>>(prob_size, d_input, d_output);
39+
checkCudaError(cudaGetLastError());
40+
41+
checkCudaError(cudaMemcpy(h_output.data(), d_output, sizeof(int)*prob_size, cudaMemcpyDeviceToHost));
42+
43+
for(auto i = 0; i < h_input.size(); ++i){
44+
if(h_input[i] != h_output[i]){
45+
std::cerr << "Element at position " << i << "is not equal (input - output): " << h_input[i] << " != " << h_output[i] << std::endl;
46+
std::exit(1);
47+
}
48+
}
49+
50+
checkCudaError(cudaFree(d_input));
51+
checkCudaError(cudaFree(d_output));
52+
53+
std::cout << "kernel finished successful" << std::endl;
54+
return 0;
55+
}

examples/basic/random2dMatrix.cu

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <vector>
66
#include <random>
77
#include <algorithm>
8+
#include "../../include/cuda_mav.cuh"
89

910
// The wrapper macro is required so that __LINE__ correctly points to the line where the check fails
1011
#define checkCudaError(ans) \
@@ -22,17 +23,17 @@ inline void checkCudaErrorFunc(cudaError_t err, const char *file, int line)
2223
}
2324
}
2425

25-
__global__ void decrement(unsigned int const size, unsigned int *data, unsigned int *control)
26+
__global__ void decrement(unsigned int const size, CudaMemAccessLogger<unsigned int> *data, CudaMemAccessLogger<unsigned int> *control)
2627
{
2728
int index = threadIdx.x + blockIdx.x * blockDim.x;
2829
int stride = blockDim.x * gridDim.x;
2930

3031
for (int i = index; i < size; i += stride)
3132
{
32-
while (data[i] > 0)
33+
while ((*data)[i] > 0)
3334
{
34-
data[i] = data[i] - 1;
35-
control[i] = control[i] + 1;
35+
(*data)[i] = (*data)[i] - 1;
36+
(*control)[i] = (*control)[i] + 1;
3637
}
3738
}
3839
}
@@ -82,10 +83,10 @@ int main(int argc, char **argv)
8283
{ return distribution(generator); });
8384

8485
// enable me, to create a hot spot area
85-
// the hot spot area should looks interessting in the memory access visualization
86-
if (false)
86+
// the hot spot area should look interesting in the memory access visualization
87+
if (true)
8788
{
88-
hot_spot(h_data, dim, 8, 10, 10, 3);
89+
hot_spot(h_data, dim, 8, 10, 10, 20);
8990
}
9091

9192
// enable me, to print the matrix
@@ -120,15 +121,27 @@ int main(int argc, char **argv)
120121
// copy h_control to initialize all values with 0 on the GPU
121122
checkCudaError(cudaMemcpy(d_control, h_control.data(), buffer_size_byte, cudaMemcpyHostToDevice));
122123

124+
// Define amount of accesses you want to log and create a memory object which stores them
125+
auto* memAccessStorage = new CudaMemAccessStorage<unsigned int>(dim * dim * 50);
126+
127+
// Wrap the data classes with the custom logging class
128+
auto* data = new CudaMemAccessLogger<unsigned int>(d_data, dim*dim, "Decremented Data", memAccessStorage);
129+
auto* control = new CudaMemAccessLogger<unsigned int>(d_control, dim*dim, "Control Data", memAccessStorage);
130+
123131
// change me and see how the visualization looks
124132
int const blockSize = 32;
125133
int const numBlocks = ((dim * dim) + blockSize - 1) / blockSize;
126134

127-
decrement<<<numBlocks, blockSize>>>(dim * dim, d_data, d_control);
135+
decrement<<<numBlocks, blockSize>>>(dim * dim, data, control);
136+
128137
checkCudaError(cudaGetLastError());
129138

130139
checkCudaError(cudaMemcpy(h_control.data(), d_control, buffer_size_byte, cudaMemcpyDeviceToHost));
131140

141+
// Generate the output json file for visualization
142+
// It needs to be below the memory copy or alternatively after a synchronize to make sure the kernel is finished
143+
memAccessStorage->generateJSONOutput("../../../out/random2DMatrix.json");
144+
132145
bool success = true;
133146

134147
for (auto y = 0; y < dim; ++y)
@@ -167,5 +180,10 @@ int main(int argc, char **argv)
167180
checkCudaError(cudaFree(d_data));
168181
checkCudaError(cudaFree(d_control));
169182

183+
// Free the memory of the custom logging classes
184+
delete memAccessStorage;
185+
delete data;
186+
delete control;
187+
170188
return 0;
171189
}
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
// The application creates a 2D matrix and initializes each element randomly with a value between 0 and 10.
2+
// The kernel is simply decrementing each element until 0 in a very ineffective way.
3+
4+
#include <iostream>
5+
#include <vector>
6+
#include <random>
7+
#include <algorithm>
8+
9+
// The wrapper macro is required so that __LINE__ correctly points to the line where the check fails
10+
#define checkCudaError(ans) \
11+
{ \
12+
checkCudaErrorFunc((ans), __FILE__, __LINE__); \
13+
}
14+
15+
inline void checkCudaErrorFunc(cudaError_t err, const char *file, int line)
16+
{
17+
if (err != cudaSuccess)
18+
{
19+
std::cout << "\r" << file << ":" << line << " -> Cuda Error " << err << ": " << cudaGetErrorString(err) << std::endl;
20+
std::cout << "Aborting..." << std::endl;
21+
exit(0);
22+
}
23+
}
24+
25+
__global__ void decrement(unsigned int const size, unsigned int *data, unsigned int *control)
26+
{
27+
int index = threadIdx.x + blockIdx.x * blockDim.x;
28+
int stride = blockDim.x * gridDim.x;
29+
30+
for (int i = index; i < size; i += stride)
31+
{
32+
while (data[i] > 0)
33+
{
34+
data[i] = data[i] - 1;
35+
control[i] = control[i] + 1;
36+
}
37+
}
38+
}
39+
40+
/// @brief Increment all values in a specific area by the value of increment. The maximum value of an entry is clamp to 10.
41+
/// @param data Data to increment.
42+
/// @param dim Dimensions of the 2D matrix.
43+
/// @param y_start Y start coordinate of the area to increment.
44+
/// @param x_start X start coordinate of the area to increment.
45+
/// @param size Size of the Y and X direction of the area to increment.
46+
/// @param increment Value to increment.
47+
void hot_spot(std::vector<unsigned int> &data, unsigned int const dim, unsigned int const y_start, unsigned int const x_start, unsigned int const size, unsigned int const increment)
48+
{
49+
for (auto y = y_start; y < y_start + size; ++y)
50+
{
51+
for (auto x = x_start; x < x_start + size; ++x)
52+
{
53+
if (data[y * dim + x] + increment > 10)
54+
{
55+
data[y * dim + x] = 10;
56+
}
57+
else
58+
{
59+
data[y * dim + x] += increment;
60+
}
61+
}
62+
}
63+
}
64+
65+
int main(int argc, char **argv)
66+
{
67+
unsigned int dim = 100;
68+
69+
std::vector<unsigned int> h_data(dim * dim);
70+
// create a 2D matrix where all elements are 0
71+
std::vector<unsigned int> h_control(dim * dim, 0);
72+
73+
// initialize data matrix with random numbers between 0 and 10
74+
std::uniform_int_distribution<unsigned int> distribution(
75+
0,
76+
10);
77+
std::default_random_engine generator;
78+
std::generate(
79+
h_data.begin(),
80+
h_data.end(),
81+
[&distribution, &generator]()
82+
{ return distribution(generator); });
83+
84+
// enable me, to create a hot spot area
85+
// the hot spot area should look interesting in the memory access visualization
86+
if (false)
87+
{
88+
hot_spot(h_data, dim, 8, 10, 10, 3);
89+
}
90+
91+
// enable me, to print the matrix
92+
if (false)
93+
{
94+
for (auto y = 0; y < dim; ++y)
95+
{
96+
for (auto x = 0; x < dim; ++x)
97+
{
98+
if (h_data[y * dim + x] < 10)
99+
{
100+
std::cout << " " << h_data[y * dim + x] << " ";
101+
}
102+
else
103+
{
104+
std::cout << h_data[y * dim + x] << " ";
105+
}
106+
}
107+
std::cout << std::endl;
108+
}
109+
}
110+
111+
unsigned int *d_data = nullptr;
112+
unsigned int *d_control = nullptr;
113+
114+
size_t const buffer_size_byte = sizeof(unsigned int) * dim * dim;
115+
116+
checkCudaError(cudaMalloc((void **)&d_data, buffer_size_byte));
117+
checkCudaError(cudaMalloc((void **)&d_control, buffer_size_byte));
118+
119+
checkCudaError(cudaMemcpy(d_data, h_data.data(), buffer_size_byte, cudaMemcpyHostToDevice));
120+
// copy h_control to initialize all values with 0 on the GPU
121+
checkCudaError(cudaMemcpy(d_control, h_control.data(), buffer_size_byte, cudaMemcpyHostToDevice));
122+
123+
// change me and see how the visualization looks
124+
int const blockSize = 32;
125+
int const numBlocks = ((dim * dim) + blockSize - 1) / blockSize;
126+
127+
decrement<<<numBlocks, blockSize>>>(dim * dim, d_data, d_control);
128+
checkCudaError(cudaGetLastError());
129+
130+
checkCudaError(cudaMemcpy(h_control.data(), d_control, buffer_size_byte, cudaMemcpyDeviceToHost));
131+
132+
bool success = true;
133+
134+
for (auto y = 0; y < dim; ++y)
135+
{
136+
for (auto x = 0; x < dim; ++x)
137+
{
138+
if (h_control[y * dim + x] != h_data[y * dim + x])
139+
{
140+
std::cout << "h_control[" << y << ", " << x << "] != h_data[" << y << ", " << x << "]" << std::endl;
141+
std::cout << h_control[y * dim + x] << " != " << h_data[y * dim + x] << std::endl;
142+
success = false;
143+
}
144+
}
145+
}
146+
147+
checkCudaError(cudaMemcpy(h_data.data(), d_data, buffer_size_byte, cudaMemcpyDeviceToHost));
148+
149+
for (auto y = 0; y < dim; ++y)
150+
{
151+
for (auto x = 0; x < dim; ++x)
152+
{
153+
if (h_data[y * dim + x] != 0)
154+
{
155+
std::cout << "h_data[" << y << ", " << x << "] != 0" << std::endl;
156+
std::cout << "value is: " << h_data[y * dim + x] << std::endl;
157+
success = false;
158+
}
159+
}
160+
}
161+
162+
if (success)
163+
{
164+
std::cout << "The kernel worked correctly" << std::endl;
165+
}
166+
167+
checkCudaError(cudaFree(d_data));
168+
checkCudaError(cudaFree(d_control));
169+
170+
return 0;
171+
}

0 commit comments

Comments
 (0)