@@ -23,6 +23,7 @@ inline void checkCudaErrorFunc(cudaError_t err, const char *file, int line)
// Each logical block is reduced to a single element.
// A grid stride loop maps the logical blocks to cuda blocks (both have the same size).
// The output array has one element per logical block.
+ // Uses only global memory.
__global__ void reduce_gm(unsigned int const size, unsigned int *const input, unsigned int *const output)
{
    int const id = threadIdx.x + blockIdx.x * blockDim.x;
@@ -48,7 +49,49 @@ __global__ void reduce_gm(unsigned int const size, unsigned int *const input, un
    }
}

- // Helper function -> should be replaced by html visualization ;-)
+ // The reduction algorithms divide all elements into logical blocks whose size is the number of threads.
+ // Each logical block is reduced to a single element.
+ // A grid stride loop maps the logical blocks to cuda blocks (both have the same size).
+ // The output array has one element per logical block.
+ // Uses shared memory to speed up the reduction.
+ template <auto LOGICAL_BLOCK_SIZE>
+ __global__ void reduce_sm(unsigned int const size, unsigned int const upper_bound_size, unsigned int *const input, unsigned int *const output)
+ {
+     __shared__ unsigned int reduction_memory[LOGICAL_BLOCK_SIZE];
+
+     int const id = threadIdx.x + blockIdx.x * blockDim.x;
+     int const stride = blockDim.x * gridDim.x;
+     // use grid stride loop to distribute the logical blocks to cuda blocks.
+     for (int block_offset_id = id, virtual_block_id = blockIdx.x; block_offset_id < upper_bound_size; block_offset_id += stride, virtual_block_id += gridDim.x)
+     {
+         if (block_offset_id < size)
+         {
+             reduction_memory[threadIdx.x] = input[block_offset_id];
+         }
+         else
+         {
+             reduction_memory[threadIdx.x] = 0;
+         }
+         // reduce all elements of the logical block to a single element.
+         __syncthreads();
+         for (int max_threads_blocks = blockDim.x / 2; max_threads_blocks > 0; max_threads_blocks /= 2)
+         {
+             if (threadIdx.x < max_threads_blocks)
+             {
+                 reduction_memory[threadIdx.x] += reduction_memory[threadIdx.x + max_threads_blocks];
+             }
+             __syncthreads();
+         }
+
+         if (threadIdx.x == 0)
+         {
+             // write single element to output
+             output[virtual_block_id] = reduction_memory[0];
+         }
+     }
+ }
+
+ // Helper function -> should be replaced by html visualization ;-)
template <typename T>
void print_vec(std::vector<T> vec)
{
@@ -61,16 +104,16 @@ void print_vec(std::vector<T> vec)

int main(int argc, char **argv)
{
-     int const blocks = 10;
-     int const threads = 32;
+     int constexpr blocks = 10;
+     int constexpr threads = 32;

    // number of input elements
-     unsigned int const size = 1000;
+     unsigned int const size = 1632;
    size_t const data_size_byte = sizeof(unsigned int) * size;

    // number of logical blocks
    size_t output_elements = size / threads;
-     // add an extra element, if the logical blocks do not fit in the cuda blocks
+     // add an extra element, if the logical blocks do not fit in the cuda blocks
    output_elements += (size % threads == 0) ? 0 : 1;
    size_t const output_size_byte = sizeof(unsigned int) * output_elements;

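With the values used in this commit (size = 1632, threads = 32, blocks = 10), 1632 / 32 = 51, so output_elements is 51 and no extra element is added. The grid stride loop then maps these 51 logical blocks onto the 10 cuda blocks (cuda block 0 walks 6 logical blocks, blocks 1-9 walk 5 each), and the host later sums the 51 partial results.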
@@ -98,15 +141,30 @@ int main(int argc, char **argv)
    checkCudaError(cudaMalloc((void **)&d_output, output_size_byte));
    checkCudaError(cudaMemcpy(d_data, h_data.data(), data_size_byte, cudaMemcpyHostToDevice));

-     reduce_gm<<<blocks, threads>>>(size, d_data, d_output);
+     bool const sm_version = false;
+
+     if (!sm_version)
+     {
+         if (size % threads)
+         {
+             std::cerr << "size needs to be a multiple of the number of threads" << std::endl;
+             exit(1);
+         }
+         reduce_gm<<<blocks, threads>>>(size, d_data, d_output);
+     }
+     else
+     {
+         size_t const upper_bound_size = output_elements * threads;
+         reduce_sm<threads><<<blocks, threads>>>(size, upper_bound_size, d_data, d_output);
+     }
    checkCudaError(cudaGetLastError());

    checkCudaError(cudaMemcpy(h_output.data(), d_output, output_size_byte, cudaMemcpyDeviceToHost));

    unsigned int sum = 0;

    // Reduce all sums of the logical blocks on the CPU.
-     // Otherwise a second kernel or cuda cooperative groups are required to perform block synchronization.
+     // Otherwise a second kernel or cuda cooperative groups are required to perform block synchronization.
    for (unsigned int const v : h_output)
    {
        sum += v;
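The comment about block synchronization hints at the alternative to the CPU loop: cuda cooperative groups provide a grid-wide barrier, so the partial sums could be combined in the same kernel. A minimal sketch of that idea follows; it is not part of this commit, the kernel name and parameters (reduce_partials_cg, num_partials, partials, result) are hypothetical, and such a kernel must be launched with cudaLaunchCooperativeKernel with all blocks resident on the device at once.

// Hypothetical sketch: combine the per-block partial sums on the device via a
// grid-wide barrier from cooperative groups instead of the CPU loop above.
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void reduce_partials_cg(unsigned int const num_partials,
                                   unsigned int *const partials,
                                   unsigned int *const result)
{
    // in a real single-kernel version, the per-block reduction (as in reduce_sm)
    // would run here first and write one partial sum per logical block to partials.
    cg::grid_group grid = cg::this_grid();
    grid.sync(); // grid-wide barrier: the writes of all blocks are now visible.

    // a single thread accumulates the partial sums into the final result.
    if (blockIdx.x == 0 && threadIdx.x == 0)
    {
        unsigned int sum = 0;
        for (unsigned int i = 0; i < num_partials; ++i)
        {
            sum += partials[i];
        }
        *result = sum;
    }
}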