@@ -5,6 +5,7 @@ import CUDA
 import Adapt
 import ClimaComms
 import ClimaComms: CUDADevice, threaded
+import ClimaComms: OneInterdependentItem, MultipleInterdependentItems
 
 function ClimaComms._assign_device(::CUDADevice, rank_number)
     CUDA.device!(rank_number % CUDA.ndevices())
@@ -50,17 +51,29 @@ ClimaComms.elapsed(f::F, ::CUDADevice, args...; kwargs...) where {F} =
 ClimaComms.assert(::CUDADevice, cond::C, text::T) where {C, T} =
     isnothing(text) ? (CUDA.@cuassert cond()) : (CUDA.@cuassert cond() text())
 
-# TODO: Generalize all of the following code to multi-dimensional thread blocks
-# and multiple iterators.
+ClimaComms.synchronize_gpu_threads(::CUDADevice) = CUDA.sync_threads()
 
-# The number of threads in the kernel being executed by the calling thread.
-threads_in_kernel() = CUDA.blockDim().x * CUDA.gridDim().x
+ClimaComms.static_shared_memory_array(
+    ::CUDADevice,
+    ::Type{T},
+    dims...,
+) where {T} = CUDA.CuStaticSharedArray(T, dims)
+
+# Number of blocks in kernel being executed and index of calling thread's block.
+blocks_in_kernel() = CUDA.gridDim().x
+block_idx_in_kernel() = CUDA.blockIdx().x
 
-# The index of the calling thread, which is between 1 and threads_in_kernel().
-thread_index() =
+# Number of threads in each block of kernel being executed and index of calling
+# thread within its block.
+threads_in_block() = CUDA.blockDim().x
+thread_idx_in_block() = CUDA.threadIdx().x
+
+# Total number of threads in kernel being executed and index of calling thread.
+threads_in_kernel() = CUDA.blockDim().x * CUDA.gridDim().x
+thread_idx_in_kernel() =
     (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
 
-# The maximum number of blocks that can fit on the GPU used for this kernel.
+# Maximum number of blocks that can fit on the GPU used for this kernel.
 grid_size_limit(kernel) = CUDA.attribute(
     CUDA.device(kernel.fun.mod.ctx),
     CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
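
The two wrappers added above expose block-level barriers and static shared memory through ClimaComms. As a minimal sketch of how a kernel body might combine them with the index helpers from this hunk, here is a hypothetical block-wide sum; the tree-reduction pattern and the 1024-slot scratch size are illustrative assumptions, not part of this diff:

# Hypothetical kernel body: sum one Float32 per thread across a block,
# assuming threads_in_block() <= 1024 and `device` is a CUDADevice.
function block_sum_sketch(device, value::Float32)
    scratch = ClimaComms.static_shared_memory_array(device, Float32, 1024)
    i = thread_idx_in_block()
    scratch[i] = value
    ClimaComms.synchronize_gpu_threads(device)  # all writes are now visible
    stride = 1
    while stride < threads_in_block()
        if (i - 1) % (2 * stride) == 0 && i + stride <= threads_in_block()
            scratch[i] += scratch[i + stride]
        end
        ClimaComms.synchronize_gpu_threads(device)
        stride *= 2
    end
    return scratch[1]  # every thread returns the block-wide sum
end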
@@ -73,22 +86,22 @@ block_size_limit(max_threads_in_block::Int, _) = max_threads_in_block
 block_size_limit(::Val{:auto}, kernel) =
     CUDA.launch_configuration(kernel.fun).threads
 
-function threaded(f::F, ::CUDADevice, ::Val, itr; block_size) where {F}
+function threaded(f::F, device::CUDADevice, ::Val, itr; block_size) where {F}
     length(itr) > 0 || return nothing
     Base.require_one_based_indexing(itr)
 
-    function call_f_once_from_thread()
-        item_index = thread_index()
-        item_index <= length(itr) && @inbounds f(itr[item_index])
+    function thread_function()
+        itr_index = thread_idx_in_kernel()
+        itr_index <= length(itr) && @inbounds f(itr[itr_index])
         return nothing
     end
-    kernel = CUDA.@cuda launch = false call_f_once_from_thread()
+    kernel = CUDA.@cuda launch = false thread_function()
     max_blocks = grid_size_limit(kernel)
     max_threads_in_block = block_size_limit(block_size, kernel)
 
     # If there are too many items, coarsen by the smallest possible amount.
     length(itr) <= max_blocks * max_threads_in_block ||
-        return threaded(f, CUDADevice(), 1, itr)
+        return threaded(f, device, 1, itr; block_size)
 
     threads_in_block = min(max_threads_in_block, length(itr))
     blocks = cld(length(itr), threads_in_block)
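
A worked example of the launch shape computed at the end of this hunk, with illustrative numbers:

# Suppose length(itr) == 10_000 and block_size_limit returns 1024:
threads_in_block = min(1024, 10_000)    # 1024
blocks = cld(10_000, threads_in_block)  # cld(10_000, 1024) == 10
# The kernel launches 10 * 1024 = 10_240 threads; the 240 excess threads fail
# the bounds check in thread_function() and return without calling f.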
@@ -102,17 +115,18 @@ function threaded(
     itr;
     block_size,
 ) where {F}
-    min_items_in_thread > 0 || throw(ArgumentError("`coarsen` is not positive"))
+    min_items_in_thread > 0 ||
+        throw(ArgumentError("integer `coarsen` value must be positive"))
     length(itr) > 0 || return nothing
     Base.require_one_based_indexing(itr)
 
     # Maximize memory coalescing with a "grid-stride loop"; for reference, see
     # https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops
-    call_f_multiple_times_from_thread() =
-        for item_index in thread_index():threads_in_kernel():length(itr)
-            @inbounds f(itr[item_index])
+    coarsened_thread_function() =
+        for itr_index in thread_idx_in_kernel():threads_in_kernel():length(itr)
+            @inbounds f(itr[itr_index])
         end
-    kernel = CUDA.@cuda launch = false call_f_multiple_times_from_thread()
+    kernel = CUDA.@cuda launch = false coarsened_thread_function()
     max_blocks = grid_size_limit(kernel)
     max_threads_in_block = block_size_limit(block_size, kernel)
 
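
To see how the grid-stride loop above assigns items, consider a hypothetical launch of 8 total threads over an iterator of length 20. This CPU-side check (illustrative names, not part of the diff) mirrors the index arithmetic in coarsened_thread_function:

grid_stride_indices(thread_idx; total_threads = 8, n = 20) =
    collect(thread_idx:total_threads:n)

@assert grid_stride_indices(1) == [1, 9, 17]
@assert grid_stride_indices(3) == [3, 11, 19]
@assert grid_stride_indices(8) == [8, 16]
# On each pass through the loop, threads 1..8 touch consecutive indices, which
# coalesces their memory accesses.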
@@ -129,4 +143,137 @@ function threaded(
     CUDA.@sync kernel(; blocks, threads = threads_in_block)
 end
 
+function threaded(
+    f::F,
+    device::CUDADevice,
+    ::Union{Val, NTuple{2, Val}},
+    independent_itr,
+    interdependent_itr;
+    block_size,
+) where {F}
+    length(independent_itr) > 0 || return nothing
+    length(interdependent_itr) > 0 || return nothing
+    Base.require_one_based_indexing(independent_itr)
+    Base.require_one_based_indexing(interdependent_itr)
+
+    function two_itr_thread_function()
+        block_index = block_idx_in_kernel()
+        thread_index = thread_idx_in_block()
+        (
+            block_index <= length(independent_itr) &&
+            thread_index <= length(interdependent_itr)
+        ) && @inbounds f(
+            independent_itr[block_index],
+            OneInterdependentItem(interdependent_itr[thread_index], device),
+        )
+        return nothing
+    end
+    kernel = CUDA.@cuda launch = false two_itr_thread_function()
+    max_blocks = grid_size_limit(kernel)
+    max_threads_in_block = block_size_limit(block_size, kernel)
+
+    # If there are too many items, coarsen by the smallest possible amount.
+    (
+        length(independent_itr) <= max_blocks &&
+        length(interdependent_itr) <= max_threads_in_block
+    ) || return threaded(
+        f,
+        device,
+        (1, 1),
+        independent_itr,
+        interdependent_itr;
+        block_size,
+    )
+
+    blocks = length(independent_itr)
+    threads_in_block = length(interdependent_itr)
+    CUDA.@sync kernel(; blocks, threads = threads_in_block)
+end
+
+# Use a default coarsen value of 1 for either iterator when a value is needed.
+threaded(
+    f::F,
+    device::CUDADevice,
+    min_independent_items_in_thread::Int,
+    independent_itr,
+    interdependent_itr;
+    block_size,
+) where {F} = threaded(
+    f,
+    device,
+    (min_independent_items_in_thread, 1),
+    independent_itr,
+    interdependent_itr;
+    block_size,
+)
+threaded(
+    f::F,
+    device::CUDADevice,
+    min_items_in_thread::Tuple{Val, Int},
+    independent_itr,
+    interdependent_itr;
+    block_size,
+) where {F} = threaded(
+    f,
+    device,
+    (1, min_items_in_thread[2]),
+    independent_itr,
+    interdependent_itr;
+    block_size,
+)
+
+function threaded(
+    f::F,
+    device::CUDADevice,
+    min_items_in_thread::NTuple{2, Int},
+    independent_itr,
+    interdependent_itr;
+    block_size,
+) where {F}
+    (min_items_in_thread[1] > 0 && min_items_in_thread[2] > 0) ||
+        throw(ArgumentError("all integer `coarsen` values must be positive"))
+    length(independent_itr) > 0 || return nothing
+    length(interdependent_itr) > 0 || return nothing
+    Base.require_one_based_indexing(independent_itr)
+    Base.require_one_based_indexing(interdependent_itr)
+
+    # Maximize memory coalescing with a "grid-stride loop" (reference is above).
+    function coarsened_two_itr_thread_function()
+        independent_itr_indices =
+            block_idx_in_kernel():blocks_in_kernel():length(independent_itr)
+        interdependent_itr_indices =
+            thread_idx_in_block():threads_in_block():length(interdependent_itr)
+        for independent_itr_index in independent_itr_indices
+            @inbounds f(
+                independent_itr[independent_itr_index],
+                MultipleInterdependentItems(
+                    interdependent_itr,
+                    interdependent_itr_indices,
+                    device,
+                ),
+            )
+        end
+    end
+    kernel = CUDA.@cuda launch = false coarsened_two_itr_thread_function()
+    max_blocks = grid_size_limit(kernel)
+    max_threads_in_block = block_size_limit(block_size, kernel)
+
+    # If there are too many items to use the specified coarsening, increase it
+    # by the smallest possible amount.
+    max_required_blocks = cld(length(independent_itr), min_items_in_thread[1])
+    max_required_threads_in_block =
+        cld(length(interdependent_itr), min_items_in_thread[2])
+    items_in_thread = (
+        max_required_blocks <= max_blocks ? min_items_in_thread[1] :
+        cld(length(independent_itr), max_blocks),
+        max_required_threads_in_block <= max_threads_in_block ?
+        min_items_in_thread[2] :
+        cld(length(interdependent_itr), max_threads_in_block),
+    )
+
+    blocks = cld(length(independent_itr), items_in_thread[1])
+    threads_in_block = cld(length(interdependent_itr), items_in_thread[2])
+    CUDA.@sync kernel(; blocks, threads = threads_in_block)
+end
+
 end
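
A minimal usage sketch of the new two-iterator methods. The kernel body `g`, the iterators, and their sizes are hypothetical; this diff only defines the launch logic, and the API for consuming the item wrappers lives elsewhere in ClimaComms:

device = CUDADevice()
columns = 1:2048  # independent items: one per block
levels = 1:64     # interdependent items: one per thread within a block
threaded(device, 1, columns, levels; block_size = Val(:auto)) do column, level_items
    # With integer coarsening, level_items is a MultipleInterdependentItems
    # wrapper; the Val-based method passes a OneInterdependentItem instead. A
    # real kernel body would read and write data for this column at these
    # levels, using synchronize_gpu_threads and static_shared_memory_array to
    # cooperate with the other threads in the block.
    g(column, level_items)  # hypothetical kernel body
end

Because all interdependent items of one independent item land in the same block, the kernel body can rely on block-level barriers and shared memory; the final method above only increases the per-thread coarsening when the iterators exceed the GPU's grid or block limits.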