
Commit 36cbcee

vchuravy authored and maleadt committed
Add kernelabstractions based mapreduce implementation
1 parent b97643c · commit 36cbcee

File tree

1 file changed: +182 −4 lines


src/host/mapreduce.jl

+182 −4
@@ -2,10 +2,10 @@
 
 const AbstractArrayOrBroadcasted = Union{AbstractArray,Broadcast.Broadcasted}
 
-# GPUArrays' mapreduce methods build on `Base.mapreducedim!`, but with an additional
-# argument `init` value to avoid eager initialization of `R` (if set to something).
-mapreducedim!(f, op, R::AnyGPUArray, A::AbstractArrayOrBroadcasted;
-              init=nothing) = error("Not implemented") # COV_EXCL_LINE
+# # GPUArrays' mapreduce methods build on `Base.mapreducedim!`, but with an additional
+# # argument `init` value to avoid eager initialization of `R` (if set to something).
+# mapreducedim!(f, op, R::AnyGPUArray, A::AbstractArrayOrBroadcasted;
+#               init=nothing) = error("Not implemented") # COV_EXCL_LINE
 # resolve ambiguities
 Base.mapreducedim!(f, op, R::AnyGPUArray, A::AbstractArray) = mapreducedim!(f, op, R, A)
 Base.mapreducedim!(f, op, R::AnyGPUArray, A::Broadcast.Broadcasted) = mapreducedim!(f, op, R, A)
@@ -142,3 +142,181 @@ function Base.:(==)(A::AnyGPUArray, B::AnyGPUArray)
     res = mapreduce(mapper, reducer, A, B; init=(; is_missing=false, is_equal=true))
     res.is_missing ? missing : res.is_equal
 end
+
+@inline function reduce_group(@context, op, val::T, neutral, ::Val{maxitems}) where {T, maxitems}
+    items = @groupsize()[1]
+    item = @index(Local, Linear)
+
+    # local mem for a complete reduction
+    shared = @localmem T (maxitems,)
+    @inbounds shared[item] = val
+
+    # perform a reduction
+    d = 1
+    while d < items
+        @synchronize() # legal since cpu=false
+        index = 2 * d * (item-1) + 1
+        @inbounds if index <= items
+            other_val = if index + d <= items
+                shared[index+d]
+            else
+                neutral
+            end
+            shared[index] = op(shared[index], other_val)
+        end
+        d *= 2
+    end
+
+    # load the final value on the first item
+    if item == 1
+        val = @inbounds shared[item]
+    end
+
+    return val
+end
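For reference, the stride-doubling loop above can be modelled on the host with a plain sequential function (an illustrative sketch only; `reduce_group_model` is a hypothetical name and not part of this commit):

    function reduce_group_model(op, vals::Vector{T}, neutral::T) where {T}
        # simulate the group-local tree reduction: each pass folds elements that are
        # `d` apart, doubling the stride until index 1 holds the combined result
        shared = copy(vals)
        items = length(shared)
        d = 1
        while d < items
            for item in 1:items
                index = 2 * d * (item - 1) + 1
                if index <= items
                    other = index + d <= items ? shared[index + d] : neutral
                    shared[index] = op(shared[index], other)
                end
            end
            d *= 2
        end
        return shared[1]
    end

    reduce_group_model(+, collect(1:7), 0) == sum(1:7)  # true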
+
+Base.@propagate_inbounds _map_getindex(args::Tuple, I) = ((args[1][I]), _map_getindex(Base.tail(args), I)...)
+Base.@propagate_inbounds _map_getindex(args::Tuple{Any}, I) = ((args[1][I]),)
+Base.@propagate_inbounds _map_getindex(args::Tuple{}, I) = ()
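These `_map_getindex` helpers index every container in a tuple at the same position, which is how the kernel below applies `f` to several inputs at once. A quick illustration (hypothetical REPL snippet, not part of the commit):

    a = [1 2; 3 4]; b = [10 20; 30 40]
    _map_getindex((a, b), CartesianIndex(2, 1))  # (3, 30)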
+
+# Reduce an array across the grid. All elements to be processed can be addressed by the
+# product of the two iterators `Rreduce` and `Rother`, where the latter iterator will have
+# singleton entries for the dimensions that should be reduced (and vice versa).
+@kernel cpu=false function partial_mapreduce_device(f, op, neutral, maxitems, Rreduce, Rother, R, As...)
+    # decompose the 1D hardware indices into separate ones for reduction (across items
+    # and possibly groups if it doesn't fit) and other elements (remaining groups)
+    localIdx_reduce = @index(Local, Linear)
+    localDim_reduce = @groupsize()[1]
+    groupIdx_reduce, groupIdx_other = fldmod1(@index(Group, Linear), length(Rother))
+    numGroups = length(KernelAbstractions.blocks(KernelAbstractions.__iterspace(@context())))
+    groupDim_reduce = numGroups ÷ length(Rother)
+
+    # group-based indexing into the values outside of the reduction dimension
+    # (that means we can safely synchronize items within this group)
+    iother = groupIdx_other
+    @inbounds if iother <= length(Rother)
+        Iother = Rother[iother]
+
+        # load the neutral value
+        Iout = CartesianIndex(Tuple(Iother)..., groupIdx_reduce)
+        neutral = if neutral === nothing
+            R[Iout]
+        else
+            neutral
+        end
+
+        val = op(neutral, neutral)
+
+        # reduce serially across chunks of input vector that don't fit in a group
+        ireduce = localIdx_reduce + (groupIdx_reduce - 1) * localDim_reduce
+        while ireduce <= length(Rreduce)
+            Ireduce = Rreduce[ireduce]
+            J = max(Iother, Ireduce)
+            val = op(val, f(_map_getindex(As, J)...))
+            ireduce += localDim_reduce * groupDim_reduce
+        end
+
+        val = reduce_group(@context(), op, val, neutral, maxitems)
+
+        # write back to memory
+        if localIdx_reduce == 1
+            R[Iout] = val
+        end
+    end
+
+    return
+end
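To make the `Rreduce`/`Rother` decomposition concrete: for a 4×3 input reduced over its first dimension, the host code below builds the following index sets (illustrative sketch; values shown for plain `Array`s for readability):

    A = rand(4, 3); R = similar(A, 1, 3)   # reducing over dims=1
    Rother  = CartesianIndices(axes(R))    # 1×3 indices: the dimensions that are kept
    Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R), Ref(Base.OneTo(1)), axes(A)))  # 4×1: the reduced dimension
    length(Rother) * length(Rreduce) == length(A)   # true: their product addresses every element of A
    max(Rother[2], Rreduce[3])             # CartesianIndex(3, 2), i.e. the `J = max(Iother, Ireduce)` trick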
+
+## COV_EXCL_STOP
+
+function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyGPUArray{T}, A::AbstractArrayOrBroadcasted;
+                                 init=nothing) where {F, OP, T}
+    Base.check_reducedims(R, A)
+    length(A) == 0 && return R # isempty(::Broadcasted) iterates
+
+    # add singleton dimensions to the output container, if needed
+    if ndims(R) < ndims(A)
+        dims = Base.fill_to_length(size(R), 1, Val(ndims(A)))
+        R = reshape(R, dims)
+    end
+
+    # iteration domain, split in two: one part covers the dimensions that should
+    # be reduced, and the other covers the rest. combining both covers all values.
+    Rall = CartesianIndices(axes(A))
+    Rother = CartesianIndices(axes(R))
+    Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R), Ref(Base.OneTo(1)), axes(A)))
+    # NOTE: we hard-code `OneTo` (`first.(axes(A))` would work too) or we get a
+    #       CartesianIndices object with UnitRanges that behave badly on the GPU.
+    @assert length(Rall) == length(Rother) * length(Rreduce)
+
+    # allocate an additional, empty dimension to write the reduced value to.
+    # this does not affect the actual location in memory of the final values,
+    # but allows us to write a generalized kernel supporting partial reductions.
+    R′ = reshape(R, (size(R)..., 1))
+
+    # how many items do we want?
+    #
+    # items in a group work together to reduce values across the reduction dimensions;
+    # we want as many as possible to improve algorithm efficiency and execution occupancy.
+    wanted_items = length(Rreduce)
+    function compute_items(max_items)
+        if wanted_items > max_items
+            max_items
+        else
+            wanted_items
+        end
+    end
+
+    # how many items can we launch?
+    #
+    # we might not be able to launch all those items to reduce each slice in one go.
+    # that's why each item also loops across its inputs, processing multiple values
+    # so that we can span the entire reduction dimension using a single item group.
+
+    # group size is restricted by local memory
+    # max_lmem_elements = compute_properties(device()).maxSharedLocalMemory ÷ sizeof(T)
+    # max_items = min(compute_properties(device()).maxTotalGroupSize,
+    #                 compute_items(max_lmem_elements ÷ 2))
+    # TODO: dynamic local memory to avoid two compilations
+
+    # let the driver suggest a group size
+    # args = (f, op, init, Val(max_items), Rreduce, Rother, R′, A)
+    # kernel_args = kernel_convert.(args)
+    # kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
+    # kernel = zefunction(partial_mapreduce_device, kernel_tt)
+    # reduce_items = launch_configuration(kernel)
+    reduce_items = 512
+    reduce_kernel = partial_mapreduce_device(get_backend(R), (reduce_items,))
+
+    # how many groups should we launch?
+    #
+    # even though we can always reduce each slice in a single item group, that may not be
+    # optimal as it might not saturate the GPU. we already launch some groups to process
+    # independent dimensions in parallel; pad that number to ensure full occupancy.
+    other_groups = length(Rother)
+    reduce_groups = cld(length(Rreduce), reduce_items)
+
+    # determine the launch configuration
+    items = reduce_items
+    groups = reduce_groups*other_groups
+
+    ndrange = groups*items
+
+    # perform the actual reduction
+    if reduce_groups == 1
+        # we can cover the dimensions to reduce using a single group
+        reduce_kernel(f, op, init, Val(items), Rreduce, Rother, R′, A; ndrange)
+    else
+        # we need multiple steps to cover all values to reduce
+        partial = similar(R, (size(R)..., reduce_groups))
+        if init === nothing
+            # without an explicit initializer we need to copy from the output container
+            partial .= R
+        end
+        reduce_kernel(f, op, init, Val(items), Rreduce, Rother, partial, A; ndrange)
+
+        GPUArrays.mapreducedim!(identity, op, R′, partial; init=init)
+    end
+
+    return R
+end
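Taken together, `sum`, `maximum`, and friends on a GPU array end up in the method above via `Base.mapreducedim!`; when the reduction dimension is longer than the 512-item group size, the `else` branch materializes per-group partial results and recurses once more to combine them. A usage sketch (`AT` is a placeholder for some GPUArrays-backed array type provided by a backend package, not something defined in this commit):

    A = AT(rand(Float32, 1024, 1024))

    sum(A; dims=2)           # partial reduction over the second dimension
    maximum(abs, A; dims=1)  # mapreduce with f = abs, op = max

    # calling the entry point directly, with an explicit neutral element:
    R = AT(zeros(Float32, 1, 1024))
    GPUArrays.mapreducedim!(abs, +, R, A; init=0f0)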

0 commit comments
