 
 const AbstractArrayOrBroadcasted = Union{AbstractArray,Broadcast.Broadcasted}
 
-# GPUArrays' mapreduce methods build on `Base.mapreducedim!`, but with an additional
-# argument `init` value to avoid eager initialization of `R` (if set to something).
-mapreducedim!(f, op, R::AnyGPUArray, A::AbstractArrayOrBroadcasted;
-              init=nothing) = error("Not implemented") # COV_EXCL_LINE
+# # GPUArrays' mapreduce methods build on `Base.mapreducedim!`, but with an additional
+# # argument `init` value to avoid eager initialization of `R` (if set to something).
+# mapreducedim!(f, op, R::AnyGPUArray, A::AbstractArrayOrBroadcasted;
+#               init=nothing) = error("Not implemented") # COV_EXCL_LINE
 # resolve ambiguities
 Base.mapreducedim!(f, op, R::AnyGPUArray, A::AbstractArray) = mapreducedim!(f, op, R, A)
 Base.mapreducedim!(f, op, R::AnyGPUArray, A::Broadcast.Broadcasted) = mapreducedim!(f, op, R, A)
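For context, a minimal usage sketch (not part of this diff) of what the two forwarding methods above enable, assuming a working backend such as GPUArrays' JLArray reference array type: Base's in-place reductions call `Base.mapreducedim!`, which these definitions route to the GPU-specific `mapreducedim!` defined further down.

using GPUArrays, JLArrays   # JLArray is GPUArrays' CPU-based reference array type

A = JLArray(rand(Float32, 4, 50))
R = JLArray(zeros(Float32, 1, 50))

# Base's `sum!(R, A)` lowers (roughly) to `Base.mapreducedim!(identity, Base.add_sum, R, A)`;
# the two methods above catch that call and forward it to the GPU `mapreducedim!`.
sum!(R, A)

# the keyword form takes the same route via GPUArrays' `mapreduce` wrappers
sum(A; dims=1)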
@@ -142,3 +142,181 @@ function Base.:(==)(A::AnyGPUArray, B::AnyGPUArray)
     res = mapreduce(mapper, reducer, A, B; init=(; is_missing=false, is_equal=true))
     res.is_missing ? missing : res.is_equal
 end
+
+@inline function reduce_group(@context, op, val::T, neutral, ::Val{maxitems}) where {T, maxitems}
+    items = @groupsize()[1]
+    item = @index(Local, Linear)
+
+    # local mem for a complete reduction
+    shared = @localmem T (maxitems,)
+    @inbounds shared[item] = val
+
+    # perform a reduction
+    d = 1
+    while d < items
+        @synchronize() # legal since cpu=false
+        index = 2 * d * (item-1) + 1
+        @inbounds if index <= items
+            other_val = if index + d <= items
+                shared[index+d]
+            else
+                neutral
+            end
+            shared[index] = op(shared[index], other_val)
+        end
+        d *= 2
+    end
+
+    # load the final value on the first item
+    if item == 1
+        val = @inbounds shared[item]
+    end
+
+    return val
+end
+
+Base.@propagate_inbounds _map_getindex(args::Tuple, I) = ((args[1][I]), _map_getindex(Base.tail(args), I)...)
+Base.@propagate_inbounds _map_getindex(args::Tuple{Any}, I) = ((args[1][I]),)
+Base.@propagate_inbounds _map_getindex(args::Tuple{}, I) = ()
+
+# Reduce an array across the grid. All elements to be processed can be addressed by the
+# product of the two iterators `Rreduce` and `Rother`, where the latter iterator will have
+# singleton entries for the dimensions that should be reduced (and vice versa).
+@kernel cpu=false function partial_mapreduce_device(f, op, neutral, maxitems, Rreduce, Rother, R, As...)
+    # decompose the 1D hardware indices into separate ones for reduction (across items
+    # and possibly groups if it doesn't fit) and other elements (remaining groups)
+    localIdx_reduce = @index(Local, Linear)
+    localDim_reduce = @groupsize()[1]
+    groupIdx_reduce, groupIdx_other = fldmod1(@index(Group, Linear), length(Rother))
+    numGroups = length(KernelAbstractions.blocks(KernelAbstractions.__iterspace(@context())))
+    groupDim_reduce = numGroups ÷ length(Rother)
+
+    # group-based indexing into the values outside of the reduction dimension
+    # (that means we can safely synchronize items within this group)
+    iother = groupIdx_other
+    @inbounds if iother <= length(Rother)
+        Iother = Rother[iother]
+
+        # load the neutral value
+        Iout = CartesianIndex(Tuple(Iother)..., groupIdx_reduce)
+        neutral = if neutral === nothing
+            R[Iout]
+        else
+            neutral
+        end
+
+        val = op(neutral, neutral)
+
+        # reduce serially across chunks of input vector that don't fit in a group
+        ireduce = localIdx_reduce + (groupIdx_reduce - 1) * localDim_reduce
+        while ireduce <= length(Rreduce)
+            Ireduce = Rreduce[ireduce]
+            J = max(Iother, Ireduce)
+            val = op(val, f(_map_getindex(As, J)...))
+            ireduce += localDim_reduce * groupDim_reduce
+        end
+
+        val = reduce_group(@context(), op, val, neutral, maxitems)
+
+        # write back to memory
+        if localIdx_reduce == 1
+            R[Iout] = val
+        end
+    end
+
+    return
+end
+
+## COV_EXCL_STOP
+
+function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyGPUArray{T}, A::AbstractArrayOrBroadcasted;
+                                 init=nothing) where {F, OP, T}
+    Base.check_reducedims(R, A)
+    length(A) == 0 && return R # isempty(::Broadcasted) iterates
+
+    # add singleton dimensions to the output container, if needed
+    if ndims(R) < ndims(A)
+        dims = Base.fill_to_length(size(R), 1, Val(ndims(A)))
+        R = reshape(R, dims)
+    end
+
+    # iteration domain, split in two: one part covers the dimensions that should
+    # be reduced, and the other covers the rest. combining both covers all values.
+    Rall = CartesianIndices(axes(A))
+    Rother = CartesianIndices(axes(R))
+    Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R), Ref(Base.OneTo(1)), axes(A)))
+    # NOTE: we hard-code `OneTo` (`first.(axes(A))` would work too) or we get a
+    #       CartesianIndices object with UnitRanges that behave badly on the GPU.
+    @assert length(Rall) == length(Rother) * length(Rreduce)
+
+    # allocate an additional, empty dimension to write the reduced value to.
+    # this does not affect the actual location in memory of the final values,
+    # but allows us to write a generalized kernel supporting partial reductions.
+    R′ = reshape(R, (size(R)..., 1))
+
+    # how many items do we want?
+    #
+    # items in a group work together to reduce values across the reduction dimensions;
+    # we want as many as possible to improve algorithm efficiency and execution occupancy.
+    wanted_items = length(Rreduce)
+    function compute_items(max_items)
+        if wanted_items > max_items
+            max_items
+        else
+            wanted_items
+        end
+    end
+
+    # how many items can we launch?
+    #
+    # we might not be able to launch all those items to reduce each slice in one go.
+    # that's why each item also loops across its inputs, processing multiple values
+    # so that we can span the entire reduction dimension using a single item group.
+
+    # group size is restricted by local memory
+    # max_lmem_elements = compute_properties(device()).maxSharedLocalMemory ÷ sizeof(T)
+    # max_items = min(compute_properties(device()).maxTotalGroupSize,
+    #                 compute_items(max_lmem_elements ÷ 2))
+    # TODO: dynamic local memory to avoid two compilations
+
+    # let the driver suggest a group size
+    # args = (f, op, init, Val(max_items), Rreduce, Rother, R′, A)
+    # kernel_args = kernel_convert.(args)
+    # kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
+    # kernel = zefunction(partial_mapreduce_device, kernel_tt)
+    # reduce_items = launch_configuration(kernel)
+    reduce_items = 512
+    reduce_kernel = partial_mapreduce_device(get_backend(R), (reduce_items,))
+
+    # how many groups should we launch?
+    #
+    # even though we can always reduce each slice in a single item group, that may not be
+    # optimal as it might not saturate the GPU. we already launch some groups to process
+    # independent dimensions in parallel; pad that number to ensure full occupancy.
+    other_groups = length(Rother)
+    reduce_groups = cld(length(Rreduce), reduce_items)
+
+    # determine the launch configuration
+    items = reduce_items
+    groups = reduce_groups*other_groups
+
+    ndrange = groups*items
+
+    # perform the actual reduction
+    if reduce_groups == 1
+        # we can cover the dimensions to reduce using a single group
+        reduce_kernel(f, op, init, Val(items), Rreduce, Rother, R′, A; ndrange)
+    else
+        # we need multiple steps to cover all values to reduce
+        partial = similar(R, (size(R)..., reduce_groups))
+        if init === nothing
+            # without an explicit initializer we need to copy from the output container
+            partial .= R
+        end
+        reduce_kernel(f, op, init, Val(items), Rreduce, Rother, partial, A; ndrange)
+
+        GPUArrays.mapreducedim!(identity, op, R′, partial; init=init)
+    end
+
+    return R
+end
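
A small, self-contained illustration (plain Julia on the host, not part of the diff; the sizes are made up) of how `mapreducedim!` splits the iteration domain and derives its launch geometry, using the same expressions as the code above. Reducing a 4×50 matrix over its first dimension:

A = rand(Float32, 4, 50)            # only the shapes matter here
R = zeros(Float32, 1, 50)           # output for a reduction over dims=1

Rall    = CartesianIndices(axes(A))   # 4×50: every element to process
Rother  = CartesianIndices(axes(R))   # 1×50: the dimensions that are kept
Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R), Ref(Base.OneTo(1)), axes(A)))
                                      # 4×1: the dimensions that are reduced
@assert length(Rall) == length(Rother) * length(Rreduce)

# launch geometry as computed above: each kept slice gets its own group(s)
reduce_items  = 512
reduce_groups = cld(length(Rreduce), reduce_items)   # 1: a single group spans dims=1
other_groups  = length(Rother)                       # 50 independent slices
# with reduce_groups == 1 the kernel writes straight into R′ = reshape(R, (1, 50, 1));
# a reduction dimension of length 10_000 would instead give reduce_groups == 20, a
# `partial` array of size (1, 50, 20), and a second mapreducedim! pass over that extra
# trailing dimension.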
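And a rough CPU emulation of the lock-step tree reduction that `reduce_group` performs in local memory (again not part of the diff; `emulate_group_reduce` is a name invented for illustration). Each pass of the outer loop corresponds to one barrier-separated step on the device, in which item `i` folds the value `d` slots away into slot `2d*(i-1) + 1`, so after `⌈log2(items)⌉` steps the first slot holds the full result:

# hypothetical host-side sketch of the algorithm in `reduce_group`
function emulate_group_reduce(op, vals::Vector{T}, neutral::T) where {T}
    shared = copy(vals)          # stands in for the @localmem buffer
    items = length(shared)
    d = 1
    while d < items
        # one step per item; on the device these run concurrently between barriers
        for item in 1:items
            index = 2 * d * (item - 1) + 1
            if index <= items
                other = index + d <= items ? shared[index + d] : neutral
                shared[index] = op(shared[index], other)
            end
        end
        d *= 2
    end
    return shared[1]             # the first item holds the fully reduced value
end

emulate_group_reduce(+, collect(1.0:7.0), 0.0) == sum(1.0:7.0)  # true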