From fbc0ced33cb4122319f62805f9640a0aa5fbe36d Mon Sep 17 00:00:00 2001
From: brabreda
Date: Fri, 24 Feb 2023 13:15:05 +0100
Subject: [PATCH 1/5] added warp and block reduce

---
 lib/CUDAKernels/src/CUDAKernels.jl | 42 ++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index 5e49ed666..85ef781d8 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -407,4 +407,46 @@ Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental
 # Argument conversion
 KernelAbstractions.argconvert(k::Kernel{<:CUDADevice}, arg) = CUDA.cudaconvert(arg)
 
+# reduce block
+@device_override @inline function reduce(T,val)
+    shared = CUDA.CuStaticSharedArray(T, 32)
+
+    Sid, lane = fldmod1(CUDA.threadIdx().x, CUDA.warpsize())
+
+    val = reduce_warp(val)
+
+    # only the first lane of every warp (SIMD unit) may write its value to shared memory
+    if lane == 1
+        @inbounds shared[Sid] = val
+    end
+
+    CUDA.sync_threads()
+
+    # the first 32 values are loaded into val; if fewer than 32 warps fit in one block,
+    # the remainder is padded with zeros
+    val = if CUDA.threadIdx().x <= fld1(CUDA.blockDim().x, CUDA.warpsize())
+        @inbounds shared[lane]
+    else
+        0
+    end
+
+    # final reduce within first warp
+    if Sid == 1
+        val = reduce_warp(val)
+    end
+    return val
 end
+
+function reduce_warp(val)
+    offset = 0x00000001
+    while offset < CUDA.warpsize()
+
+        val += CUDA.shfl_down_sync(0xffffffff, val, offset)
+        offset <<= 1
+    end
+
+    return val
+end
+
+end
+
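The reduce_warp loop in this patch is a classic shuffle reduction: each pass combines a lane with the lane `offset` positions down, and the offset doubles every pass, so after log2(warpsize()) steps lane 1 holds the value for the whole warp. For intuition, here is a host-side Julia model of that dataflow (illustrative only, not part of the patch; `simulate_warp_reduce` is a hypothetical helper, and the array indexing stands in for CUDA.shfl_down_sync):

    # Synchronous model of the shuffle loop: every pass, lane i combines its
    # value with the value of lane i + offset, and the offset doubles.
    function simulate_warp_reduce(op, lanes::Vector)
        n = length(lanes)   # plays the role of CUDA.warpsize(), 32 on current hardware
        offset = 1
        while offset < n
            # out-of-range sources leave the lane untouched, mirroring the fact
            # that those lanes' results are never read on the device
            lanes = [i + offset <= n ? op(lanes[i], lanes[i + offset]) : lanes[i] for i in 1:n]
            offset <<= 1
        end
        return lanes[1]     # lane 1 now holds the reduction over all lanes
    end

    simulate_warp_reduce(+, collect(1:32)) == sum(1:32)   # true

The block-level `reduce` then only needs CUDA.CuStaticSharedArray(T, 32): one slot per warp, which is always enough because a CUDA block holds at most 1024 threads, i.e. 32 warps of 32 lanes.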
From a34ac1ad371f26984f08eb4e9a6714c5e92c158e Mon Sep 17 00:00:00 2001
From: brabreda
Date: Tue, 7 Mar 2023 00:36:11 +0100
Subject: [PATCH 2/5] added op to groupreduce

---
 lib/CUDAKernels/src/CUDAKernels.jl | 16 +++++++++-------
 src/KernelAbstractions.jl          | 18 ++++++++++++++++++
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index 85ef781d8..6074f699e 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -323,6 +323,8 @@ import CUDA: @device_override
 import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices
 import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
 import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds
+import KernelAbstractions: __reduce
+
 
 function mkcontext(kernel::Kernel{<:CUDADevice}, _ndrange, iterspace)
     CompilerMetadata{KernelAbstractions.ndrange(kernel), DynamicCheck}(_ndrange, iterspace)
@@ -407,42 +409,42 @@ Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental
 # Argument conversion
 KernelAbstractions.argconvert(k::Kernel{<:CUDADevice}, arg) = CUDA.cudaconvert(arg)
 
-# reduce block
-@device_override @inline function reduce(T,val)
+
+# group reduce that uses warp level reduction
+@device_override @inline function __reduce(op, val, ::Type{T}) where T
     shared = CUDA.CuStaticSharedArray(T, 32)
 
     Sid, lane = fldmod1(CUDA.threadIdx().x, CUDA.warpsize())
 
-    val = reduce_warp(val)
+    val = reduce_warp(op, val)
 
     # only the first lane of every warp (SIMD unit) may write its value to shared memory
     if lane == 1
         @inbounds shared[Sid] = val
     end
 
     CUDA.sync_threads()
 
     # the first 32 values are loaded into val; if fewer than 32 warps fit in one block,
     # the remainder is padded with zeros
     val = if CUDA.threadIdx().x <= fld1(CUDA.blockDim().x, CUDA.warpsize())
         @inbounds shared[lane]
     else
         0
     end
 
     # final reduce within first warp
     if Sid == 1
-        val = reduce_warp(val)
+        val = reduce_warp(op, val)
     end
     return val
 end
 
-function reduce_warp(val)
+@inline function reduce_warp(op, val)
     offset = 0x00000001
     while offset < CUDA.warpsize()
-
-        val += CUDA.shfl_down_sync(0xffffffff, val, offset)
+        val = op(val, CUDA.shfl_down_sync(0xffffffff, val, offset))
         offset <<= 1
     end
 
     return val
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 30d1cd75f..c5f9f104d 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -4,6 +4,7 @@ export @kernel
 export @Const, @localmem, @private, @uniform, @synchronize
 export @index, @groupsize, @ndrange
 export @print
+export @reduce
 export Device, GPU, CPU, Event, MultiEvent, NoneEvent
 export async_copy!
 
@@ -329,6 +330,14 @@ macro index(locale, args...)
     Expr(:call, GlobalRef(KernelAbstractions, index_function), esc(:__ctx__), map(esc, args)...)
 end
 
+# TODO: Where should we handle the logic of the neutral element? Folding it into the macro
+# would make the macro easier to use, but may also introduce some overhead.
+macro reduce(op, val)
+    quote
+        $__reduce($(esc(op)), $(esc(val)), typeof($(esc(val))))
+    end
+end
+
 ###
 # Internal kernel functions
 ###
@@ -493,6 +502,7 @@ function __synchronize()
     error("@synchronize used outside kernel or not captured")
 end
 
+
 @generated function __print(items...)
     str = ""
     args = []
@@ -515,6 +525,14 @@ end
 __size(args::Tuple) = Tuple{args...}
 __size(i::Int) = Tuple{i}
 
+
+# reduction
+function __reduce(op, val, ::Type{T}) where T
+    error("@reduce used outside kernel or not captured")
+end
+
+
+
 ###
 # Extras
 # - LoopInfo
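With @reduce exported, a kernel can now fold a value across its workgroup in a single line. A sketch of the intended use at this stage of the series (illustrative; the kernel, the host setup, and the name `groupsum_kernel!` are not part of the patch):

    using CUDA, CUDAKernels, KernelAbstractions

    @kernel function groupsum_kernel!(out, @Const(a))
        i = @index(Global)
        val = @reduce(+, a[i])
        # after __reduce only the first warp is guaranteed to hold the
        # result, so let thread 1 of the group publish it
        if @index(Local) == 1
            out[@index(Group)] = val
        end
    end

    a = CUDA.ones(Float32, 1024)
    out = CUDA.zeros(Float32, 1)
    event = groupsum_kernel!(CUDADevice(), 1024)(out, a, ndrange=length(a))
    wait(event)   # out[1] == 1024

Note that the device implementation still pads missing warps with a literal 0, which silently assumes zero is the neutral element of op; the TODO added in this patch, and the explicit neutral argument introduced in the next one, address exactly that.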
From 66163f459054b689821e57b3696916cf5eaf27fe Mon Sep 17 00:00:00 2001
From: brabreda
Date: Wed, 5 Apr 2023 20:12:09 +0200
Subject: [PATCH 3/5] add reduction macro

---
 lib/CUDAKernels/src/CUDAKernels.jl | 61 ++++++++++++++----------------
 src/KernelAbstractions.jl          |  6 ++----
 2 files changed, 30 insertions(+), 37 deletions(-)

diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index 6074f699e..cc4365762 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -409,44 +409,39 @@ Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental
 # Argument conversion
 KernelAbstractions.argconvert(k::Kernel{<:CUDADevice}, arg) = CUDA.cudaconvert(arg)
 
+# TODO: make variable block size possible
+# TODO: figure out where to place this
 
 # group reduce that uses warp level reduction
-@device_override @inline function __reduce(op, val, ::Type{T}) where T
-    shared = CUDA.CuStaticSharedArray(T, 32)
-
-    Sid, lane = fldmod1(CUDA.threadIdx().x, CUDA.warpsize())
-
-    val = reduce_warp(op, val)
-
-    # only the first lane of every warp (SIMD unit) may write its value to shared memory
-    if lane == 1
-        @inbounds shared[Sid] = val
-    end
-
-    CUDA.sync_threads()
-
-    # the first 32 values are loaded into val; if fewer than 32 warps fit in one block,
-    # the remainder is padded with zeros
-    val = if CUDA.threadIdx().x <= fld1(CUDA.blockDim().x, CUDA.warpsize())
-        @inbounds shared[lane]
-    else
-        0
+@device_override @inline function __reduce(__ctx__, op, val, neutral, ::Type{T}) where T
+    threads = KernelAbstractions.@groupsize()[1]
+    threadIdx = KernelAbstractions.@index(Local)
+
+    # shared mem for a complete reduction
+    shared = KernelAbstractions.@localmem(T, 1024)
+    @inbounds shared[threadIdx] = val
+
+    # perform the reduction
+    d = 1
+    while d < threads
+        KernelAbstractions.@synchronize()
+        index = 2 * d * (threadIdx - 1) + 1
+        @inbounds if index <= threads
+            other_val = if index + d <= threads
+                shared[index + d]
+            else
+                neutral
+            end
+            shared[index] = op(shared[index], other_val)
+        end
+        d *= 2
     end
 
-    # final reduce within first warp
-    if Sid == 1
-        val = reduce_warp(op, val)
+    # load the final value on the first thread
+    if threadIdx == 1
+        val = @inbounds shared[threadIdx]
     end
-    return val
-end
 
-@inline function reduce_warp(op, val)
-    offset = 0x00000001
-    while offset < CUDA.warpsize()
-        val = op(val, CUDA.shfl_down_sync(0xffffffff, val, offset))
-        offset <<= 1
-    end
-
+    # note: only the first thread of the group is guaranteed to return the fully reduced value
     return val
 end
 
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index c5f9f104d..2b840acf1 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -332,9 +332,9 @@ end
 
 # TODO: Where should we handle the logic of the neutral element? Folding it into the macro
 # would make the macro easier to use, but may also introduce some overhead.
-macro reduce(op, val)
+macro reduce(op, val, neutral)
     quote
-        $__reduce($(esc(op)), $(esc(val)), typeof($(esc(val))))
+        $__reduce($(esc(:__ctx__)), $(esc(op)), $(esc(val)), $(esc(neutral)), typeof($(esc(val))))
     end
 end
 
@@ -531,8 +531,6 @@ function __reduce(op, val, ::Type{T}) where T
     error("@reduce used outside kernel or not captured")
 end
 
-
-
 ###
 # Extras
 # - LoopInfo

From ab3d6d10ef62733a3813bd0cc5cb1a9f84a6a5fd Mon Sep 17 00:00:00 2001
From: brabreda
Date: Wed, 5 Apr 2023 20:15:38 +0200
Subject: [PATCH 4/5] add reduction macro

---
 lib/CUDAKernels/src/CUDAKernels.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index cc4365762..f78bbec8f 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -411,8 +411,7 @@ KernelAbstractions.argconvert(k::Kernel{<:CUDADevice}, arg) = CUDA.cudaconvert(a
 # TODO: make variable block size possible
 # TODO: figure out where to place this
-
-# group reduce that uses warp level reduction
+# reduction functionality for a group
 @device_override @inline function __reduce(__ctx__, op, val, neutral, ::Type{T}) where T
     threads = KernelAbstractions.@groupsize()[1]
     threadIdx = KernelAbstractions.@index(Local)
 
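Patches 3 and 4 swap the CUDA-specific warp shuffles for a portable shared-memory tree: every thread deposits its value in @localmem, and each pass between two @synchronize points combines values d slots apart at strides of 2d, so the group's result accumulates at index 1. A host-side model of one group's schedule (illustrative only; `simulate_group_reduce` is a hypothetical helper, not part of the patches):

    # Sequential model of the tree reduction: the inner for-loop plays the
    # role of all threads executing one pass between two @synchronize calls.
    function simulate_group_reduce(op, vals::Vector, neutral)
        shared = copy(vals)            # stands in for the @localmem buffer
        threads = length(shared)
        d = 1
        while d < threads
            for t in 1:threads         # one pass, all "threads" at once
                index = 2 * d * (t - 1) + 1
                if index <= threads
                    # partner sits d slots away; pad with neutral past the end
                    other = index + d <= threads ? shared[index + d] : neutral
                    shared[index] = op(shared[index], other)
                end
            end
            d *= 2
        end
        return shared[1]
    end

    simulate_group_reduce(+, collect(1.0:5.0), 0.0)   # 15.0

Modeling a pass as a plain loop is faithful because within one pass the written slots (index) and the read slots (index + d) never overlap; the @synchronize at the top of the device loop is what separates the passes. Unlike the warp version, nothing here depends on warpsize(), and the explicit neutral argument replaces the hard-coded zero padding.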
From db024ed7790eb7e521c971f55bcbd7c57045d061 Mon Sep 17 00:00:00 2001
From: brabreda
Date: Wed, 12 Apr 2023 01:01:53 +0200
Subject: [PATCH 5/5] added reduce file

---
 lib/CUDAKernels/src/CUDAKernels.jl | 44 +++++--------------------
 src/KernelAbstractions.jl          | 17 ++++++++++++++---
 src/reduce.jl                      | 52 ++++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+), 39 deletions(-)
 create mode 100644 src/reduce.jl

diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index f78bbec8f..d193c002a 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -323,7 +323,6 @@ import CUDA: @device_override
 import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices
 import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
 import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds
-import KernelAbstractions: __reduce
 
 
 function mkcontext(kernel::Kernel{<:CUDADevice}, _ndrange, iterspace)
@@ -400,6 +399,14 @@ end
     CUDA._cuprint(args...)
 end
 
+import KernelAbstractions: __test
+
+@device_override @inline function __test(__ctx__, conf)
+    KernelAbstractions.@localmem Float64 conf.threads_per_block
+
+    KernelAbstractions.@print("this works")
+end
+
 ###
 # GPU implementation of const memory
 ###
@@ -409,40 +416,5 @@ Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental
 # Argument conversion
 KernelAbstractions.argconvert(k::Kernel{<:CUDADevice}, arg) = CUDA.cudaconvert(arg)
 
-# TODO: make variable block size possible
-# TODO: figure out where to place this
-# reduction functionality for a group
-@device_override @inline function __reduce(__ctx__, op, val, neutral, ::Type{T}) where T
-    threads = KernelAbstractions.@groupsize()[1]
-    threadIdx = KernelAbstractions.@index(Local)
-
-    # shared mem for a complete reduction
-    shared = KernelAbstractions.@localmem(T, 1024)
-    @inbounds shared[threadIdx] = val
-
-    # perform the reduction
-    d = 1
-    while d < threads
-        KernelAbstractions.@synchronize()
-        index = 2 * d * (threadIdx - 1) + 1
-        @inbounds if index <= threads
-            other_val = if index + d <= threads
-                shared[index + d]
-            else
-                neutral
-            end
-            shared[index] = op(shared[index], other_val)
-        end
-        d *= 2
-    end
-
-    # load the final value on the first thread
-    if threadIdx == 1
-        val = @inbounds shared[threadIdx]
-    end
-
-    # note: only the first thread of the group is guaranteed to return the fully reduced value
-    return val
-end
 
 end
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 2b840acf1..7db68c362 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -330,14 +330,18 @@ macro index(locale, args...)
     Expr(:call, GlobalRef(KernelAbstractions, index_function), esc(:__ctx__), map(esc, args)...)
 end
 
-# TODO: Where should we handle the logic of the neutral element? Folding it into the macro
-# would make the macro easier to use, but may also introduce some overhead.
 macro reduce(op, val, neutral)
     quote
         $__reduce($(esc(:__ctx__)), $(esc(op)), $(esc(val)), $(esc(neutral)), typeof($(esc(val))))
     end
 end
 
+macro test(conf)
+    quote
+        $__test($(esc(:__ctx__)), $(esc(conf)))
+    end
+end
+
 ###
 # Internal kernel functions
 ###
@@ -527,19 +531,26 @@ end
 __size(args::Tuple) = Tuple{args...}
 __size(i::Int) = Tuple{i}
 
 
 # reduction
-function __reduce(op, val, ::Type{T}) where T
+function __reduce(__ctx__, op, val, ::Type{T}) where T
     error("@reduce used outside kernel or not captured")
 end
 
+function __test(__ctx__, conf)
+    error("@test used outside kernel or not captured")
+end
+
 ###
 # Extras
 # - LoopInfo
 ###
+
 include("extras/extras.jl")
 
 include("reflection.jl")
 
+include("reduce.jl")
+
 # CPU backend
 include("cpu.jl")
diff --git a/src/reduce.jl b/src/reduce.jl
new file mode 100644
index 000000000..71b19a4a2
--- /dev/null
+++ b/src/reduce.jl
@@ -0,0 +1,52 @@
+struct Config{
+    THREADS_PER_WARP,   # size of a warp
+    THREADS_PER_BLOCK   # size of a block
+}
+end
+
+@inline function Base.getproperty(conf::Type{Config{THREADS_PER_WARP, THREADS_PER_BLOCK}}, sym::Symbol) where {THREADS_PER_WARP, THREADS_PER_BLOCK}
+    if sym == :threads_per_warp
+        THREADS_PER_WARP
+    elseif sym == :threads_per_block
+        THREADS_PER_BLOCK
+    else
+        # fall back to the type's own fields for anything else
+        getfield(conf, sym)
+    end
+end
+
+# TODO: make variable block size possible
+# TODO: figure out where to place this
+# reduction functionality for a group
+@inline function __reduce(__ctx__, op, val, neutral, ::Type{T}) where {T}
+    threads = KernelAbstractions.@groupsize()[1]
+    threadIdx = KernelAbstractions.@index(Local)
+
+    # shared mem for a complete reduction
+    shared = KernelAbstractions.@localmem(T, 1024)
+    @inbounds shared[threadIdx] = val
+
+    # perform the reduction
+    d = 1
+    while d < threads
+        KernelAbstractions.@synchronize()
+        index = 2 * d * (threadIdx - 1) + 1
+        @inbounds if index <= threads
+            other_val = if index + d <= threads
+                shared[index + d]
+            else
+                neutral
+            end
+            shared[index] = op(shared[index], other_val)
+        end
+        d *= 2
+    end
+
+    # load the final value on the first thread
+    if threadIdx == 1
+        val = @inbounds shared[threadIdx]
+    end
+
+    # note: only the first thread of the group is guaranteed to return the fully reduced value
+    return val
+end
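Patch 5 moves __reduce into src/reduce.jl, written purely against KernelAbstractions macros so that one implementation can serve every backend; the CUDA-specific override is deleted. The new Config type carries both sizes as type parameters, so a "configuration" is the type object itself and the Base.getproperty overload resolves the property reads at compile time, which is what lets @localmem in __test size shared memory from conf.threads_per_block. A usage sketch (illustrative; Config is included into the KernelAbstractions module but not exported):

    using KernelAbstractions

    # the type object itself is the configuration; no instance is created
    conf = KernelAbstractions.Config{32, 1024}   # warp size, block size

    conf.threads_per_warp    # 32, served by the getproperty overload
    conf.threads_per_block   # 1024, a compile-time constant inside kernels

Passing such a conf into a kernel and writing @test(conf) forwards it to the __test stub overridden by CUDAKernels, which is presumably the staging ground for making the fixed 1024-element @localmem buffer in __reduce configurable (the remaining "make variable block size possible" TODO).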