From fbc0ced33cb4122319f62805f9640a0aa5fbe36d Mon Sep 17 00:00:00 2001
From: brabreda
Date: Fri, 24 Feb 2023 13:15:05 +0100
Subject: [PATCH 1/5] added warp and block reduce

---
 lib/CUDAKernels/src/CUDAKernels.jl | 42 ++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index 5e49ed666..85ef781d8 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -407,4 +407,46 @@ Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental
 # Argument conversion
 KernelAbstractions.argconvert(k::Kernel{<:CUDADevice}, arg) = CUDA.cudaconvert(arg)
 
+# reduce block
+@device_override @inline function reduce(T,val)
+    shared = CUDA.CuStaticSharedArray(T, 32)
+
+    Sid, lane = fldmod1(CUDA.threadIdx().x, CUDA.warpsize())
+
+    val = reduce_warp(val)
+
+    # only the first lane of every warp (SIMD unit) may write its value to shared memory
+    if lane == 1
+        @inbounds shared[Sid] = val
+    end
+
+    CUDA.sync_threads()
+
+    # the first 32 values are loaded into val; if fewer than 32 warps fit in one block,
+    # the remainder is padded with zeros
+    val = if CUDA.threadIdx().x <= fld1(CUDA.blockDim().x, CUDA.warpsize())
+        @inbounds shared[lane]
+    else
+        0
+    end
+
+    # final reduce within first warp
+    if Sid == 1
+        val = reduce_warp(val)
+    end
+    return val
 end
+
+function reduce_warp(val)
+    offset = 0x00000001
+    while offset < CUDA.warpsize()
+
+        val += CUDA.shfl_down_sync(0xffffffff, val, offset)
+        offset <<= 1
+    end
+
+    return val
+end
+
+end
+
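The reduce_warp loop in this patch is a classic shuffle reduction: each pass combines a lane with the lane `offset` positions down, and the offset doubles every pass, so after log2(warpsize()) steps lane 1 holds the value for the whole warp. For intuition, here is a host-side Julia model of that dataflow (illustrative only, not part of the patch; `simulate_warp_reduce` is a hypothetical helper, and the array indexing stands in for CUDA.shfl_down_sync):

    # Synchronous model of the shuffle loop: every pass, lane i combines its
    # value with the value of lane i + offset, and the offset doubles.
    function simulate_warp_reduce(op, lanes::Vector)
        n = length(lanes)   # plays the role of CUDA.warpsize(), 32 on current hardware
        offset = 1
        while offset < n
            # out-of-range sources leave the lane untouched, mirroring the fact
            # that those lanes' results are never read on the device
            lanes = [i + offset <= n ? op(lanes[i], lanes[i + offset]) : lanes[i] for i in 1:n]
            offset <<= 1
        end
        return lanes[1]     # lane 1 now holds the reduction over all lanes
    end

    simulate_warp_reduce(+, collect(1:32)) == sum(1:32)   # true

The block-level `reduce` then only needs CUDA.CuStaticSharedArray(T, 32): one slot per warp, which is always enough because a CUDA block holds at most 1024 threads, i.e. 32 warps of 32 lanes.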
From a34ac1ad371f26984f08eb4e9a6714c5e92c158e Mon Sep 17 00:00:00 2001
From: brabreda
Date: Tue, 7 Mar 2023 00:36:11 +0100
Subject: [PATCH 2/5] added op to groupreduce

---
 lib/CUDAKernels/src/CUDAKernels.jl | 16 +++++++++-------
 src/KernelAbstractions.jl          | 18 ++++++++++++++++++
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index 85ef781d8..6074f699e 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -323,6 +323,8 @@ import CUDA: @device_override
 import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices
 import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
 import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds
+import KernelAbstractions: __reduce
+
 
 function mkcontext(kernel::Kernel{<:CUDADevice}, _ndrange, iterspace)
     CompilerMetadata{KernelAbstractions.ndrange(kernel), DynamicCheck}(_ndrange, iterspace)
@@ -407,42 +409,42 @@ Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental
 # Argument conversion
 KernelAbstractions.argconvert(k::Kernel{<:CUDADevice}, arg) = CUDA.cudaconvert(arg)
 
-# reduce block
-@device_override @inline function reduce(T,val)
+
+# group reduce that uses warp level reduction
+@device_override @inline function __reduce(op, val, ::Type{T}) where T
     shared = CUDA.CuStaticSharedArray(T, 32)
 
     Sid, lane = fldmod1(CUDA.threadIdx().x, CUDA.warpsize())
 
-    val = reduce_warp(val)
+    val = reduce_warp(op, val)
 
     # only the first lane of every warp (SIMD unit) may write its value to shared memory
     if lane == 1
         @inbounds shared[Sid] = val
     end
 
     CUDA.sync_threads()
 
     # the first 32 values are loaded into val; if fewer than 32 warps fit in one block,
     # the remainder is padded with zeros
     val = if CUDA.threadIdx().x <= fld1(CUDA.blockDim().x, CUDA.warpsize())
         @inbounds shared[lane]
     else
         0
     end
 
     # final reduce within first warp
     if Sid == 1
-        val = reduce_warp(val)
+        val = reduce_warp(op, val)
     end
     return val
 end
 
-function reduce_warp(val)
+@inline function reduce_warp(op, val)
     offset = 0x00000001
     while offset < CUDA.warpsize()
-
-        val += CUDA.shfl_down_sync(0xffffffff, val, offset)
+        val = op(val, CUDA.shfl_down_sync(0xffffffff, val, offset))
         offset <<= 1
     end
 
     return val
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 30d1cd75f..c5f9f104d 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -4,6 +4,7 @@ export @kernel
 export @Const, @localmem, @private, @uniform, @synchronize
 export @index, @groupsize, @ndrange
 export @print
+export @reduce
 export Device, GPU, CPU, Event, MultiEvent, NoneEvent
 export async_copy!
 
@@ -329,6 +330,14 @@ macro index(locale, args...)
     Expr(:call, GlobalRef(KernelAbstractions, index_function), esc(:__ctx__), map(esc, args)...)
 end
 
+# TODO: Where should we handle the logic of the neutral element? Folding it into the macro
+# would make the macro easier to use, but may also introduce some overhead.
+macro reduce(op, val)
+    quote
+        $__reduce($(esc(op)), $(esc(val)), typeof($(esc(val))))
+    end
+end
+
 ###
 # Internal kernel functions
 ###
@@ -493,6 +502,7 @@ function __synchronize()
     error("@synchronize used outside kernel or not captured")
 end
 
+
 @generated function __print(items...)
     str = ""
     args = []
@@ -515,6 +525,14 @@ end
 __size(args::Tuple) = Tuple{args...}
 __size(i::Int) = Tuple{i}
 
+
+# reduction
+function __reduce(op, val, ::Type{T}) where T
+    error("@reduce used outside kernel or not captured")
+end
+
+
+
 ###
 # Extras
 # - LoopInfo
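With @reduce exported, a kernel can now fold a value across its workgroup in a single line. A sketch of the intended use at this stage of the series (illustrative; the kernel, the host setup, and the name `groupsum_kernel!` are not part of the patch):

    using CUDA, CUDAKernels, KernelAbstractions

    @kernel function groupsum_kernel!(out, @Const(a))
        i = @index(Global)
        val = @reduce(+, a[i])
        # after __reduce only the first warp is guaranteed to hold the
        # result, so let thread 1 of the group publish it
        if @index(Local) == 1
            out[@index(Group)] = val
        end
    end

    a = CUDA.ones(Float32, 1024)
    out = CUDA.zeros(Float32, 1)
    event = groupsum_kernel!(CUDADevice(), 1024)(out, a, ndrange=length(a))
    wait(event)   # out[1] == 1024

Note that the device implementation still pads missing warps with a literal 0, which silently assumes zero is the neutral element of op; the TODO added in this patch, and the explicit neutral argument introduced in the next one, address exactly that.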
From 66163f459054b689821e57b3696916cf5eaf27fe Mon Sep 17 00:00:00 2001
From: brabreda
Date: Wed, 5 Apr 2023 20:12:09 +0200
Subject: [PATCH 3/5] add reduction macro

---
 lib/CUDAKernels/src/CUDAKernels.jl | 61 ++++++++++++++----------------
 src/KernelAbstractions.jl          |  6 ++----
 2 files changed, 30 insertions(+), 37 deletions(-)

diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index 6074f699e..cc4365762 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -409,44 +409,39 @@ Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental
 # Argument conversion
 KernelAbstractions.argconvert(k::Kernel{<:CUDADevice}, arg) = CUDA.cudaconvert(arg)
 
+# TODO: make variable block size possible
+# TODO: figure out where to place this
 
 # group reduce that uses warp level reduction
-@device_override @inline function __reduce(op, val, ::Type{T}) where T
-    shared = CUDA.CuStaticSharedArray(T, 32)
-
-    Sid, lane = fldmod1(CUDA.threadIdx().x, CUDA.warpsize())
-
-    val = reduce_warp(op, val)
-
-    # only the first lane of every warp (SIMD unit) may write its value to shared memory
-    if lane == 1
-        @inbounds shared[Sid] = val
-    end
-
-    CUDA.sync_threads()
-
-    # the first 32 values are loaded into val; if fewer than 32 warps fit in one block,
-    # the remainder is padded with zeros
-    val = if CUDA.threadIdx().x <= fld1(CUDA.blockDim().x, CUDA.warpsize())
-        @inbounds shared[lane]
-    else
-        0
+@device_override @inline function __reduce(__ctx__, op, val, neutral, ::Type{T}) where T
+    threads = KernelAbstractions.@groupsize()[1]
+    threadIdx = KernelAbstractions.@index(Local)
+
+    # shared mem for a complete reduction
+    shared = KernelAbstractions.@localmem(T, 1024)
+    @inbounds shared[threadIdx] = val
+
+    # perform the reduction
+    d = 1
+    while d < threads
+        KernelAbstractions.@synchronize()
+        index = 2 * d * (threadIdx - 1) + 1
+        @inbounds if index <= threads
+            other_val = if index + d <= threads
+                shared[index + d]
+            else
+                neutral
+            end
+            shared[index] = op(shared[index], other_val)
+        end
+        d *= 2
     end
 
-    # final reduce within first warp
-    if Sid == 1
-        val = reduce_warp(op, val)
+    # load the final value on the first thread
+    if threadIdx == 1
+        val = @inbounds shared[threadIdx]
     end
-    return val
-end
 
-@inline function reduce_warp(op, val)
-    offset = 0x00000001
-    while offset < CUDA.warpsize()
-        val = op(val, CUDA.shfl_down_sync(0xffffffff, val, offset))
-        offset <<= 1
-    end
-
+    # note: only the first thread of the group is guaranteed to return the fully reduced value
     return val
 end
 
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index c5f9f104d..2b840acf1 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -332,9 +332,9 @@ end
 
 # TODO: Where should we handle the logic of the neutral element? Folding it into the macro
 # would make the macro easier to use, but may also introduce some overhead.
-macro reduce(op, val)
+macro reduce(op, val, neutral)
     quote
-        $__reduce($(esc(op)), $(esc(val)), typeof($(esc(val))))
+        $__reduce($(esc(:__ctx__)), $(esc(op)), $(esc(val)), $(esc(neutral)), typeof($(esc(val))))
     end
 end
 
@@ -531,8 +531,6 @@ function __reduce(op, val, ::Type{T}) where T
     error("@reduce used outside kernel or not captured")
 end
 
-
-
 ###
 # Extras
 # - LoopInfo

From ab3d6d10ef62733a3813bd0cc5cb1a9f84a6a5fd Mon Sep 17 00:00:00 2001
From: brabreda
Date: Wed, 5 Apr 2023 20:15:38 +0200
Subject: [PATCH 4/5] add reduction macro

---
 lib/CUDAKernels/src/CUDAKernels.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index cc4365762..f78bbec8f 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -411,8 +411,7 @@ KernelAbstractions.argconvert(k::Kernel{<:CUDADevice}, arg) = CUDA.cudaconvert(a
 # TODO: make variable block size possible
 # TODO: figure out where to place this
-
-# group reduce that uses warp level reduction
+# reduction functionality for a group
 @device_override @inline function __reduce(__ctx__, op, val, neutral, ::Type{T}) where T
     threads = KernelAbstractions.@groupsize()[1]
     threadIdx = KernelAbstractions.@index(Local)
 
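Patches 3 and 4 swap the CUDA-specific warp shuffles for a portable shared-memory tree: every thread deposits its value in @localmem, and each pass between two @synchronize points combines values d slots apart at strides of 2d, so the group's result accumulates at index 1. A host-side model of one group's schedule (illustrative only; `simulate_group_reduce` is a hypothetical helper, not part of the patches):

    # Sequential model of the tree reduction: the inner for-loop plays the
    # role of all threads executing one pass between two @synchronize calls.
    function simulate_group_reduce(op, vals::Vector, neutral)
        shared = copy(vals)            # stands in for the @localmem buffer
        threads = length(shared)
        d = 1
        while d < threads
            for t in 1:threads         # one pass, all "threads" at once
                index = 2 * d * (t - 1) + 1
                if index <= threads
                    # partner sits d slots away; pad with neutral past the end
                    other = index + d <= threads ? shared[index + d] : neutral
                    shared[index] = op(shared[index], other)
                end
            end
            d *= 2
        end
        return shared[1]
    end

    simulate_group_reduce(+, collect(1.0:5.0), 0.0)   # 15.0

Modeling a pass as a plain loop is faithful because within one pass the written slots (index) and the read slots (index + d) never overlap; the @synchronize at the top of the device loop is what separates the passes. Unlike the warp version, nothing here depends on warpsize(), and the explicit neutral argument replaces the hard-coded zero padding.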
From db024ed7790eb7e521c971f55bcbd7c57045d061 Mon Sep 17 00:00:00 2001
From: brabreda
Date: Wed, 12 Apr 2023 01:01:53 +0200
Subject: [PATCH 5/5] added reduce file

---
 lib/CUDAKernels/src/CUDAKernels.jl | 44 +++++--------------------
 src/KernelAbstractions.jl          | 17 ++++++++++++++---
 src/reduce.jl                      | 52 ++++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+), 39 deletions(-)
 create mode 100644 src/reduce.jl

diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index f78bbec8f..d193c002a 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -323,7 +323,6 @@ import CUDA: @device_override
 import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices
 import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
 import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds
-import KernelAbstractions: __reduce
 
 
 function mkcontext(kernel::Kernel{<:CUDADevice}, _ndrange, iterspace)
@@ -400,6 +399,14 @@ end
     CUDA._cuprint(args...)
 end
 
+import KernelAbstractions: __test
+
+@device_override @inline function __test(__ctx__, conf)
+    KernelAbstractions.@localmem Float64 conf.threads_per_block
+
+    KernelAbstractions.@print("this works")
+end
+
 ###
 # GPU implementation of const memory
 ###
@@ -409,40 +416,5 @@ Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental
 # Argument conversion
 KernelAbstractions.argconvert(k::Kernel{<:CUDADevice}, arg) = CUDA.cudaconvert(arg)
 
-# TODO: make variable block size possible
-# TODO: figure out where to place this
-# reduction functionality for a group
-@device_override @inline function __reduce(__ctx__, op, val, neutral, ::Type{T}) where T
-    threads = KernelAbstractions.@groupsize()[1]
-    threadIdx = KernelAbstractions.@index(Local)
-
-    # shared mem for a complete reduction
-    shared = KernelAbstractions.@localmem(T, 1024)
-    @inbounds shared[threadIdx] = val
-
-    # perform the reduction
-    d = 1
-    while d < threads
-        KernelAbstractions.@synchronize()
-        index = 2 * d * (threadIdx - 1) + 1
-        @inbounds if index <= threads
-            other_val = if index + d <= threads
-                shared[index + d]
-            else
-                neutral
-            end
-            shared[index] = op(shared[index], other_val)
-        end
-        d *= 2
-    end
-
-    # load the final value on the first thread
-    if threadIdx == 1
-        val = @inbounds shared[threadIdx]
-    end
-
-    # note: only the first thread of the group is guaranteed to return the fully reduced value
-    return val
-end
 
 end
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 2b840acf1..7db68c362 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -330,14 +330,18 @@ macro index(locale, args...)
     Expr(:call, GlobalRef(KernelAbstractions, index_function), esc(:__ctx__), map(esc, args)...)
 end
 
-# TODO: Where should we handle the logic of the neutral element? Folding it into the macro
-# would make the macro easier to use, but may also introduce some overhead.
 macro reduce(op, val, neutral)
     quote
         $__reduce($(esc(:__ctx__)), $(esc(op)), $(esc(val)), $(esc(neutral)), typeof($(esc(val))))
     end
 end
 
+macro test(conf)
+    quote
+        $__test($(esc(:__ctx__)), $(esc(conf)))
+    end
+end
+
 ###
 # Internal kernel functions
 ###
@@ -527,19 +531,26 @@ end
 __size(args::Tuple) = Tuple{args...}
 __size(i::Int) = Tuple{i}
 
 
 # reduction
-function __reduce(op, val, ::Type{T}) where T
+function __reduce(__ctx__, op, val, ::Type{T}) where T
     error("@reduce used outside kernel or not captured")
 end
 
+function __test(__ctx__, conf)
+    error("@test used outside kernel or not captured")
+end
+
 ###
 # Extras
 # - LoopInfo
 ###
+
 include("extras/extras.jl")
 
 include("reflection.jl")
 
+include("reduce.jl")
+
 # CPU backend
 include("cpu.jl")
diff --git a/src/reduce.jl b/src/reduce.jl
new file mode 100644
index 000000000..71b19a4a2
--- /dev/null
+++ b/src/reduce.jl
@@ -0,0 +1,52 @@
+struct Config{
+    THREADS_PER_WARP,   # size of a warp
+    THREADS_PER_BLOCK   # size of a block
+}
+end
+
+@inline function Base.getproperty(conf::Type{Config{THREADS_PER_WARP, THREADS_PER_BLOCK}}, sym::Symbol) where {THREADS_PER_WARP, THREADS_PER_BLOCK}
+    if sym == :threads_per_warp
+        THREADS_PER_WARP
+    elseif sym == :threads_per_block
+        THREADS_PER_BLOCK
+    else
+        # fall back to the type's own fields for anything else
+        getfield(conf, sym)
+    end
+end
+
+# TODO: make variable block size possible
+# TODO: figure out where to place this
+# reduction functionality for a group
+@inline function __reduce(__ctx__, op, val, neutral, ::Type{T}) where {T}
+    threads = KernelAbstractions.@groupsize()[1]
+    threadIdx = KernelAbstractions.@index(Local)
+
+    # shared mem for a complete reduction
+    shared = KernelAbstractions.@localmem(T, 1024)
+    @inbounds shared[threadIdx] = val
+
+    # perform the reduction
+    d = 1
+    while d < threads
+        KernelAbstractions.@synchronize()
+        index = 2 * d * (threadIdx - 1) + 1
+        @inbounds if index <= threads
+            other_val = if index + d <= threads
+                shared[index + d]
+            else
+                neutral
+            end
+            shared[index] = op(shared[index], other_val)
+        end
+        d *= 2
+    end
+
+    # load the final value on the first thread
+    if threadIdx == 1
+        val = @inbounds shared[threadIdx]
+    end
+
+    # note: only the first thread of the group is guaranteed to return the fully reduced value
+    return val
+end
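Patch 5 moves __reduce into src/reduce.jl, written purely against KernelAbstractions macros so that one implementation can serve every backend; the CUDA-specific override is deleted. The new Config type carries both sizes as type parameters, so a "configuration" is the type object itself and the Base.getproperty overload resolves the property reads at compile time, which is what lets @localmem in __test size shared memory from conf.threads_per_block. A usage sketch (illustrative; Config is included into the KernelAbstractions module but not exported):

    using KernelAbstractions

    # the type object itself is the configuration; no instance is created
    conf = KernelAbstractions.Config{32, 1024}   # warp size, block size

    conf.threads_per_warp    # 32, served by the getproperty overload
    conf.threads_per_block   # 1024, a compile-time constant inside kernels

Passing such a conf into a kernel and writing @test(conf) forwards it to the __test stub overridden by CUDAKernels, which is presumably the staging ground for making the fixed 1024-element @localmem buffer in __reduce configurable (the remaining "make variable block size possible" TODO).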