Commit

prototyping
jakubMitura14 committed Sep 26, 2021
1 parent a50e48d commit a52ab90
Showing 10 changed files with 707 additions and 87 deletions.
1 change: 1 addition & 0 deletions .vscode/launch.json
@@ -4,6 +4,7 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [

{
"type": "julia",
"request": "launch",
4 changes: 2 additions & 2 deletions Manifest.toml
@@ -440,9 +440,9 @@ version = "2.1.2"

[[MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7"
git-tree-sha1 = "5a5bc6bf062f0f95e62d0fe0a2d99699fed82dd9"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.7"
version = "0.5.8"

[[Markdown]]
deps = ["Base64"]
1 change: 1 addition & 0 deletions Project.toml
@@ -11,6 +11,7 @@ CxxWrap = "1f15a43c-97ca-5a2a-ae31-89f07a497df4"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
InfoZIP = "f4508453-b816-52ab-a864-26fc7f6211fc"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
MedEye3d = "48a1af7b-3279-4eeb-8f2b-7ca229bb51b1"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
131 changes: 70 additions & 61 deletions src/distanceMetrics/Housdorff/mainHouseDorffKernel/ProcessMainData.jl
@@ -5,8 +5,8 @@ Loads and does the main processing of data in arrays of interest (padding of shmem
"""
module ProcessMainData

export executeDataIterFirstPass,executeDataIterOtherPasses
using StaticArrays, Main.CUDAGpuUtils, Main.HFUtils, CUDA
export executeDataIterFirstPass,executeDataIterOtherPasses,processMaskData


"""
@@ -22,27 +22,41 @@ resArray - 3-dimensional array where we put results
"""
function executeDataIterFirstPass(analyzedArr, refAray,blockBeginingX,blockBeginingY,blockBeginingZ,isMaskFull,isMaskEmpty,resShmem,locArr,resArray)
@unroll for zIter in UInt16(1):32 # the outermost loop is responsible for the z dimension
processMaskData( analyzedArr[x,y,z+zIter], zIter, resShmem,locArr)
end#for
sync_threads() #we should have in resShmem what we need
@unroll for zIter in UInt16(1):32 # the outermost loop is responsible for the z dimension - important: in this loop we ignore padding, we will deal with it separately
validataDataFirstPass(locArr[32],resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1],resShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,analyzedArr, refAray,resArray,resArraysCounter)
end#for
end#executeDataIter

"""
specializes executeDataIterFirstPass: it does not consider the possibility of the block being empty
"""
function executeDataIterOtherPasses(analyzedArr, refAray,iterationNumber ,blockBeginingX,blockBeginingY,blockBeginingZ,isMaskFull,resShmem,locArr,resArray,resArraysCounter)
@unroll for zIter in UInt16(1):32 # the outermost loop is responsible for the z dimension
processMaskData( analyzedArr[x,y,z+zIter], zIter, resShmem,locArr )
local locBool = testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]
locArr|= locBool << zIter
processMaskData( locBool, zIter, resShmem)
# note: locBool is a plain Bool held in registers, so it does not need to be freed
end#for
sync_threads() #we should have in resShmem what we need
@unroll for zIter in UInt16(1):32 # the outermost loop is responsible for the z dimension - important: in this loop we ignore padding, we will deal with it separately
validataData(locArr[32],resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1],32,resShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,analyzedArr, refAray,resArray,resArraysCounter)
local locBoolRegister = (locArr>>zIter & UInt32(1))==UInt32(1)
local locBoolShmem = resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1]
validataDataFirstPass(locBoolRegister,locBoolShmem,resShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,analyzedArr, refAray,resArray,resArraysCounter,zIter)
end#for
end#executeDataIter

# """
# specializes executeDataIterFirstPass: it does not consider the possibility of the block being empty
# """
# function executeDataIterOtherPasses(analyzedArr, refAray,iterationNumber ,blockBeginingX,blockBeginingY,blockBeginingZ,isMaskFull,resShmem,locArr,resArray,resArraysCounter)
# @unroll for zIter in UInt16(1):32 # the outermost loop is responsible for the z dimension
# local locBool = testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]
# locArr|= locBool << zIter
# processMaskData( locBool, zIter, resShmem)
# CUDA.unsafe_free!(locBool)
# end#for
# sync_threads() #we should have in resShmem what we need
# @unroll for zIter in UInt16(1):32 # the outermost loop is responsible for the z dimension - important: in this loop we ignore padding, we will deal with it separately
# local locBoolRegister = (locArr>>zIter & UInt32(1))==1
# local locBoolShmem = resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1]
# validataDataOtherPass(locBoolRegister,locBoolShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,analyzedArr, refAray,resArray,resArraysCounter)
# CUDA.unsafe_free!(locBool)


# end#for
# end#executeDataIter

#numb>>1 & UInt32(1)



Expand All @@ -54,25 +68,30 @@ uploaded data from shared memory in amask of intrest gets processed in this func
- also we need to make sure that in corner cases we are getting to correct spot
"""
function processMaskData(maskBool::Bool
,zIter::UInt16
,zIter::UInt8
,resShmem
,locArr )
) #::CUDA.CuRefValue{Int32}
# save it to registers - we will need it later
locArr[zIter]=maskBool
#locArr[zIter]=maskBool
#now we are saving results everywhere we are interested in, i.e. all around but without diagonals (we use the supremum norm instead of the Euclidean one)
#locArr.x|= maskBool << zIter
if(maskBool)
resShmem[threadIdxX()+1,threadIdxY()+1,zIter]=true #up
resShmem[threadIdxX()+1,threadIdxY()+1,zIter+2]=true #down
@inbounds resShmem[threadIdxX()+1,threadIdxY()+1,zIter]=true #up
@inbounds resShmem[threadIdxX()+1,threadIdxY()+1,zIter+2]=true #down

resShmem[threadIdxX(),threadIdxY()+1,zIter+1]=true #left
resShmem[threadIdxX()+2,threadIdxY()+1,zIter+1]=true #right
@inbounds resShmem[threadIdxX(),threadIdxY()+1,zIter+1]=true #left
@inbounds resShmem[threadIdxX()+2,threadIdxY()+1,zIter+1]=true #right

resShmem[threadIdxX()+1,threadIdxY()+2,zIter+1]=true #front
resShmem[threadIdxX()+1,threadIdxY(),zIter+1]=true #back
@inbounds resShmem[threadIdxX()+1,threadIdxY()+2,zIter+1]=true #front
@inbounds resShmem[threadIdxX()+1,threadIdxY(),zIter+1]=true #back
end#if

end#processMaskData
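
For intuition, a minimal CPU sketch of the same 6-neighbour (supremum-norm, no diagonals) dilatation step; dilate6! is a hypothetical helper, not part of this module, and the array is assumed to be padded so the ±1 indices stay in bounds:

function dilate6!(res::AbstractArray{Bool,3}, x::Int, y::Int, z::Int)
    res[x, y, z - 1] = true # up
    res[x, y, z + 1] = true # down
    res[x - 1, y, z] = true # left
    res[x + 1, y, z] = true # right
    res[x, y + 1, z] = true # front
    res[x, y - 1, z] = true # back
end

res = falses(5, 5, 5)
dilate6!(res, 3, 3, 3)
@assert count(res) == 6 # exactly the six face neighbours are set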

function myIncreaseBitt(maskBool::Bool,zIter::UInt8, locArr::CUDA.CuRefValue{Int32} )::Bool
locArr.x|= maskBool << zIter
return true
end
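
A quick host-side sketch of the bit-packing round trip that myIncreaseBitt and the (locArr>>zIter & UInt32(1))==UInt32(1) test rely on; a plain Ref stands in for the register value here:

locArr = Ref(Int32(0))
mask = (true, false, true, false)
for (zIter, maskBool) in enumerate(mask)
    locArr[] |= Int32(maskBool) << zIter # pack: set bit zIter when the voxel is true
end
# unpack: mirrors the (locArr >> zIter) & 1 == 1 test from the kernel loop
unpacked = [(locArr[] >> zIter) & Int32(1) == Int32(1) for zIter in 1:4]
@assert Tuple(unpacked) == mask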


"""
-so we uploaded all the data that we consider new - around voxels that are "true" - but we can be sure that some of those were already true earlier
Expand All @@ -88,54 +107,56 @@ locVal - value from registers
shmemVal - value associated with this thread from shared memory - where we marked neighbours ...
resShmem - shared memory with our preliminary results
isMaskFull, isMaskEmpty - register values needed to specify whether the block is full, empty, or neither
x,y,z - needed to access data from the main data array in global memory
blockBeginingX,blockBeginingY,blockBeginingZ - coordinates where our block begins - will be used as offsets by our threads
masktoUpdate - the mask that we analyzed; we now write dilatation data into it
maskToCompare - the other mask that we need to check before we write to the result array
iterationNumber - the iteration we are currently in - the bigger it is, the higher the Hausdorff distance
resArray
"""
function validataDataFirstPass(locVal::Bool
,shmemVal::Bool
,resShmem
,isMaskFull::MVector{1, Bool}
,isMaskEmpty::MVector{1, Bool}
,x::UInt16
,y::UInt16
,z::UInt16
,isMaskFull#::CUDA.CuRefValue{Bool}
,isMaskEmpty#::CUDA.CuRefValue{Bool}
,blockBeginingX::UInt8
,blockBeginingY::UInt8
,blockBeginingZ::UInt8
,maskToCompare
,masktoUpdate
,resArray
,resArraysCounter)
,resArraysCounter
,zIter::UInt8)::Bool
#when both this one and the previous one are true it will stay true
setIsFullOrEmpty!((locVal | shmemVal),isMaskFull,isMaskEmpty )
locValOrShmem = (locVal | shmemVal)
isMaskFull.x= locValOrShmem & isMaskFull.x
isMaskEmpty.x = ~locValOrShmem & isMaskEmpty.x

if(!locVal && shmemVal)
# setting value in global memory
masktoUpdate[x,y,z+32]= true
@inbounds masktoUpdate[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]= true
# if we are here, we have some voxel that was false in the primary mask and is now becoming true - if it is additionally true in the reference mask we need to add it to the result
if(maskToCompare[x,y,z+32])
resArray[x,y,z+32]=UInt16(1)
CUDA.atomic_inc!(pointer(resArraysCounter), UInt16(1))
if(@inbounds(maskToCompare[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]))
@inbounds resArray[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]=UInt16(1)

atomicallyAddOneUint(resArraysCounter)
end#if
end#if

return true
end
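
The isMaskFull/isMaskEmpty update above reduces to two running bit-ANDs; a host-side sketch, with plain Refs standing in for the kernel's register values:

isMaskFull, isMaskEmpty = Ref(true), Ref(true)
for locValOrShmem in (true, false, true) # values one thread sees along z
    isMaskFull[] = locValOrShmem & isMaskFull[] # stays full only if every voxel is true
    isMaskEmpty[] = ~locValOrShmem & isMaskEmpty[] # stays empty only if every voxel is false
end
@assert !isMaskFull[] && !isMaskEmpty[] # mixed block - neither full nor empty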

"""
specializes validataDataFirstPass, ignoring the case of a potentially empty mask
iterationNumber - the iteration we are currently in - the bigger it is, the higher the Hausdorff distance
"""
function validataDataOtherPass(locVal::Bool
,shmemVal::Bool
,resShmem
,isMaskFull::MVector{1, Bool}
,isMaskEmpty::MVector{1, Bool}
,x::UInt16
,y::UInt16
,z::UInt16
,blockBeginingX,blockBeginingY,blockBeginingZ
,maskToCompare
,masktoUpdate
,resArray
,iterationNumber::UInt16
,resArraysCounter)
,resArraysCounter
,zIter)
#when both this one and the previous one are true it will stay true
setIsFull!((locVal | shmemVal),isMaskEmpty )
if(!locVal && shmemVal)
Expand All @@ -153,18 +174,6 @@ end



"""
set the isMaskFull and isMaskEmpty
locVal - value of the voxel from the registers (what was loaded from global memory and not modified)
shmemVal - what was loaded into shared memory - so after dilatation
yet we pass in locValOrShmem (locVal | shmemVal)
"""
function setIsFullOrEmpty!(locValOrShmem::Bool
,isMaskFull::MVector{1, Bool}
,isMaskEmpty::MVector{1, Bool} )
isMaskFull[1]= locValOrShmem & isMaskFull[1]
isMaskEmpty[1] = ~locValOrShmem & isMaskEmpty[1]
end#setIsFullOrEmpty


function setIsFull!(locValOrShmem::Bool
Expand Down
24 changes: 23 additions & 1 deletion src/utils/CUDAGpuUtils.jl
@@ -3,9 +3,20 @@ module CUDAGpuUtils

using CUDA, StaticArrays

export clearLocArrdefineIndicies,computeBlocksFromOccupancy,reduce_warp,getKernelContants,assignWorkToCooperativeBlocks,getMaxBlocksPerMultiproc,reduce_warp_max,reduce_warp_min,reduce_warp_min,reduce_warp_or,reduce_warp_and,blockIdxZ,blockIdxY,blockIdxX,blockDimZ, blockDimY, blockDimX, threadIdxX, threadIdxY, threadIdxZ
export atomicallyAddOneUint,clearLocArrdefineIndicies,computeBlocksFromOccupancy,reduce_warp,getKernelContants,assignWorkToCooperativeBlocks,getMaxBlocksPerMultiproc,reduce_warp_max,reduce_warp_min,reduce_warp_min,reduce_warp_or,reduce_warp_and,blockIdxZ,blockIdxY,blockIdxX,blockDimZ, blockDimY, blockDimX, threadIdxX, threadIdxY, threadIdxZ
export @unroll, @ifX, @ifY, @ifXY



"""
atomically adds 1 to the given length-1 array
the element type needs to be UInt32
"""
function atomicallyAddOneUint(arr)
# atomic_add! matches the UInt32 element type; atomic_inc! has wrap-around semantics and would reset the counter at the given value
CUDA.atomic_add!(pointer(arr), UInt32(1))
end
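
A minimal usage sketch, assuming the counter is a length-1 UInt32 device array (countingKernel is just an illustration, not part of this module):

using CUDA

counter = CUDA.zeros(UInt32, 1)

function countingKernel(counter)
    atomicallyAddOneUint(counter) # every thread bumps the counter once
    return nothing
end

@cuda threads = 32 countingKernel(counter)
@assert Array(counter)[1] == UInt32(32)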


"""
convenience macro that will execute its body only for the given thread id X
"""
@@ -334,4 +345,15 @@ end #assignWorkToCooperativeBlocks
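
The macro definitions themselves are collapsed in this view; as a rough sketch (an assumption, not the file's actual code), such a guard macro could look like:

# hypothetical reconstruction: run ex only on the thread whose x index equals x
macro ifX(x, ex)
    return esc(quote
        if threadIdxX() == $x
            $ex
        end
    end)
end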














end #CUDAGpuUtils
42 changes: 42 additions & 0 deletions test/GPUtestUtils.jl
@@ -0,0 +1,42 @@


function fillGlobalFromShmem(testArrInn,resShmem)
    # copy this thread's full padded column (z spans the whole 34-long padded axis)
    for z in 1:34
        testArrInn[threadIdxX()+1,threadIdxY()+1,z ]=resShmem[threadIdxX()+1,threadIdxY()+1,z ]
    end
    # copy the x=1, x=34, y=1 and y=34 boundary faces of the padded 34x34x34 cube
    testArrInn[1,threadIdxX()+1,threadIdxY()+1]= resShmem[1,threadIdxX()+1,threadIdxY()+1]
    testArrInn[34,threadIdxX()+1,threadIdxY()+1]= resShmem[34,threadIdxX()+1,threadIdxY()+1]
    testArrInn[threadIdxX()+1,1,threadIdxY()+1]= resShmem[threadIdxX()+1,1,threadIdxY()+1]
    testArrInn[threadIdxX()+1,34,threadIdxY()+1]= resShmem[threadIdxX()+1,34,threadIdxY()+1]
end
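
A hedged sketch of how such a helper could be driven from a test kernel; dumpShmemKernel is an illustration, and the @cuStaticSharedMem size is an assumption based on the 34x34x34 padded block used throughout:

function dumpShmemKernel(testArrInn)
    resShmem = @cuStaticSharedMem(Bool, (34, 34, 34))
    # ... the kernel under test fills resShmem here ...
    sync_threads()
    fillGlobalFromShmem(testArrInn, resShmem)
    return nothing
end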


function getIndiciesWithTrue(arr)
    # return the CartesianIndex of every true entry in arr
    indicies = CartesianIndices(arr)
    return filter(ind-> arr[ind] ,indicies)
end
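
For example, on a small host-side array:

arr = falses(3, 3, 3)
arr[1, 2, 3] = true
arr[3, 3, 3] = true
getIndiciesWithTrue(arr) # -> [CartesianIndex(1, 2, 3), CartesianIndex(3, 3, 3)]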


# function fillGlobalFromShmem(testArrInn,resShmem)
# for z in 1:34
# testArrInn[threadIdxX()+1,threadIdxY()+1,z ]=resShmem[threadIdxX()+1,threadIdxY()+1,z ]
# sync_threads()
# end

# for z in 1:32
# testArrInn[threadIdxX()+1,threadIdxY()+2,z+1 ]= resShmem[threadIdxX()+1,threadIdxY()+2,z+1 ]
# sync_threads()
# testArrInn[threadIdxX()+1,threadIdxY(),z+1 ]= resShmem[threadIdxX()+1,threadIdxY()+2,z+1 ]
# sync_threads()
# testArrInn[threadIdxX()+2,threadIdxY()+1,z+1 ]= resShmem[threadIdxX()+2,threadIdxY()+1,z+1 ]
# sync_threads()
# testArrInn[threadIdxX(),threadIdxY()+1,z+1 ]= resShmem[threadIdxX(),threadIdxY()+1,z+1 ]
# sync_threads()
# end

# end
