From a52ab90d5e353795be9da8e1be0eda0a5e3e6130 Mon Sep 17 00:00:00 2001 From: Jakub Mitura <53857487+jakubMitura14@users.noreply.github.com> Date: Sun, 26 Sep 2021 19:23:36 +0200 Subject: [PATCH] protoyping --- .vscode/launch.json | 1 + Manifest.toml | 4 +- Project.toml | 1 + .../mainHouseDorffKernel/ProcessMainData.jl | 131 ++++---- src/utils/CUDAGpuUtils.jl | 24 +- test/GPUtestUtils.jl | 42 +++ test/aPrfofiling/profilingProcessMaskData.jl | 139 +++++++++ .../distanceMetr/HousdorffTest/HFUtilsTest.jl | 99 ++++-- .../HousdorffTest/ProcessMainDataTest.jl | 295 ++++++++++++++++++ .../HousdorffTest/forDebugSeparate.jl | 58 ++++ 10 files changed, 707 insertions(+), 87 deletions(-) create mode 100644 test/GPUtestUtils.jl create mode 100644 test/aPrfofiling/profilingProcessMaskData.jl create mode 100644 test/distanceMetr/HousdorffTest/ProcessMainDataTest.jl create mode 100644 test/distanceMetr/HousdorffTest/forDebugSeparate.jl diff --git a/.vscode/launch.json b/.vscode/launch.json index c043748..7728969 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -4,6 +4,7 @@ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ + { "type": "julia", "request": "launch", diff --git a/Manifest.toml b/Manifest.toml index 161832b..a2e0ff9 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -440,9 +440,9 @@ version = "2.1.2" [[MacroTools]] deps = ["Markdown", "Random"] -git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7" +git-tree-sha1 = "5a5bc6bf062f0f95e62d0fe0a2d99699fed82dd9" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.7" +version = "0.5.8" [[Markdown]] deps = ["Base64"] diff --git a/Project.toml b/Project.toml index d57e5e7..7f41c7c 100644 --- a/Project.toml +++ b/Project.toml @@ -11,6 +11,7 @@ CxxWrap = "1f15a43c-97ca-5a2a-ae31-89f07a497df4" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" InfoZIP = "f4508453-b816-52ab-a864-26fc7f6211fc" +MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" MedEye3d = "48a1af7b-3279-4eeb-8f2b-7ca229bb51b1" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" diff --git a/src/distanceMetrics/Housdorff/mainHouseDorffKernel/ProcessMainData.jl b/src/distanceMetrics/Housdorff/mainHouseDorffKernel/ProcessMainData.jl index bc448a6..b2cb49a 100644 --- a/src/distanceMetrics/Housdorff/mainHouseDorffKernel/ProcessMainData.jl +++ b/src/distanceMetrics/Housdorff/mainHouseDorffKernel/ProcessMainData.jl @@ -5,8 +5,8 @@ loads and do the main processing of data in arrays of intrest (padding of shmem """ module ProcessMainData - -export executeDataIterFirstPass,executeDataIterOtherPasses +using StaticArrays,Main.CUDAGpuUtils ,Main.HFUtils, CUDA +export executeDataIterFirstPass,executeDataIterOtherPasses,processMaskData """ @@ -22,27 +22,41 @@ resArray- 3 dimensional array where we put results """ function executeDataIterFirstPass(analyzedArr, refAray,blockBeginingX,blockBeginingY,blockBeginingZ,isMaskFull,isMaskEmpty,resShmem,locArr,resArray) @unroll for zIter in UInt16(1):32# most outer loop is responsible for z dimension - processMaskData( analyzedArr[x,y,z+zIter], zIter, resShmem,locArr) - end#for - sync_threads() #we should have in resShmem what we need - @unroll for zIter in UInt16(1):32 # most outer loop is responsible for z dimension - importnant in this loop we ignore padding we will deal with it separately - 
validataDataFirstPass(locArr[32],resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1],resShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,analyzedArr, refAray,resArray,resArraysCounter) - end#for -end#executeDataIter - -""" -specializes executeDataIterFirstPass as it do not consider possibility of block being empty -""" -function executeDataIterOtherPasses(analyzedArr, refAray,iterationNumber ,blockBeginingX,blockBeginingY,blockBeginingZ,isMaskFull,resShmem,locArr,resArray,resArraysCounter) - @unroll for zIter in UInt16(1):32# most outer loop is responsible for z dimension - processMaskData( analyzedArr[x,y,z+zIter], zIter, resShmem,locArr ) + local locBool = testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)] + locArr|= locBool << zIter + processMaskData( locBool, zIter, resShmem) + CUDA.unsafe_free!(locBool) end#for sync_threads() #we should have in resShmem what we need @unroll for zIter in UInt16(1):32 # most outer loop is responsible for z dimension - importnant in this loop we ignore padding we will deal with it separately - validataData(locArr[32],resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1],32,resShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,analyzedArr, refAray,resArray,resArraysCounter) + local locBoolRegister = (locArr>>zIter & UInt32(1))==UInt32(1) + local locBoolShmem = resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1] + validataDataFirstPass(locBoolRegister,locBoolShmem,resShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,analyzedArr, refAray,resArray,resArraysCounter,zIter) end#for end#executeDataIter +# """ +# specializes executeDataIterFirstPass as it do not consider possibility of block being empty +# """ +# function executeDataIterOtherPasses(analyzedArr, refAray,iterationNumber ,blockBeginingX,blockBeginingY,blockBeginingZ,isMaskFull,resShmem,locArr,resArray,resArraysCounter) +# @unroll for zIter in UInt16(1):32# most outer loop is responsible for z dimension +# local locBool = testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)] +# locArr|= locBool << zIter +# processMaskData( locBool, zIter, resShmem) +# CUDA.unsafe_free!(locBool) +# end#for +# sync_threads() #we should have in resShmem what we need +# @unroll for zIter in UInt16(1):32 # most outer loop is responsible for z dimension - importnant in this loop we ignore padding we will deal with it separately +# local locBoolRegister = (locArr>>zIter & UInt32(1))==1 +# local locBoolShmem = resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1] +# validataDataOtherPass(locBoolRegister,locBoolShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,analyzedArr, refAray,resArray,resArraysCounter) +# CUDA.unsafe_free!(locBool) + + +# end#for +# end#executeDataIter + +#numb>>1 & UInt32(1) @@ -54,25 +68,30 @@ uploaded data from shared memory in amask of intrest gets processed in this func - also we need to make sure that in corner cases we are getting to correct spot """ function processMaskData(maskBool::Bool - ,zIter::UInt16 + ,zIter::UInt8 ,resShmem - ,locArr ) + ) #::CUDA.CuRefValue{Int32} # save it to registers - we will need it later - locArr[zIter]=maskBool + #locArr[zIter]=maskBool #now we are saving results evrywhere we are intrested in so around without diagonals (we use supremum norm instead of euclidean) + #locArr.x|= maskBool << zIter if(maskBool) - resShmem[threadIdxX()+1,threadIdxY()+1,zIter]=true #up - 
resShmem[threadIdxX()+1,threadIdxY()+1,zIter+2]=true #down + @inbounds resShmem[threadIdxX()+1,threadIdxY()+1,zIter]=true #up + @inbounds resShmem[threadIdxX()+1,threadIdxY()+1,zIter+2]=true #down - resShmem[threadIdxX(),threadIdxY()+1,zIter+1]=true #left - resShmem[threadIdxX()+2,threadIdxY()+1,zIter+1]=true #right + @inbounds resShmem[threadIdxX(),threadIdxY()+1,zIter+1]=true #left + @inbounds resShmem[threadIdxX()+2,threadIdxY()+1,zIter+1]=true #right - resShmem[threadIdxX()+1,threadIdxY()+2,zIter+1]=true #front - resShmem[threadIdxX()+1,threadIdxY(),zIter+1]=true #back + @inbounds resShmem[threadIdxX()+1,threadIdxY()+2,zIter+1]=true #front + @inbounds resShmem[threadIdxX()+1,threadIdxY(),zIter+1]=true #back end#if - end#processMaskData +function myIncreaseBitt(maskBool::Bool,zIter::UInt8, locArr::CUDA.CuRefValue{Int32} )::Bool + locArr.x|= maskBool << zIter + return true +end + """ -so we uploaded all data that we consider new - around voxels that are "true" but we can be sure that some of those were already true earlier @@ -88,54 +107,56 @@ locVal - value from registers shmemVal - value associated with this thread from shared memory - where we marked neighbours ... resShmem - shared memory with our preliminary results isMaskFull, isMaskEmpty - register values needed to specify weather we have full or empty or neither block -x,y,z - needed to access data from main data array in global memory +blockBeginingX,blockBeginingY,blockBeginingZ - coordinates where our block is begining - will be used as offset by our threads masktoUpdate - mask that we analyzed and now we write to data about dilatation maskToCompare - the other mask that we need to check before we write to result array -iterationNumber - in which iteration we are currently - the bigger it is the higher housedorfrf,, - +resArray """ function validataDataFirstPass(locVal::Bool ,shmemVal::Bool - ,resShmem - ,isMaskFull::MVector{1, Bool} - ,isMaskEmpty::MVector{1, Bool} - ,x::UInt16 - ,y::UInt16 - ,z::UInt16 + ,isMaskFull#::CUDA.CuRefValue{Bool} + ,isMaskEmpty#::CUDA.CuRefValue{Bool} + ,blockBeginingX::UInt8 + ,blockBeginingY::UInt8 + ,blockBeginingZ::UInt8 ,maskToCompare ,masktoUpdate ,resArray - ,resArraysCounter) + ,resArraysCounter + ,zIter::UInt8)::Bool #when this one and previous is true it will still be true - setIsFullOrEmpty!((locVal | shmemVal),isMaskFull,isMaskEmpty ) + locValOrShmem = (locVal | shmemVal) + isMaskFull.x= locValOrShmem & isMaskFull.x + isMaskEmpty.x = ~locValOrShmem & isMaskEmpty.x + if(!locVal && shmemVal) # setting value in global memory - masktoUpdate[x,y,z+32]= true + @inbounds masktoUpdate[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]= true # if we are here we have some voxel that was false in a primary mask and is becoming now true - if it is additionaly true in reference we need to add it to result - if(maskToCompare[x,y,z+32]) - resArray[x,y,z+32]=UInt16(1) - CUDA.atomic_inc!(pointer(resArraysCounter), UInt16(1)) + if(maskToCompare[@inbounds (blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]) + @inbounds resArray[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]=UInt16(1) + + atomicallyAddOneUint32(resArraysCounter) end#if end#if - +return true end """ specializes validataDataFirstPass ignoring case of potentially empty mask +iterationNumber - in which iteration we are currently - the bigger it is the higher housedorfrf,, + """ function validataDataOtherPass(locVal::Bool ,shmemVal::Bool - 
,resShmem - ,isMaskFull::MVector{1, Bool} ,isMaskEmpty::MVector{1, Bool} - ,x::UInt16 - ,y::UInt16 - ,z::UInt16 + ,blockBeginingX,blockBeginingY,blockBeginingZ ,maskToCompare ,masktoUpdate ,resArray ,iterationNumber::UInt16 - ,resArraysCounter) + ,resArraysCounter + ,zIter) #when this one and previous is true it will still be true setIsFull!((locVal | shmemVal),isMaskEmpty ) if(!locVal && shmemVal) @@ -153,18 +174,6 @@ end -""" -set the isMaskFull and isMaskEmpty -locVal - value of the voxel from registry (what was loaded from globl memory and not modified) -shmemVal - what was loaded in shared memory - so after dilatation -yet we pass into locValOrShmem (locVal | shmemVal) -""" -function setIsFullOrEmpty!(locValOrShmem::Bool - ,isMaskFull::MVector{1, Bool} - ,isMaskEmpty::MVector{1, Bool} ) - isMaskFull[1]= locValOrShmem & isMaskFull[1] - isMaskEmpty[1] = ~locValOrShmem & isMaskEmpty[1] -end#setIsFullOrEmpty function setIsFull!(locValOrShmem::Bool diff --git a/src/utils/CUDAGpuUtils.jl b/src/utils/CUDAGpuUtils.jl index e7bc2a5..4606968 100644 --- a/src/utils/CUDAGpuUtils.jl +++ b/src/utils/CUDAGpuUtils.jl @@ -3,9 +3,20 @@ module CUDAGpuUtils using CUDA, StaticArrays -export clearLocArrdefineIndicies,computeBlocksFromOccupancy,reduce_warp,getKernelContants,assignWorkToCooperativeBlocks,getMaxBlocksPerMultiproc,reduce_warp_max,reduce_warp_min,reduce_warp_min,reduce_warp_or,reduce_warp_and,blockIdxZ,blockIdxY,blockIdxX,blockDimZ, blockDimY, blockDimX, threadIdxX, threadIdxY, threadIdxZ +export atomicallyAddOneUint,clearLocArrdefineIndicies,computeBlocksFromOccupancy,reduce_warp,getKernelContants,assignWorkToCooperativeBlocks,getMaxBlocksPerMultiproc,reduce_warp_max,reduce_warp_min,reduce_warp_min,reduce_warp_or,reduce_warp_and,blockIdxZ,blockIdxY,blockIdxX,blockDimZ, blockDimY, blockDimX, threadIdxX, threadIdxY, threadIdxZ export @unroll, @ifX, @ifY, @ifXY + + +""" +atomically add to given 1 length array 1 +data type need to be UInt32 +""" +function atomicallyAddOneUint(arr) + CUDA.atomic_inc!(pointer(arr), UInt16(1)) +end + + """ convinience macro that will execute only if it has given thread Id X """ @@ -334,4 +345,15 @@ end #assignWorkToCooperativeBlocks + + + + + + + + + + + end #CUDAGpuUtils \ No newline at end of file diff --git a/test/GPUtestUtils.jl b/test/GPUtestUtils.jl new file mode 100644 index 0000000..baca9af --- /dev/null +++ b/test/GPUtestUtils.jl @@ -0,0 +1,42 @@ + + +function fillGlobalFromShmem(testArrInn,resShmem) + for z in 1:34 + testArrInn[threadIdxX()+1,threadIdxY()+1,z ]=resShmem[threadIdxX()+1,threadIdxY()+1,z ] + end + + testArrInn[1,threadIdxX()+1,threadIdxY()+1]= resShmem[1,threadIdxX()+1,threadIdxY()+1] + testArrInn[34,threadIdxX()+1,threadIdxY()+1]= resShmem[34,threadIdxX()+1,threadIdxY()+1] + testArrInn[threadIdxX()+1,1,threadIdxY()+1]= resShmem[threadIdxX()+1,1,threadIdxY()+1] + testArrInn[threadIdxX()+1,34,threadIdxY()+1]= resShmem[threadIdxX()+1,34,threadIdxY()+1] + + +end + + +function getIndiciesWithTrue(arr) + indicies = CartesianIndices(arr) + return filter(ind-> arr[ind] ,indicies) + + +end + + +# function fillGlobalFromShmem(testArrInn,resShmem) +# for z in 1:34 +# testArrInn[threadIdxX()+1,threadIdxY()+1,z ]=resShmem[threadIdxX()+1,threadIdxY()+1,z ] +# sync_threads() +# end + +# for z in 1:32 +# testArrInn[threadIdxX()+1,threadIdxY()+2,z+1 ]= resShmem[threadIdxX()+1,threadIdxY()+2,z+1 ] +# sync_threads() +# testArrInn[threadIdxX()+1,threadIdxY(),z+1 ]= resShmem[threadIdxX()+1,threadIdxY()+2,z+1 ] +# sync_threads() +# 
testArrInn[threadIdxX()+2,threadIdxY()+1,z+1 ]= resShmem[threadIdxX()+2,threadIdxY()+1,z+1 ] +# sync_threads() +# testArrInn[threadIdxX(),threadIdxY()+1,z+1 ]= resShmem[threadIdxX(),threadIdxY()+1,z+1 ] +# sync_threads() +# end + +# end \ No newline at end of file diff --git a/test/aPrfofiling/profilingProcessMaskData.jl b/test/aPrfofiling/profilingProcessMaskData.jl new file mode 100644 index 0000000..a3d537b --- /dev/null +++ b/test/aPrfofiling/profilingProcessMaskData.jl @@ -0,0 +1,139 @@ + + #nv-nsight-cu-cli --mode=launch julia +# using Test, Revise + +# includet("C:\\GitHub\\GitHub\\NuclearMedEval\\test\\aPrfofiling\\profilingProcessMaskData.jl") + +# CUDA.@profile wrapForProfile() + + + + +using Test, Revise + +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\utils\\CUDAGpuUtils.jl") +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\distanceMetrics\\Housdorff\\mainHouseDorffKernel\\HFUtils.jl") +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\distanceMetrics\\Housdorff\\mainHouseDorffKernel\\ProcessMainData.jl") +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\test\\GPUtestUtils.jl") + +using Main.HFUtils, Main.ProcessMainData,CUDA,Main.CUDAGpuUtils,StaticArrays + + + + +function processMaskDataB(maskBool::Bool + ,zIter::UInt8 + ,resShmem + ,locArr::UInt32 ) +# save it to registers - we will need it later + +locArr|= maskBool << zIter + +#now we are saving results evrywhere we are intrested in so around without diagonals (we use supremum norm instead of euclidean) +if(maskBool) +resShmem[threadIdxX()+1,threadIdxY()+1,zIter]=true #up +resShmem[threadIdxX()+1,threadIdxY()+1,zIter+2]=true #down + +resShmem[threadIdxX(),threadIdxY()+1,zIter+1]=true #left +resShmem[threadIdxX()+2,threadIdxY()+1,zIter+1]=true #right + +resShmem[threadIdxX()+1,threadIdxY()+2,zIter+1]=true #front +resShmem[threadIdxX()+1,threadIdxY(),zIter+1]=true #back +end#if +return true +end#processMaskData + + + +function wrapForProfile() + + testArrIn = CUDA.ones(Bool,32,32,32); + testArrOut = CUDA.zeros(Bool,34,34,34); + + function testKernprocessMaskDataB(testArrInn,testArrOut) + resShmem = @cuStaticSharedMem(Bool,(34,34,34))#+2 in order to get the one padding + clearMainShmem(resShmem) + blockBeginingX=UInt8(0) + blockBeginingY=UInt8(0) + blockBeginingZ=UInt8(0) + isMaskFull= false + isMaskEmpty= false + #here we will store in registers data uploaded from mask for later verification wheather we should send it or not + locArr= UInt32(0) + + @unroll for zIter in UInt8(1):UInt8(32)# most outer loop is responsible for z dimension + local locBool = testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)] + locArr|= locBool << zIter + processMaskData( locBool, zIter, resShmem) + end + sync_threads() + #fillGlobalFromShmem(testArrOut,resShmem) + + return + end + + @cuda threads=(32,32) blocks=1 testKernprocessMaskDataB(testArrIn,testArrOut) + +end + + +# testArrIn = CUDA.ones(Bool,32,32,32); +# testArrOut = CUDA.zeros(Bool,34,34,34); + +# function testKernprocessMaskDataB(testArrInn,testArrOut) +# resShmem = @cuStaticSharedMem(Bool,(34,34,34))#+2 in order to get the one padding +# clearMainShmem(resShmem) +# blockBeginingX=UInt8(0) +# blockBeginingY=UInt8(0) +# blockBeginingZ=UInt8(0) +# isMaskFull= false +# isMaskEmpty= false +# #here we will store in registers data uploaded from mask for later verification wheather we should send it or not +# locArr= UInt32(0) + +# @unroll for zIter in UInt8(1):UInt8(32)# most outer loop is responsible for z dimension +# processMaskData( 
testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)], zIter, resShmem,locArr) +# end +# sync_threads() +# #fillGlobalFromShmem(testArrOut,resShmem) + +# return +# end + + +# @cuda threads=(32,32) blocks=1 testKernprocessMaskDataB(testArrIn,testArrOut) + + + + + + + + + + +# numb = UInt32(0) +# #settingcorrectly +# numb |= true << UInt8(2) +# numb + + +# numb = UInt32(0) +# #settingcorrectly +# numb |= 1 << 1 +# numb |= 1 << 2 +# numb |= 1 << 0 +# numb |= 1 << 0 +# numb |= 1 << 5 +# numb + + +#reading... +# numb>>1 & UInt32(1) +# numb>>2 & UInt32(1) +# numb>>3 & UInt32(1) +# numb>>4 & UInt32(1) +# numb>>5 & UInt32(1) + + +# #processMaskDataB( testArrInn[threadIdxY(),1,1], zIter, resShmem ) # coalesced \ No newline at end of file diff --git a/test/distanceMetr/HousdorffTest/HFUtilsTest.jl b/test/distanceMetr/HousdorffTest/HFUtilsTest.jl index 380be5e..3bb3603 100644 --- a/test/distanceMetr/HousdorffTest/HFUtilsTest.jl +++ b/test/distanceMetr/HousdorffTest/HFUtilsTest.jl @@ -4,7 +4,7 @@ using Test, Revise includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\utils\\CUDAGpuUtils.jl") includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\distanceMetrics\\Housdorff\\mainHouseDorffKernel\\HFUtils.jl") - +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\test\\GPUtestUtils.jl") using Main.HFUtils using Main.CUDAGpuUtils,Cthulhu,BenchmarkTools , CUDA, StaticArrays @@ -91,21 +91,22 @@ end # clearLocArr resShmem[threadIdxX()+2,threadIdxY(),z ]=1 resShmem[threadIdxX(),threadIdxY()+2,z ]=1 end + fillGlobalFromShmem(testArrInn,resShmem) # clearPadding(resShmem) - for z in 1:34 - testArrInn[threadIdxX(),threadIdxY(),z ]=resShmem[threadIdxX(),threadIdxY(),z ] - testArrInn[threadIdxX()+2,threadIdxY()+2,z ]= resShmem[threadIdxX()+2,threadIdxY()+2,z ] - testArrInn[threadIdxX(),threadIdxY()+2,z ]= resShmem[threadIdxX(),threadIdxY()+2,z ] - testArrInn[threadIdxX()+2,threadIdxY(),z ]= resShmem[threadIdxX()+2,threadIdxY(),z ] - end + # for z in 1:34 + # testArrInn[threadIdxX(),threadIdxY(),z ]=resShmem[threadIdxX(),threadIdxY(),z ] + # testArrInn[threadIdxX()+2,threadIdxY()+2,z ]= resShmem[threadIdxX()+2,threadIdxY()+2,z ] + # testArrInn[threadIdxX(),threadIdxY()+2,z ]= resShmem[threadIdxX(),threadIdxY()+2,z ] + # testArrInn[threadIdxX()+2,threadIdxY(),z ]= resShmem[threadIdxX()+2,threadIdxY(),z ] + # end return end @cuda threads=(32,32) blocks=1 testclearPadding(testArr) - @test (length(testArr)-sum(testArr)) ==0 + @test (length(testArr)-(sum(testArr)+(4*34)+(8*32) )) ==0 @@ -125,21 +126,9 @@ sync_threads() clearPadding(resShmem) clearMainShmem(resShmem) sync_threads() - for z in 1:34 - testArrInn[threadIdxX()+1,threadIdxY()+1,z ]=resShmem[threadIdxX()+1,threadIdxY()+1,z ] - sync_threads() - end - - for z in 1:32 - testArrInn[threadIdxX()+1,threadIdxY()+2,z+1 ]= resShmem[threadIdxX()+1,threadIdxY()+2,z+1 ] - sync_threads() - testArrInn[threadIdxX()+1,threadIdxY(),z+1 ]= resShmem[threadIdxX()+1,threadIdxY()+2,z+1 ] - sync_threads() - testArrInn[threadIdxX()+2,threadIdxY()+1,z+1 ]= resShmem[threadIdxX()+2,threadIdxY()+1,z+1 ] - sync_threads() - testArrInn[threadIdxX(),threadIdxY()+1,z+1 ]= resShmem[threadIdxX(),threadIdxY()+1,z+1 ] - sync_threads() - end + + fillGlobalFromShmem(testArrInn,resShmem) + return end @cuda threads=(32,32) blocks=1 testclearPadding(testArrB) @@ -152,3 +141,67 @@ end#clearPadding +# macro addArguments(x, ex) +# return esc(:(if threadIdxX()==$x +# $ex +# end)) + +# end + + +# macro times3(ex) +# return _times3(ex) +# end + + + +# function _times3(ex) +# # 
if ex.head == :call && ex.args[1] == :+ +# # ex.args[1] = :* +# # end +# push!(ex.args,3) + +# return ex +# end + + +# function addd(aa,bb) +# return aa+bb +# end + +# a = 2; b = 3 + +# @times3 addd(a) + + +# function outer(bb) +# aa=5 +# addd(bb) +# end + +# using MacroTools + +# ex = quote +# struct someStr +# x::Int +# y +# end +# end + +# prettify(ex) + + +# @capture(ex, struct T_ fields__ end) +# T, fields + + +# exB = quote +# xi::Int = 2 +# xiB::Int = 2 +# xiC::Int = 2 +# xiD::Int = 2 +# end + +# MacroTools.prewalk(x -> @show(x) isa Symbol ? x : x, exB); + +# @capture(exB, Symbol__ ) \ No newline at end of file diff --git a/test/distanceMetr/HousdorffTest/ProcessMainDataTest.jl b/test/distanceMetr/HousdorffTest/ProcessMainDataTest.jl new file mode 100644 index 0000000..87b8f3f --- /dev/null +++ b/test/distanceMetr/HousdorffTest/ProcessMainDataTest.jl @@ -0,0 +1,295 @@ +using Test, Revise +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\utils\\CUDAGpuUtils.jl") +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\distanceMetrics\\Housdorff\\mainHouseDorffKernel\\HFUtils.jl") +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\distanceMetrics\\Housdorff\\mainHouseDorffKernel\\ProcessMainData.jl") +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\test\\GPUtestUtils.jl") + +using Main.HFUtils +using Main.CUDAGpuUtils,Cthulhu,BenchmarkTools , CUDA, StaticArrays + +using Main.HFUtils, Main.ProcessMainData,CUDA,Main.CUDAGpuUtils,StaticArrays +using Main.CUDAGpuUtils,Cthulhu,BenchmarkTools , CUDA, StaticArrays + + +@testset "processMaskData" begin + + + ####### first it should be zeros + testArrIn = CUDA.zeros(Bool,34,34,34); + testArrOut = CUDA.zeros(Bool,34,34,34); + function testKernprocessMaskData(testArrInn,testArrOut) + resShmem = @cuStaticSharedMem(Bool,(34,34,34))#+2 in order to get the one padding + clearMainShmem(resShmem) + blockBeginingX=1 + blockBeginingY=1 + blockBeginingZ=1 + isMaskFull= zeros(MVector{1,Bool}) + isMaskEmpty= ones(MVector{1,Bool}) + #here we will store in registers data uploaded from mask for later verification wheather we should send it or not + locArr= UInt32(0) + + @unroll for zIter in UInt8(1):UInt8(32)# most outer loop is responsible for z dimension + processMaskData( testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)], zIter, resShmem) + end + fillGlobalFromShmem(testArrOut,resShmem) + + return + end + @cuda threads=(32,32) blocks=1 testKernprocessMaskData(testArrIn,testArrOut) + @test sum(testArrOut)==0 + ########### now we get one arbitrary point it should lead to 6 in output + testArrIn = CUDA.zeros(Bool,32,32,32); + testArrOut = CUDA.zeros(Bool,34,34,34); + + testArrIn[5,5,5]=true + function testKernprocessMaskDataB(testArrInn,testArrOut) + resShmem = @cuStaticSharedMem(Bool,(34,34,34))#+2 in order to get the one padding + clearMainShmem(resShmem) + blockBeginingX=0 + blockBeginingY=0 + blockBeginingZ=0 + isMaskFull= zeros(MVector{1,Bool}) + isMaskEmpty= ones(MVector{1,Bool}) + #here we will store in registers data uploaded from mask for later verification wheather we should send it or not + locArr= UInt32(0) + + @unroll for zIter in UInt8(1):UInt8(32)# most outer loop is responsible for z dimension + processMaskData( testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)], zIter, resShmem) + end + sync_threads() + fillGlobalFromShmem(testArrOut,resShmem) + + return + end + @cuda threads=(32,32) blocks=1 testKernprocessMaskDataB(testArrIn,testArrOut) + + @test 
sum(testArrOut)==6 + @test testArrOut[5,5,5]==false + @test testArrOut[7,6,6]==true + @test testArrOut[4+1,5+1,5+1]==true + @test testArrOut[5+1,4+1,5+1]==true + @test testArrOut[5+1,6+1,5+1]==true + @test testArrOut[5+1,5+1,4+1]==true + @test testArrOut[5+1,5+1,6+1]==true + + getIndiciesWithTrue(testArrOut) + + ########### checking corner cases + testArrIn = CUDA.zeros(Bool,32,32,32); + testArrOut = CUDA.zeros(Bool,34,34,34); + testLocArr = CUDA.zeros(UInt32,32,32,32); + + + testArrIn[1,1,1]=true + function testKernprocessMaskDataBB(testArrInn,testArrOut) + resShmem = @cuStaticSharedMem(Bool,(34,34,34))#+2 in order to get the one padding + clearMainShmem(resShmem) + blockBeginingX=UInt8(0) + blockBeginingY=UInt8(0) + blockBeginingZ=UInt8(0) + isMaskFull= false + isMaskEmpty= false + #here we will store in registers data uploaded from mask for later verification wheather we should send it or not + locArr= UInt32(0) + + @unroll for zIter in UInt8(1):UInt8(32)# most outer loop is responsible for z dimension + #locArr|= true << zIter + processMaskData( testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)], zIter, resShmem) + # testLocArr[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]=locArr + end + sync_threads() + fillGlobalFromShmem(testArrOut,resShmem) + + return + end + @cuda threads=(32,32) blocks=1 testKernprocessMaskDataBB(testArrIn,testArrOut) + @test sum(testArrOut)==6 + + @test testArrOut[1,1,1]==false + @test testArrOut[2,2,1]==true + #locArr|= true << zIter + + CUDA.reclaim()# just to destroy from gpu our dummy data + + +### stress test with ones data + +testArrIn = CUDA.ones(Bool,34,34,34); +testArrOut = CUDA.zeros(Bool,34,34,34); + +function testKernprocessMaskDataB(testArrInn,testArrOut) + resShmem = @cuStaticSharedMem(Bool,(34,34,34))#+2 in order to get the one padding + clearMainShmem(resShmem) + blockBeginingX=0 + blockBeginingY=0 + blockBeginingZ=0 + + #here we will store in registers data uploaded from mask for later verification wheather we should send it or not + locArr= UInt32(0) + + @unroll for zIter in UInt8(1):UInt8(32)# most outer loop is responsible for z dimension + local locBool = testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)] + # locArr|= locBool << zIter + processMaskData( locBool, zIter, resShmem) + CUDA.unsafe_free!(locBool) + end + sync_threads() + #CUDA.@cuprint "locArr $(Int64(locArr)) \n" + + fillGlobalFromShmem(testArrOut,resShmem) + + return +end +@cuda threads=(32,32) blocks=1 testKernprocessMaskDataB(testArrIn,testArrOut) +@test (length(testArrOut)-(sum(testArrOut)+(4*34)+(8*32) )) ==0 + +end#processMaskData + + + + + +using Test, Revise,CUDA + +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\utils\\CUDAGpuUtils.jl") +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\distanceMetrics\\Housdorff\\mainHouseDorffKernel\\HFUtils.jl") +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\distanceMetrics\\Housdorff\\mainHouseDorffKernel\\ProcessMainData.jl") +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\test\\GPUtestUtils.jl") + + +using Main.CUDAGpuUtils, Main.HFUtils + + + + + + +@testset "processMaskData" begin + + testArrIn = CUDA.ones(Bool,32,32,32); + referenceArray= CUDA.ones(Bool,32,32,32); + resArray = CUDA.zeros(UInt16,32,32,32); + resArraysCounter=CUDA.ones(UInt32,1); + isMaskFull = Ref(false) + isMaskEmpty = Ref(false) + locArr = Ref(Int32(0)) + + + function testprocessMaskData(testArrInn::CuDeviceArray{Bool, 3, 
1},resArray,referenceArray,resArraysCounter,isMaskFull::CUDA.CuRefValue{Bool},isMaskEmpty::CUDA.CuRefValue{Bool},locArr) + blockBeginingX,blockBeginingY,blockBeginingZ =UInt8(0),UInt8(0),UInt8(0) + resShmem = @cuStaticSharedMem(Bool,(34,34,34))#+2 in order to get the one padding + #locArr = Int32(0) + #locArr.x |= true << UInt8(2) + + for zIter::UInt8 in UInt8(1):UInt8(32)# most outer loop is responsible for z dimension + locBool::Bool = @inbounds testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)] + #ProcessMainData.myIncreaseBitt(locBool, zIter,locArr) + #locArr.x|= locBool << zIter + locArr.x|= true << 4 + #locArr[] |= locBool << UInt8(zIter) + processMaskData( locBool, zIter, resShmem) + end#for + + sync_threads() #we should have in resShmem what we need + + # @unroll for zIter::UInt8 in UInt8(1):UInt8(32) # most outer loop is responsible for z dimension - importnant in this loop we ignore padding we will deal with it separately + # local locBoolRegister::Bool = (locArr.x>>zIter & 1)==1 + # local locBoolShmem::Bool = @inbounds resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1] + # #validataDataFirstPass(locBoolRegister,locBoolShmem,resShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,testArrInn, referenceArray,resArray,resArraysCounter,zIter) + # #CUDA.unsafe_free!(locBoolRegister) + # # CUDA.unsafe_free!(locBoolShmem) + # end#for + + return + +end#testprocessMaskData + +@cuda threads=(32,32) blocks=1 testprocessMaskData(testArrIn,resArray,referenceArray,resArraysCounter,isMaskFull,isMaskEmpty,locArr) +isMaskFull + +@device_code_warntype interactive=true @cuda testprocessMaskData(testArrIn,resArray,referenceArray,resArraysCounter,isMaskFull,isMaskEmpty,locArr) + + +outB= false +function modBool(bb) + bb=true +end +modBool(outB) +outB + + +rr = Ref(false) + +function modBoolB(rrr) + rrr[]=true +end +modBoolB(rr) + +rr[] + + + + + +end + + + + + + + +# # nv-nsight-cu-cli --mode=launch julia +# using Test, Revise + +# includet("C:\\GitHub\\GitHub\\NuclearMedEval\\test\\aPrfofiling\\profilingProcessMaskData.jl") + +# CUDA.@profile wrapForProfile() + + +# using Test, Revise + +# includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\utils\\CUDAGpuUtils.jl") +# includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\distanceMetrics\\Housdorff\\mainHouseDorffKernel\\HFUtils.jl") +# includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\distanceMetrics\\Housdorff\\mainHouseDorffKernel\\ProcessMainData.jl") +# includet("C:\\GitHub\\GitHub\\NuclearMedEval\\test\\GPUtestUtils.jl") + +# using Main.HFUtils, Main.ProcessMainData,CUDA,Main.CUDAGpuUtils,StaticArrays + + +# function wrapForProfile() + +# testArrIn = CUDA.ones(Bool,34,34,34); +# testArrOut = CUDA.zeros(Bool,34,34,34); + +# function testKernprocessMaskDataB(testArrInn,testArrOut) +# resShmem = @cuStaticSharedMem(Bool,(34,34,34))#+2 in order to get the one padding +# clearMainShmem(resShmem) +# blockBeginingX=UInt8(0) +# blockBeginingY=UInt8(0) +# blockBeginingZ=UInt8(0) +# isMaskFull= zeros(MVector{1,Bool}) +# isMaskEmpty= ones(MVector{1,Bool}) +# #here we will store in registers data uploaded from mask for later verification wheather we should send it or not +# locArr= zeros(MVector{32,Bool}) + +# @unroll for zIter in UInt8(1):UInt8(32)# most outer loop is responsible for z dimension +# processMaskData( testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)], zIter, resShmem,locArr) +# end +# sync_threads() + +# 
fillGlobalFromShmem(testArrOut,resShmem) + +# return +# end +# @cuda threads=(32,32) blocks=1 testKernprocessMaskDataB(testArrIn,testArrOut) +# end +# CUDA.@profile wrapForProfile() + + + +# CUDA.reclaim()# just to destroy from gpu our dummy data + +# end # processMaskData + + diff --git a/test/distanceMetr/HousdorffTest/forDebugSeparate.jl b/test/distanceMetr/HousdorffTest/forDebugSeparate.jl new file mode 100644 index 0000000..25cb2e4 --- /dev/null +++ b/test/distanceMetr/HousdorffTest/forDebugSeparate.jl @@ -0,0 +1,58 @@ +using Test, Revise +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\utils\\CUDAGpuUtils.jl") +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\distanceMetrics\\Housdorff\\mainHouseDorffKernel\\HFUtils.jl") +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\src\\distanceMetrics\\Housdorff\\mainHouseDorffKernel\\ProcessMainData.jl") +includet("C:\\GitHub\\GitHub\\NuclearMedEval\\test\\GPUtestUtils.jl") + +using Main.HFUtils +using Main.CUDAGpuUtils,BenchmarkTools , CUDA, StaticArrays + +using Main.HFUtils, Main.ProcessMainData,CUDA,Main.CUDAGpuUtils,StaticArrays +using Main.CUDAGpuUtils,BenchmarkTools , CUDA, StaticArrays + + + + + + + +testArrIn = CUDA.ones(Bool,32,32,32); +referenceArray= CUDA.ones(Bool,32,32,32); +resArray = CUDA.zeros(UInt16,32,32,32); +resArraysCounter=CUDA.ones(UInt32,1); +isMaskFull = Ref(false) +isMaskEmpty = Ref(false) +locArr = Ref(Int32(0)) + + + + +function testprocessMaskData(testArrInn::CuDeviceArray{Bool, 3, 1},resArray,referenceArray,resArraysCounter,isMaskFull::CUDA.CuRefValue{Bool},isMaskEmpty::CUDA.CuRefValue{Bool},locArr) +blockBeginingX,blockBeginingY,blockBeginingZ =UInt8(0),UInt8(0),UInt8(0) +resShmem = @cuStaticSharedMem(Bool,(34,34,34))#+2 in order to get the one padding +#locArr = Int32(0) +#locArr.x |= true << UInt8(2) + +for zIter::UInt8 in UInt8(1):UInt8(32)# most outer loop is responsible for z dimension + locBool::Bool = @inbounds testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)] + locArr.x|= locBool << zIter + #locArr[] |= locBool << UInt8(zIter) + ProcessMainData.processMaskData( locBool, zIter, resShmem) +end#for + +sync_threads() #we should have in resShmem what we need + +# @unroll for zIter::UInt8 in UInt8(1):UInt8(32) # most outer loop is responsible for z dimension - importnant in this loop we ignore padding we will deal with it separately +# local locBoolRegister::Bool = (locArr[]>>zIter & UInt32(1))==UInt32(1) +# local locBoolShmem::Bool = resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1] +# ProcessMainData.validataDataFirstPass(locBoolRegister,locBoolShmem,resShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,testArrInn, referenceArray,resArray,resArraysCounter,zIter) +# #CUDA.unsafe_free!(locBoolRegister) +# # CUDA.unsafe_free!(locBoolShmem) +# end#for + +return + +end#testprocessMaskData + +@cuda threads=(32,32) blocks=1 testprocessMaskData(testArrIn,resArray,referenceArray,resArraysCounter,isMaskFull,isMaskEmpty,locArr) +
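
The locArr register pattern used throughout this patch (locArr |= locBool << zIter, read back as (locArr >> zIter) & UInt32(1)) packs one boolean per z slice into a single UInt32. Below is a minimal CPU sketch of that packing with illustrative helper names (packColumn, unpackBit are not part of the package); it shifts by zIter-1 so that all 32 slices fit inside the 32 available bits.

function packColumn(col::AbstractVector{Bool})::UInt32
    packed = UInt32(0)
    for zIter in 1:32
        # the kernel shifts by zIter itself; shifting by zIter-1 here maps
        # slices 1..32 onto bits 0..31 of the UInt32
        packed |= UInt32(col[zIter]) << (zIter - 1)
    end
    return packed
end

unpackBit(packed::UInt32, zIter::Integer)::Bool = ((packed >> (zIter - 1)) & UInt32(1)) == UInt32(1)

col = falses(32); col[5] = true; col[32] = true
packed = packColumn(col)
@assert unpackBit(packed, 5) && unpackBit(packed, 32) && !unpackBit(packed, 6)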
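
processMaskData writes the six face neighbours of every true voxel into a 34x34x34 shared-memory array, where the one-voxel border absorbs writes coming from threads at the block edge. A plain-Julia CPU sketch of the same dilation, assuming the index convention of the kernel (padded centre at x+1, y+1, zIter+1); dilate6! is an illustrative name, not part of the package.

function dilate6!(res::Array{Bool,3}, mask::Array{Bool,3})
    @assert size(mask) == (32, 32, 32) && size(res) == (34, 34, 34)
    for z in 1:32, y in 1:32, x in 1:32
        if mask[x, y, z]
            res[x+1, y+1, z  ] = true   # up    (z-1 neighbour in padded coordinates)
            res[x+1, y+1, z+2] = true   # down  (z+1 neighbour)
            res[x,   y+1, z+1] = true   # left
            res[x+2, y+1, z+1] = true   # right
            res[x+1, y+2, z+1] = true   # front
            res[x+1, y,   z+1] = true   # back
        end
    end
    return res
end

mask = fill(false, 32, 32, 32); mask[5, 5, 5] = true
res = dilate6!(fill(false, 34, 34, 34), mask)
sum(res) == 6   # the six face neighbours of (5,5,5), shifted by the padding offset,
                # matching the testArrOut expectations in ProcessMainDataTest.jl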
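
atomicallyAddOneUint and resArraysCounter rely on a one-element UInt32 device array that every thread bumps atomically. The following is a hedged sketch of that pattern under stated assumptions: countTrues! is a made-up kernel name, and it uses CUDA.atomic_add! rather than the atomic_inc! called in the patch, since CUDA's atomicInc wraps back to zero once the counter reaches the supplied threshold, which is rarely what a result counter wants.

using CUDA

function countTrues!(counter, mask)
    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if idx <= length(mask) && mask[idx]
        # one atomic add per true voxel found by this thread
        CUDA.atomic_add!(pointer(counter), UInt32(1))
    end
    return
end

mask = CuArray(rand(Bool, 1024))
counter = CUDA.zeros(UInt32, 1)
@cuda threads = 256 blocks = 4 countTrues!(counter, mask)
Array(counter)[1] == count(Array(mask))   # counter should equal the number of trues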
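
The isMaskFull / isMaskEmpty bookkeeping inlined into validataDataFirstPass (isMaskFull.x = locValOrShmem & isMaskFull.x, isMaskEmpty.x = ~locValOrShmem & isMaskEmpty.x) amounts to two running ANDs over the voxels a thread sees. A tiny CPU sketch of that invariant, with an illustrative helper name.

function foldFullEmpty(voxels::AbstractVector{Bool})
    isMaskFull, isMaskEmpty = true, true
    for v in voxels
        isMaskFull  =  v & isMaskFull    # stays true only if every voxel was true
        isMaskEmpty = !v & isMaskEmpty   # stays true only if every voxel was false
    end
    return isMaskFull, isMaskEmpty
end

foldFullEmpty(trues(4))        # (true, false)  - full block
foldFullEmpty(falses(4))       # (false, true)  - empty block
foldFullEmpty([true, false])   # (false, false) - neither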