Commit

prototyping
jakubMitura14 committed Sep 26, 2021
1 parent a50e48d commit a52ab90
Showing 10 changed files with 707 additions and 87 deletions.
1 change: 1 addition & 0 deletions .vscode/launch.json
@@ -4,6 +4,7 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [

{
"type": "julia",
"request": "launch",
4 changes: 2 additions & 2 deletions Manifest.toml
@@ -440,9 +440,9 @@ version = "2.1.2"

[[MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7"
git-tree-sha1 = "5a5bc6bf062f0f95e62d0fe0a2d99699fed82dd9"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.7"
version = "0.5.8"

[[Markdown]]
deps = ["Base64"]
1 change: 1 addition & 0 deletions Project.toml
@@ -11,6 +11,7 @@ CxxWrap = "1f15a43c-97ca-5a2a-ae31-89f07a497df4"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
InfoZIP = "f4508453-b816-52ab-a864-26fc7f6211fc"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
MedEye3d = "48a1af7b-3279-4eeb-8f2b-7ca229bb51b1"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
131 changes: 70 additions & 61 deletions src/distanceMetrics/Housdorff/mainHouseDorffKernel/ProcessMainData.jl
@@ -5,8 +5,8 @@ Loads and does the main processing of data in arrays of interest (padding of shmem
"""
module ProcessMainData

export executeDataIterFirstPass,executeDataIterOtherPasses
using StaticArrays, Main.CUDAGpuUtils, Main.HFUtils, CUDA
export executeDataIterFirstPass,executeDataIterOtherPasses,processMaskData


"""
@@ -22,27 +22,41 @@ resArray - 3-dimensional array where we put results
"""
function executeDataIterFirstPass(analyzedArr, refAray,blockBeginingX,blockBeginingY,blockBeginingZ,isMaskFull,isMaskEmpty,resShmem,locArr,resArray)
@unroll for zIter in UInt16(1):32 # the outermost loop is responsible for the z dimension
processMaskData( analyzedArr[x,y,z+zIter], zIter, resShmem,locArr)
end#for
sync_threads() #we should have in resShmem what we need
@unroll for zIter in UInt16(1):32 # the outermost loop is responsible for the z dimension - important: in this loop we ignore padding, we will deal with it separately
validataDataFirstPass(locArr[32],resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1],resShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,analyzedArr, refAray,resArray,resArraysCounter)
end#for
end#executeDataIter

"""
specializes executeDataIterFirstPass: it does not consider the possibility of the block being empty
"""
function executeDataIterOtherPasses(analyzedArr, refAray,iterationNumber ,blockBeginingX,blockBeginingY,blockBeginingZ,isMaskFull,resShmem,locArr,resArray,resArraysCounter)
@unroll for zIter in UInt16(1):32 # the outermost loop is responsible for the z dimension
processMaskData( analyzedArr[x,y,z+zIter], zIter, resShmem,locArr )
local locBool = testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]
locArr|= locBool << zIter
processMaskData( locBool, zIter, resShmem)
# note: locBool is a plain Bool held in registers, so it does not need to be freed
end#for
sync_threads() #we should have in resShmem what we need
@unroll for zIter in UInt16(1):32 # the outermost loop is responsible for the z dimension - important: in this loop we ignore padding, we will deal with it separately
validataData(locArr[32],resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1],32,resShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,analyzedArr, refAray,resArray,resArraysCounter)
local locBoolRegister = (locArr>>zIter & UInt32(1))==UInt32(1)
local locBoolShmem = resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1]
validataDataFirstPass(locBoolRegister,locBoolShmem,resShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,analyzedArr, refAray,resArray,resArraysCounter,zIter)
end#for
end#executeDataIter

# """
# specializes executeDataIterFirstPass: it does not consider the possibility of the block being empty
# """
# function executeDataIterOtherPasses(analyzedArr, refAray,iterationNumber ,blockBeginingX,blockBeginingY,blockBeginingZ,isMaskFull,resShmem,locArr,resArray,resArraysCounter)
# @unroll for zIter in UInt16(1):32 # the outermost loop is responsible for the z dimension
# local locBool = testArrInn[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]
# locArr|= locBool << zIter
# processMaskData( locBool, zIter, resShmem)
# CUDA.unsafe_free!(locBool)
# end#for
# sync_threads() #we should have in resShmem what we need
# @unroll for zIter in UInt16(1):32 # the outermost loop is responsible for the z dimension - important: in this loop we ignore padding, we will deal with it separately
# local locBoolRegister = (locArr>>zIter & UInt32(1))==1
# local locBoolShmem = resShmem[threadIdxX()+1,threadIdxY()+1,zIter+1]
# validataDataOtherPass(locBoolRegister,locBoolShmem,isMaskFull,isMaskEmpty,blockBeginingX,blockBeginingY,blockBeginingZ,analyzedArr, refAray,resArray,resArraysCounter)
# CUDA.unsafe_free!(locBool)


# end#for
# end#executeDataIter

#numb>>1 & UInt32(1)



Expand All @@ -54,25 +68,30 @@ uploaded data from shared memory in amask of intrest gets processed in this func
- also we need to make sure that in corner cases we are getting to correct spot
"""
function processMaskData(maskBool::Bool
,zIter::UInt16
,zIter::UInt8
,resShmem
,locArr )
) #::CUDA.CuRefValue{Int32}
# save it to registers - we will need it later
locArr[zIter]=maskBool
#locArr[zIter]=maskBool
#now we are saving results everywhere we are interested in, i.e. all around but without diagonals (we use the supremum norm instead of the Euclidean one)
#locArr.x|= maskBool << zIter
if(maskBool)
resShmem[threadIdxX()+1,threadIdxY()+1,zIter]=true #up
resShmem[threadIdxX()+1,threadIdxY()+1,zIter+2]=true #down
@inbounds resShmem[threadIdxX()+1,threadIdxY()+1,zIter]=true #up
@inbounds resShmem[threadIdxX()+1,threadIdxY()+1,zIter+2]=true #down

resShmem[threadIdxX(),threadIdxY()+1,zIter+1]=true #left
resShmem[threadIdxX()+2,threadIdxY()+1,zIter+1]=true #right
@inbounds resShmem[threadIdxX(),threadIdxY()+1,zIter+1]=true #left
@inbounds resShmem[threadIdxX()+2,threadIdxY()+1,zIter+1]=true #right

resShmem[threadIdxX()+1,threadIdxY()+2,zIter+1]=true #front
resShmem[threadIdxX()+1,threadIdxY(),zIter+1]=true #back
@inbounds resShmem[threadIdxX()+1,threadIdxY()+2,zIter+1]=true #front
@inbounds resShmem[threadIdxX()+1,threadIdxY(),zIter+1]=true #back
end#if

end#processMaskData
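
For intuition, a minimal CPU sketch of the same 6-neighbour (supremum-norm, no diagonals) dilatation step; dilate6! is a hypothetical helper, not part of this module, and the array is assumed to be padded so the ±1 indices stay in bounds:

function dilate6!(res::AbstractArray{Bool,3}, x::Int, y::Int, z::Int)
    res[x, y, z - 1] = true # up
    res[x, y, z + 1] = true # down
    res[x - 1, y, z] = true # left
    res[x + 1, y, z] = true # right
    res[x, y + 1, z] = true # front
    res[x, y - 1, z] = true # back
end

res = falses(5, 5, 5)
dilate6!(res, 3, 3, 3)
@assert count(res) == 6 # exactly the six face neighbours are set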

function myIncreaseBitt(maskBool::Bool,zIter::UInt8, locArr::CUDA.CuRefValue{Int32} )::Bool
locArr.x|= maskBool << zIter
return true
end
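
A quick host-side sketch of the bit-packing round trip that myIncreaseBitt and the (locArr>>zIter & UInt32(1))==UInt32(1) test rely on; a plain Ref stands in for the register value here:

locArr = Ref(Int32(0))
mask = (true, false, true, false)
for (zIter, maskBool) in enumerate(mask)
    locArr[] |= Int32(maskBool) << zIter # pack: set bit zIter when the voxel is true
end
# unpack: mirrors the (locArr >> zIter) & 1 == 1 test from the kernel loop
unpacked = [(locArr[] >> zIter) & Int32(1) == Int32(1) for zIter in 1:4]
@assert Tuple(unpacked) == mask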


"""
-so we uploaded all the data that we consider new - around voxels that are "true" - but we can be sure that some of those were already true earlier
Expand All @@ -88,54 +107,56 @@ locVal - value from registers
shmemVal - value associated with this thread from shared memory - where we marked neighbours ...
resShmem - shared memory with our preliminary results
isMaskFull, isMaskEmpty - register values needed to specify whether the block is full, empty, or neither
x,y,z - needed to access data from the main data array in global memory
blockBeginingX,blockBeginingY,blockBeginingZ - coordinates where our block begins - will be used as offsets by our threads
masktoUpdate - the mask that we analyzed; we now write dilatation data into it
maskToCompare - the other mask that we need to check before we write to the result array
iterationNumber - the iteration we are currently in - the bigger it is, the higher the Hausdorff distance
resArray
"""
function validataDataFirstPass(locVal::Bool
,shmemVal::Bool
,resShmem
,isMaskFull::MVector{1, Bool}
,isMaskEmpty::MVector{1, Bool}
,x::UInt16
,y::UInt16
,z::UInt16
,isMaskFull#::CUDA.CuRefValue{Bool}
,isMaskEmpty#::CUDA.CuRefValue{Bool}
,blockBeginingX::UInt8
,blockBeginingY::UInt8
,blockBeginingZ::UInt8
,maskToCompare
,masktoUpdate
,resArray
,resArraysCounter)
,resArraysCounter
,zIter::UInt8)::Bool
#when both this one and the previous one are true it will stay true
setIsFullOrEmpty!((locVal | shmemVal),isMaskFull,isMaskEmpty )
locValOrShmem = (locVal | shmemVal)
isMaskFull.x= locValOrShmem & isMaskFull.x
isMaskEmpty.x = ~locValOrShmem & isMaskEmpty.x

if(!locVal && shmemVal)
# setting value in global memory
masktoUpdate[x,y,z+32]= true
@inbounds masktoUpdate[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]= true
# if we are here, we have some voxel that was false in the primary mask and is now becoming true - if it is additionally true in the reference mask we need to add it to the result
if(maskToCompare[x,y,z+32])
resArray[x,y,z+32]=UInt16(1)
CUDA.atomic_inc!(pointer(resArraysCounter), UInt16(1))
if(@inbounds(maskToCompare[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]))
@inbounds resArray[(blockBeginingX+threadIdxX()),(blockBeginingY +threadIdxY()),(blockBeginingZ+zIter)]=UInt16(1)

atomicallyAddOneUint(resArraysCounter)
end#if
end#if

return true
end
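
The isMaskFull/isMaskEmpty update above reduces to two running bit-ANDs; a host-side sketch, with plain Refs standing in for the kernel's register values:

isMaskFull, isMaskEmpty = Ref(true), Ref(true)
for locValOrShmem in (true, false, true) # values one thread sees along z
    isMaskFull[] = locValOrShmem & isMaskFull[] # stays full only if every voxel is true
    isMaskEmpty[] = ~locValOrShmem & isMaskEmpty[] # stays empty only if every voxel is false
end
@assert !isMaskFull[] && !isMaskEmpty[] # mixed block - neither full nor empty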

"""
specializes validataDataFirstPass, ignoring the case of a potentially empty mask
iterationNumber - the iteration we are currently in - the bigger it is, the higher the Hausdorff distance
"""
function validataDataOtherPass(locVal::Bool
,shmemVal::Bool
,resShmem
,isMaskFull::MVector{1, Bool}
,isMaskEmpty::MVector{1, Bool}
,x::UInt16
,y::UInt16
,z::UInt16
,blockBeginingX,blockBeginingY,blockBeginingZ
,maskToCompare
,masktoUpdate
,resArray
,iterationNumber::UInt16
,resArraysCounter)
,resArraysCounter
,zIter)
#when both this one and the previous one are true it will stay true
setIsFull!((locVal | shmemVal),isMaskEmpty )
if(!locVal && shmemVal)
Expand All @@ -153,18 +174,6 @@ end



"""
set the isMaskFull and isMaskEmpty
locVal - value of the voxel from the registers (what was loaded from global memory and not modified)
shmemVal - what was loaded into shared memory - so after dilatation
yet we pass in locValOrShmem (locVal | shmemVal)
"""
function setIsFullOrEmpty!(locValOrShmem::Bool
,isMaskFull::MVector{1, Bool}
,isMaskEmpty::MVector{1, Bool} )
isMaskFull[1]= locValOrShmem & isMaskFull[1]
isMaskEmpty[1] = ~locValOrShmem & isMaskEmpty[1]
end#setIsFullOrEmpty


function setIsFull!(locValOrShmem::Bool
Expand Down
24 changes: 23 additions & 1 deletion src/utils/CUDAGpuUtils.jl
@@ -3,9 +3,20 @@ module CUDAGpuUtils

using CUDA, StaticArrays

export clearLocArrdefineIndicies,computeBlocksFromOccupancy,reduce_warp,getKernelContants,assignWorkToCooperativeBlocks,getMaxBlocksPerMultiproc,reduce_warp_max,reduce_warp_min,reduce_warp_min,reduce_warp_or,reduce_warp_and,blockIdxZ,blockIdxY,blockIdxX,blockDimZ, blockDimY, blockDimX, threadIdxX, threadIdxY, threadIdxZ
export atomicallyAddOneUint,clearLocArrdefineIndicies,computeBlocksFromOccupancy,reduce_warp,getKernelContants,assignWorkToCooperativeBlocks,getMaxBlocksPerMultiproc,reduce_warp_max,reduce_warp_min,reduce_warp_min,reduce_warp_or,reduce_warp_and,blockIdxZ,blockIdxY,blockIdxX,blockDimZ, blockDimY, blockDimX, threadIdxX, threadIdxY, threadIdxZ
export @unroll, @ifX, @ifY, @ifXY



"""
atomically adds 1 to the given length-1 array
the element type needs to be UInt32
"""
function atomicallyAddOneUint(arr)
# atomic_add! matches the UInt32 element type; atomic_inc! has wrap-around semantics and would reset the counter at the given value
CUDA.atomic_add!(pointer(arr), UInt32(1))
end
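
A minimal usage sketch, assuming the counter is a length-1 UInt32 device array (countingKernel is just an illustration, not part of this module):

using CUDA

counter = CUDA.zeros(UInt32, 1)

function countingKernel(counter)
    atomicallyAddOneUint(counter) # every thread bumps the counter once
    return nothing
end

@cuda threads = 32 countingKernel(counter)
@assert Array(counter)[1] == UInt32(32)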


"""
convenience macro that will execute its body only for the given thread id X
"""
@@ -334,4 +345,15 @@ end #assignWorkToCooperativeBlocks
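
The macro definitions themselves are collapsed in this view; as a rough sketch (an assumption, not the file's actual code), such a guard macro could look like:

# hypothetical reconstruction: run ex only on the thread whose x index equals x
macro ifX(x, ex)
    return esc(quote
        if threadIdxX() == $x
            $ex
        end
    end)
end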














end #CUDAGpuUtils
42 changes: 42 additions & 0 deletions test/GPUtestUtils.jl
@@ -0,0 +1,42 @@


function fillGlobalFromShmem(testArrInn,resShmem)
    # copy this thread's full padded column (z spans the whole 34-long padded axis)
    for z in 1:34
        testArrInn[threadIdxX()+1,threadIdxY()+1,z ]=resShmem[threadIdxX()+1,threadIdxY()+1,z ]
    end
    # copy the x=1, x=34, y=1 and y=34 boundary faces of the padded 34x34x34 cube
    testArrInn[1,threadIdxX()+1,threadIdxY()+1]= resShmem[1,threadIdxX()+1,threadIdxY()+1]
    testArrInn[34,threadIdxX()+1,threadIdxY()+1]= resShmem[34,threadIdxX()+1,threadIdxY()+1]
    testArrInn[threadIdxX()+1,1,threadIdxY()+1]= resShmem[threadIdxX()+1,1,threadIdxY()+1]
    testArrInn[threadIdxX()+1,34,threadIdxY()+1]= resShmem[threadIdxX()+1,34,threadIdxY()+1]
end
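
A hedged sketch of how such a helper could be driven from a test kernel; dumpShmemKernel is an illustration, and the @cuStaticSharedMem size is an assumption based on the 34x34x34 padded block used throughout:

function dumpShmemKernel(testArrInn)
    resShmem = @cuStaticSharedMem(Bool, (34, 34, 34))
    # ... the kernel under test fills resShmem here ...
    sync_threads()
    fillGlobalFromShmem(testArrInn, resShmem)
    return nothing
end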


function getIndiciesWithTrue(arr)
    # return the CartesianIndex of every true entry in arr
    indicies = CartesianIndices(arr)
    return filter(ind-> arr[ind] ,indicies)
end
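
For example, on a small host-side array:

arr = falses(3, 3, 3)
arr[1, 2, 3] = true
arr[3, 3, 3] = true
getIndiciesWithTrue(arr) # -> [CartesianIndex(1, 2, 3), CartesianIndex(3, 3, 3)]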


# function fillGlobalFromShmem(testArrInn,resShmem)
# for z in 1:34
# testArrInn[threadIdxX()+1,threadIdxY()+1,z ]=resShmem[threadIdxX()+1,threadIdxY()+1,z ]
# sync_threads()
# end

# for z in 1:32
# testArrInn[threadIdxX()+1,threadIdxY()+2,z+1 ]= resShmem[threadIdxX()+1,threadIdxY()+2,z+1 ]
# sync_threads()
# testArrInn[threadIdxX()+1,threadIdxY(),z+1 ]= resShmem[threadIdxX()+1,threadIdxY()+2,z+1 ]
# sync_threads()
# testArrInn[threadIdxX()+2,threadIdxY()+1,z+1 ]= resShmem[threadIdxX()+2,threadIdxY()+1,z+1 ]
# sync_threads()
# testArrInn[threadIdxX(),threadIdxY()+1,z+1 ]= resShmem[threadIdxX(),threadIdxY()+1,z+1 ]
# sync_threads()
# end

# end
