24 changes: 24 additions & 0 deletions 13_BitonicSort/CMakeLists.txt
@@ -0,0 +1,24 @@
include(common RESULT_VARIABLE RES)
if(NOT RES)
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

if(NBL_EMBED_BUILTIN_RESOURCES)
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
set(RESOURCE_DIR "app_resources")

get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
endforeach()

ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()
112 changes: 112 additions & 0 deletions 13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
@@ -0,0 +1,112 @@
#include "nbl/builtin/hlsl/bda/bda_accessor.hlsl"

struct BitonicPushData
{
uint64_t inputKeyAddress;
uint64_t inputValueAddress;
uint64_t outputKeyAddress;
uint64_t outputValueAddress;
uint32_t dataElementCount;
};
Contributor:

Replace this with an #include "common.hlsl", otherwise you define the same struct twice.


why are you not including common.hlsl and having duplicate code?
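
A minimal sketch of the fix both comments ask for (common.hlsl is added later in this same PR, assuming it lands on the include path):

#include "common.hlsl" // provides BitonicPushData once instead of redefining it here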


using namespace nbl::hlsl;

[[vk::push_constant]] BitonicPushData pushData;

using DataPtr = bda::__ptr<uint32_t>;
using DataAccessor = BdaAccessor<uint32_t>;

groupshared uint32_t sharedKeys[ElementCount];


Good SoA instead of AoS, no bank conflicts

Member @devshgraphicsprogramming, Sep 1, 2025:

Your shared memory size is fixed, which means you shouldn't have a dataElementCount push constant.

Unless its purpose is to allow you to sort fewer elements than ElementCount, but then you'd still need to initialize all of the sharedKeys with the highest possible value so you don't end up with garbage getting into the sort.
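
A hedged sketch of that padding, if the partial-sort path is kept (0xFFFFFFFFu as the sentinel assumes an ascending sort of uint32_t keys):

for (uint32_t i = dataSize + threadId; i < ElementCount; i += WorkgroupSize)
{
    sharedKeys[i] = 0xFFFFFFFFu; // max key: sorts to the end, never mixes with real data
}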

groupshared uint32_t sharedValues[ElementCount];

it's cheaper to move the Key & an index to the Value instead of the Value itself, because the Value might be quite fat (also, for a workgroup-sized sort you need only a uint16_t index)

Member @devshgraphicsprogramming, Sep 3, 2025:

Also because the value never has to get accessed until the end. So you can initialize sharedValueIndex with an IOTA sequence (0,1,2,...) and not perform any value reads until the end, where you do:

value_t tmp;
inputValues.get(sharedValueIndex[i],tmp);
inputValues.set(i,tmp);


hmm, actually you wouldn't initialize sharedValueIndex with IOTA but with the result of the subgroup::BitonicSort


[numthreads(WorkgroupSize, 1, 1)]

Some static asserts about the workgroup size dividing the element count evenly, and the element count being a power of two, would be nice.
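
A minimal sketch of those checks, assuming DXC's HLSL 2021 _Static_assert:

_Static_assert((ElementCount & (ElementCount - 1)) == 0, "ElementCount must be a power of two");
_Static_assert(ElementCount % WorkgroupSize == 0, "WorkgroupSize must divide ElementCount evenly");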

[shader("compute")]
void main(uint32_t3 dispatchId : SV_DispatchThreadID, uint32_t3 localId : SV_GroupThreadID)

Use nbl::hlsl::glsl:: functions instead of the weird HLSL : SV_* semantics; see other examples with gl_LocalInvocationIndex.
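
A sketch of the swap, assuming the glsl_compat helpers used by the other examples:

const uint32_t threadId = nbl::hlsl::glsl::gl_LocalInvocationIndex(); // replaces the SV_GroupThreadID semantic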

{
const uint32_t threadId = localId.x;
const uint32_t dataSize = pushData.dataElementCount;
Member @devshgraphicsprogramming, Sep 3, 2025:

I'd make that a constant and part of the Config, because the workgroup size needs to be hardcoded and, as I've mentioned, the optimal amount of shared memory to use per invocation is <=8 dwords (32 bytes).


DataAccessor inputKeys = DataAccessor::create(DataPtr::create(pushData.inputKeyAddress));
DataAccessor inputValues = DataAccessor::create(DataPtr::create(pushData.inputValueAddress));

for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
Contributor:

you're already compiling the shader with ElementCount known as a compile-time constant. Instead of pushing it as a push constant, you can check i < ElementCount. This lets you add an [unroll] attribute before the loop to hint to the compiler that the number of iterations is known at compile time.
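
A sketch of that change (a later comment below refines the loop shape so it actually unrolls):

[unroll]
for (uint32_t i = threadId; i < ElementCount; i += WorkgroupSize)
{
    inputKeys.get(i, sharedKeys[i]);
    inputValues.get(i, sharedValues[i]);
}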

{
inputKeys.get(i, sharedKeys[i]);
inputValues.get(i, sharedValues[i]);


Very nice, but why isn't this wrapped up into a statically polymorphic struct like the FFT and workgroup prefix sum, so we can actually reuse the code? You know, with shared memory accessors etc.

}

// Synchronize all threads after loading
GroupMemoryBarrierWithGroupSync();
Member @devshgraphicsprogramming, Sep 1, 2025:

Use the glsl or SPIR-V barrier, we dislike HLSL intrinsics.
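
The glsl_compat equivalent, as a sketch:

nbl::hlsl::glsl::barrier(); // workgroup control + memory barrier, replaces GroupMemoryBarrierWithGroupSync()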



for (uint32_t stage = 0; stage < Log2ElementCount; stage++)


Where does Log2ElementCount come from? I have a feeling that you shouldn't have a data size push constant then.

{
for (uint32_t pass = 0; pass <= stage; pass++)
{
const uint32_t compareDistance = 1 << (stage - pass);
Contributor:

These two loops for stage and pass can be unrolled as well. However, this can affect occupancy since it's unrolling nested loops; I'd profile with Nsight Graphics to check whether unrolling is worth it here.


for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
Contributor:

Same as before, you can unroll this one

{


cool use of virtual invocations


one slight nitpick: this won't [unroll] even were dataSize a compile-time constant.

To get a loop that unrolls you need to do:

for (uint32_t baseIndex = 0; baseIndex < dataSize; baseIndex += WorkgroupSize)
{
    const uint32_t i = threadId + baseIndex;

compilers are dumb

const uint32_t partnerId = i ^ compareDistance;

if (partnerId >= dataSize)
Contributor:

You can also try to prove that this never happens for power-of-two-sized arrays.

continue;

this causes a 50% efficiency drop when compareDistance<SubgroupSize because only half the wave is active; do your for loop from 0 to (dataSize>>1) (half as many iterations)

and fix up your index calculations by inserting a 0 into the i bit pattern for the lowerIndex and a 1 for the higher, see the FFT code for that getting done
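
A hedged sketch of that indexing (names are illustrative, not from the PR):

for (uint32_t t = threadId; t < (dataSize >> 1); t += WorkgroupSize)
{
    // split t's bits around the compare bit: insert a 0 for the lower index, a 1 for the higher
    const uint32_t lowerIndex = ((t & ~(compareDistance - 1)) << 1) | (t & (compareDistance - 1));
    const uint32_t higherIndex = lowerIndex | compareDistance;
    // compare-and-swap sharedKeys[lowerIndex] / sharedKeys[higherIndex]; every lane does useful work
}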


btw you might have some issues with making this work for a single subgroup, @Fletterio may have some ideas how to help you there (see his FFT blogpost, about trading halves of FFT with subgroup shuffles while only keeping one invocation active).

The main problem is that with a 32-sized subgroup you need to treat 64 items.

Ideally you'd want one invocation to get 2 keys and value indices, but keep them rearranged in such a way that always 32 invocations are active.

Meaning that with step sizes of 1,2,4,8,... subgroupSize/2 you need to cleverly "reuse" the "partner ID" invocation to do the work of the "would be" second subgroup.


const uint32_t waveSize = WaveGetLaneCount();

that's a constant which should get hoisted out; btw if you make it a template uint supplied from a Config, the compiler will unroll certain loops itself (instead of counting on the SPIR-V optimizer in the driver to do it), that's why we do it in the FFT code
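
A sketch of the hoist, using the glsl_compat helper (the Config-template variant would make this a compile-time constant instead):

const uint32_t waveSize = nbl::hlsl::glsl::gl_SubgroupSize(); // hoisted above the stage/pass loops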

const uint32_t myWaveId = i / waveSize;
const uint32_t partnerWaveId = partnerId / waveSize;
const bool sameWave = (myWaveId == partnerWaveId);

integer division is expensive; it would have been better to do just bool((i^partnerId)&uint16_t(-SubgroupSize)) or just compareDistance>=SubgroupSize


uint32_t myKey, myValue, partnerKey, partnerValue;
[branch]
Contributor:

This branch feels weird because it's, in principle, not subgroup-uniform. For compareDistance < waveSize, i ^ compareDistance should ALWAYS yield sameWave = true, while for compareDistance >= waveSize it should ALWAYS yield sameWave = false, given how your threads take on elements of the array. Try to come up with a proof for this; I might be wrong, but it feels that way to me. This in turn lets you predicate the branch on just compareDistance < waveSize and yields neater code, since it's more obviously subgroup-uniform branching.
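
The predicate as a one-line sketch (the name is illustrative):

const bool useSubgroupPath = compareDistance < waveSize; // uniform across the wave, so [branch] is safe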

if (sameWave && compareDistance < waveSize)
{
// WAVE INTRINSIC
myKey = sharedKeys[i];
myValue = sharedValues[i];

there's no need to read/write shared memory during the subgroup stages, you could keep the value in a register the whole time


const uint32_t partnerLane = partnerId % waveSize;
Contributor:

Modulo operations (and integer divisions) are really expensive, especially since the compiler might not realize that WaveGetLaneCount() is a power of two (I'm not sure whether this gets compiled to a shader constant in practice).

Prefer to write partnerId & (waveSize - 1), which is equivalent to the modulo operator when the divisor is a power of two.

partnerKey = WaveReadLaneAt(myKey, partnerLane);
Contributor:

Or, since we know partnerId = i ^ compareDistance, you can use a subgroupShuffleXor (it's in glsl_compat/subgroup_shuffle.hlsl)

partnerValue = WaveReadLaneAt(myValue, partnerLane);

Use our SPIR-V intrinsics from the spirv or glsl namespace.

You may spot that there's a ShuffleXor which lets you simply read the lane at gl_SubgroupInvocationIndex^mask.
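
A hedged sketch combining both suggestions (the exact glsl_compat template signature is an assumption):

partnerKey = nbl::hlsl::glsl::subgroupShuffleXor<uint32_t>(myKey, compareDistance);
partnerValue = nbl::hlsl::glsl::subgroupShuffleXor<uint32_t>(myValue, compareDistance);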

}
else
{
// SHARED MEM
myKey = sharedKeys[i];
myValue = sharedValues[i];
partnerKey = sharedKeys[partnerId];
partnerValue = sharedValues[partnerId];
}

the wholly-subgroup stages (stage<=subgroupSizeLog2) need separation from the stages where the compareDistance can be big. That's what I want wrapped up in subgroup::BitonicSort

Member @devshgraphicsprogramming, Sep 3, 2025:

btw the workgroup stages would still use the subgroup path to finish off each stage, because compareDistance eventually decreases to subgroup size


const uint32_t sequenceSize = 1 << (stage + 1);
const uint32_t sequenceIndex = i / sequenceSize;
const bool sequenceAscending = (sequenceIndex % 2) == 0;
const bool ascending = true;
const bool finalDirection = sequenceAscending == ascending;

const bool swap = (myKey > partnerKey) == finalDirection;

this can be simply expressed as bool(i & (0x1u << (stage+1))), all that code is just checking for one bit being set in i
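
Spelled out (assuming the overall ascending order that the hardcoded ascending = true implies):

const bool descending = bool(i & (0x1u << (stage + 1))); // the bitonic sequence containing i sorts downward
const bool swap = (myKey > partnerKey) != descending;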


// WORKGROUP COORDINATION: Only lower-indexed element writes both
Contributor:

This yields 4 write operations to shared memory. If you tweak the swap logic slightly (condition the key comparison on whether i < partnerId), you can make each thread write (if a swap is necessary) to its own position in shared memory, reducing this to two write operations. Also, given how your threads access shared memory, there are no bank conflicts, so you're guaranteed this will result in only two write operations per thread.
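
A hedged sketch of that tweak (illustrative, not from the PR):

const bool lower = i < partnerId;
// each slot decides for itself whether the partner's element belongs here
const bool takePartner = lower ? ((myKey > partnerKey) == finalDirection)
                               : ((partnerKey > myKey) == finalDirection);
sharedKeys[i] = takePartner ? partnerKey : myKey;
sharedValues[i] = takePartner ? partnerValue : myValue;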

if (i < partnerId && swap)
{
sharedKeys[i] = partnerKey;
sharedKeys[partnerId] = myKey;
sharedValues[i] = partnerValue;
sharedValues[partnerId] = myValue;
}

the key is to not use 2 invocations for 2 inputs, but 1 invocation for 2 inputs, throughout the function

}

GroupMemoryBarrierWithGroupSync();
Contributor:

If compareDistance < waveSize, these barriers serve no purpose, you are overbarriering. In fact writing to shared memory at the end of every such iteration is also pointless.

The proper way to avoid this overbarriering is to branch behaviour based on whether compareDistance < waveSize or not. All steps with compareDistance < waveSize can be done in one go. Threads shuffle their elements around using subgroup intrinsics (shuffleXor, namely), once per every compareDistance value less than the starting one, and then write back to shared memory only once. This is what we do with the FFT, although I don't expect you to infer that from the code since it can be a bit obscure. @ me on discord if you want to figure out the way we handle this with the FFT, I can explain better there since I need to draw diagrams and write a bunch more
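
A structural sketch only, heavily simplified (the FFT scheme referenced above is more involved):

if (compareDistance < waveSize)
{
    // finish every remaining pass of this stage in registers, with no barriers
    for (uint32_t d = compareDistance; d != 0; d >>= 1)
    {
        const uint32_t shuffledKey = glsl::subgroupShuffleXor<uint32_t>(myKey, d);
        // compare-exchange myKey/myValue against the shuffled pair here
    }
    // then a single shared-memory write-back and one barrier for the whole phase
}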

}
}


DataAccessor outputKeys = DataAccessor::create(DataPtr::create(pushData.outputKeyAddress));
DataAccessor outputValues = DataAccessor::create(DataPtr::create(pushData.outputValueAddress));

Glad it works, but now the real work begins.

This needs to be split into subgroup::BitonicSort<Config> and workgroup::BitonicSort<Config>, with the latter using the former.
See the FFT and PrefixSum/Scan (reduce, inclusive_scan) code for inspiration of how to lay this out. I think they don't use our weird C++20/HLSL concepts but you should.

In the Config we need a SubgroupSize, KeyAccessor, ValueAccessor, Key Comparator (for a default see our hlsl::less binop), and obviously typedefs for value_t and key_t

Only the workgroup config needs a ScratchKeyAccessor (what will mediate your smem accesses) and a ScratchValueAccessor.

The reason is that this lets us choose to sort values from/to BDA, SSBO, Smem, Images, and use various memory as scratch (different offsets of shared, reuse shared mem allocations).

And most importantly, this lets us choose whether to load the Keys and move them around in scratch or move their indices and load them from main memory every time (useful if the Key is super fat) without touching the Bitonic sort code, the user would just provide an appropriate scratch-key accessor and comparator which both translate the argument to/from index.
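
A hedged sketch of what such a Config could look like (names modeled on the FFT/scan Configs; none of this exists in the PR yet):

template<
    uint16_t _SubgroupSizeLog2,
    typename _KeyAccessor,
    typename _ValueAccessor,
    typename _Comparator = nbl::hlsl::less<uint32_t>
>
struct Config
{
    using key_t = uint32_t;
    using value_t = uint32_t;
    using key_accessor_t = _KeyAccessor;
    using value_accessor_t = _ValueAccessor;
    using comparator_t = _Comparator;
    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2;
};
// the workgroup-level Config would additionally carry ScratchKeyAccessor / ScratchValueAccessor typedefs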

Items Per Invocation should also come from the Config.

Btw, with a uint16_t value index and 1 invocation always processing a pair of items, this means a baseline shared memory consumption of 8 bytes per invocation if the key is also either 2 bytes or an index of a key.

Realistically the max virtual thread repetition is 4; in your case, with an actual 32-bit key used directly, it's 3, but with the optimal workgroup size of 512 on NV (to keep the PoT multiple) it becomes 2.

As you can see, the point at which one should consider indexing keys instead of using them outright is 14-byte keys.

Member @devshgraphicsprogramming, Sep 3, 2025:

btw when doing subgroup bitonics it's also possible to have multiple items per invocation (more than 2), so it's useful to bitonic sort in registers first (see the subgroup prefix sum and FFT doing stuff locally within an invocation first).

Btw barriers are expensive, so make sure that the repeated items sit at the most minor level of indexing, so that the subgroup has to access

Key keys[2<<ElementsPerInvocationLog2];

this way the last ElementsPerInvocationLog2 passes in a stage execute without any barriers (subgroup or workgroup).


for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
{
outputKeys.set(i, sharedKeys[i]);
outputValues.set(i, sharedValues[i]);
}
}
Contributor:

As a closing remark, it should be possible to run this algo on arrays bigger than the maximum shared memory allowed per workgroup, using what we call virtual threads

17 changes: 17 additions & 0 deletions 13_BitonicSort/app_resources/common.hlsl
@@ -0,0 +1,17 @@
// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _BITONIC_SORT_COMMON_INCLUDED_
#define _BITONIC_SORT_COMMON_INCLUDED_

struct BitonicPushData
{

uint64_t inputKeyAddress;
uint64_t inputValueAddress;
uint64_t outputKeyAddress;
uint64_t outputValueAddress;
uint32_t dataElementCount;
};

#endif
28 changes: 28 additions & 0 deletions 13_BitonicSort/config.json.template
@@ -0,0 +1,28 @@
{
"enableParallelBuild": true,
"threadsPerBuildProcess" : 2,
"isExecuted": false,
"scriptPath": "",
"cmake": {
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
"buildModes": [],
"requiredOptions": []
},
"profiles": [
{
"backend": "vulkan", // should be none
"platform": "windows",
"buildModes": [],
"runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example
"gpuArchitectures": []
}
],
"dependencies": [],
"data": [
{
"dependencies": [],
"command": [""],
"outputs": []
}
]
}