Skip to content

Commit

Permalink
MSL: Support SPV_KHR_shader_ballot and SPV_KHR_subgroup_vote.
Browse files Browse the repository at this point in the history
Normally, I wouldn't have bothered with this, given that we already
support the Vulkan 1.1 subgroup functionality, but a client asked for
the legacy extensions.
  • Loading branch information
cdavis5e committed Jun 24, 2023
1 parent 2d3a152 commit 00f14ce
Show file tree
Hide file tree
Showing 7 changed files with 362 additions and 39 deletions.
77 changes: 77 additions & 0 deletions reference/opt/shaders-msl/comp/shader_ballot.msl22.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"

#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

template<typename T>
inline T spvSubgroupBroadcast(T value, ushort lane)
{
return simd_broadcast(value, lane);
}

template<>
inline bool spvSubgroupBroadcast(bool value, ushort lane)
{
return !!simd_broadcast((ushort)value, lane);
}

template<uint N>
inline vec<bool, N> spvSubgroupBroadcast(vec<bool, N> value, ushort lane)
{
return (vec<bool, N>)simd_broadcast((vec<ushort, N>)value, lane);
}

template<typename T>
inline T spvSubgroupBroadcastFirst(T value)
{
return simd_broadcast_first(value);
}

template<>
inline bool spvSubgroupBroadcastFirst(bool value)
{
return !!simd_broadcast_first((ushort)value);
}

template<uint N>
inline vec<bool, N> spvSubgroupBroadcastFirst(vec<bool, N> value)
{
return (vec<bool, N>)simd_broadcast_first((vec<ushort, N>)value);
}

inline uint4 spvSubgroupBallot(bool value)
{
simd_vote vote = simd_ballot(value);
// simd_ballot() returns a 64-bit integer-like object, but
// SPIR-V callers expect a uint4. We must convert.
// FIXME: This won't include higher bits if Apple ever supports
// 128 lanes in an SIMD-group.
return uint4(as_type<uint2>((simd_vote::vote_t)vote), 0, 0);
}

struct inputData
{
float inputDataArray[1];
};

struct outputData
{
float outputDataArray[1];
};

constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u);

kernel void main0(device inputData& _12 [[buffer(0)]], device outputData& _87 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint gl_SubgroupInvocationID [[thread_index_in_simdgroup]])
{
uint4 gl_SubgroupLtMask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)), uint2(0));
bool _31 = _12.inputDataArray[gl_LocalInvocationID.x] > 0.0;
uint4 _52 = spvSubgroupBallot(_31);
uint4 _66 = uint4(int4(popcount(uint4(as_type<uint2>(as_type<ulong>(uint2(gl_SubgroupLtMask.xy))), 0u, 0u) & uint4(as_type<uint2>(as_type<ulong>(uint2(_52.xy))), 0u, 0u))));
if (_31)
{
_87.outputDataArray[_66.x + _66.y] = _12.inputDataArray[gl_LocalInvocationID.x];
}
}

37 changes: 37 additions & 0 deletions reference/opt/shaders-msl/comp/shader_group_vote.msl21.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"

#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

template<typename T>
inline bool spvSubgroupAllEqual(T value)
{
return simd_all(all(value == simd_broadcast_first(value)));
}

template<>
inline bool spvSubgroupAllEqual(bool value)
{
return simd_all(value) || !simd_any(value);
}

template<uint N>
inline bool spvSubgroupAllEqual(vec<bool, N> value)
{
return simd_all(all(value == (vec<bool, N>)simd_broadcast_first((vec<ushort, N>)value)));
}

struct inputData
{
float inputDataArray[1];
};

constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u);

kernel void main0(device inputData& _12 [[buffer(0)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
bool _31 = _12.inputDataArray[gl_LocalInvocationID.x] > 0.0;
}

80 changes: 80 additions & 0 deletions reference/shaders-msl/comp/shader_ballot.msl22.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"

#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

template<typename T>
inline T spvSubgroupBroadcast(T value, ushort lane)
{
return simd_broadcast(value, lane);
}

template<>
inline bool spvSubgroupBroadcast(bool value, ushort lane)
{
return !!simd_broadcast((ushort)value, lane);
}

template<uint N>
inline vec<bool, N> spvSubgroupBroadcast(vec<bool, N> value, ushort lane)
{
return (vec<bool, N>)simd_broadcast((vec<ushort, N>)value, lane);
}

template<typename T>
inline T spvSubgroupBroadcastFirst(T value)
{
return simd_broadcast_first(value);
}

template<>
inline bool spvSubgroupBroadcastFirst(bool value)
{
return !!simd_broadcast_first((ushort)value);
}

template<uint N>
inline vec<bool, N> spvSubgroupBroadcastFirst(vec<bool, N> value)
{
return (vec<bool, N>)simd_broadcast_first((vec<ushort, N>)value);
}

inline uint4 spvSubgroupBallot(bool value)
{
simd_vote vote = simd_ballot(value);
// simd_ballot() returns a 64-bit integer-like object, but
// SPIR-V callers expect a uint4. We must convert.
// FIXME: This won't include higher bits if Apple ever supports
// 128 lanes in an SIMD-group.
return uint4(as_type<uint2>((simd_vote::vote_t)vote), 0, 0);
}

struct inputData
{
float inputDataArray[1];
};

struct outputData
{
float outputDataArray[1];
};

constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u);

kernel void main0(device inputData& _12 [[buffer(0)]], device outputData& _87 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint gl_SubgroupInvocationID [[thread_index_in_simdgroup]])
{
uint4 gl_SubgroupLtMask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)), uint2(0));
float thisLaneData = _12.inputDataArray[gl_LocalInvocationID.x];
bool laneActive = thisLaneData > 0.0;
uint4 activeSlots = uint4(int4(popcount(uint4(as_type<uint2>(as_type<ulong>(uint2(gl_SubgroupLtMask.xy))), 0u, 0u) & uint4(as_type<uint2>(as_type<ulong>(uint2(spvSubgroupBallot(laneActive).xy))), 0u, 0u))));
uint thisLaneOutputSlot = activeSlots.x + activeSlots.y;
int firstInvocation = spvSubgroupBroadcastFirst(1);
int invocation = spvSubgroupBroadcast(1, 0u);
if (laneActive)
{
_87.outputDataArray[thisLaneOutputSlot] = thisLaneData;
}
}

41 changes: 41 additions & 0 deletions reference/shaders-msl/comp/shader_group_vote.msl21.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"

#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

template<typename T>
inline bool spvSubgroupAllEqual(T value)
{
return simd_all(all(value == simd_broadcast_first(value)));
}

template<>
inline bool spvSubgroupAllEqual(bool value)
{
return simd_all(value) || !simd_any(value);
}

template<uint N>
inline bool spvSubgroupAllEqual(vec<bool, N> value)
{
return simd_all(all(value == (vec<bool, N>)simd_broadcast_first((vec<ushort, N>)value)));
}

struct inputData
{
float inputDataArray[1];
};

constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u);

kernel void main0(device inputData& _12 [[buffer(0)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
float thisLaneData = _12.inputDataArray[gl_LocalInvocationID.x];
bool laneActive = thisLaneData > 0.0;
bool allInvocations = simd_all(laneActive);
bool anyInvocations = simd_any(laneActive);
bool allInvocationsEqual = spvSubgroupAllEqual(laneActive);
}

29 changes: 29 additions & 0 deletions shaders-msl/comp/shader_ballot.msl22.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#version 450
#extension GL_ARB_shader_ballot : require

layout (local_size_x = 64) in;
layout (std430, binding = 0) buffer inputData
{
float inputDataArray[];
};

layout (std430, binding = 1) buffer outputData
{
float outputDataArray[];
};

void main ()
{
float thisLaneData = inputDataArray [gl_LocalInvocationID.x];
bool laneActive = (thisLaneData > 0);

uvec4 activeSlots = bitCount(uvec4(unpackUint2x32(gl_SubGroupLtMaskARB), uvec2(0)) & uvec4(unpackUint2x32(ballotARB (laneActive)), uvec2(0)));
uint thisLaneOutputSlot = activeSlots.x + activeSlots.y;

int firstInvocation = readFirstInvocationARB(1);
int invocation = readInvocationARB(1, 0);

if (laneActive) {
outputDataArray[thisLaneOutputSlot] = thisLaneData;
}
}
18 changes: 18 additions & 0 deletions shaders-msl/comp/shader_group_vote.msl21.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#version 450
#extension GL_ARB_shader_group_vote : require

layout (local_size_x = 64) in;
layout (std430, binding = 0) buffer inputData
{
float inputDataArray[];
};

void main ()
{
float thisLaneData = inputDataArray [gl_LocalInvocationID.x];
bool laneActive = (thisLaneData > 0);

bool allInvocations = allInvocationsARB(laneActive);
bool anyInvocations = anyInvocationARB(laneActive);
bool allInvocationsEqual = allInvocationsEqualARB(laneActive);
}
Loading

0 comments on commit 00f14ce

Please sign in to comment.