Skip to content

Commit b7922a4

Browse files
committed
[AMDGPu] Add the support for 45-bit buffer resource
On new targets like `gfx1250`, the buffer resource (V#) now uses this format: ``` base (57-bit): resource[56:0] num_records (45-bit): resource[101:57] reserved (6-bit): resource[107:102] stride (14-bit): resource[121:108] ``` This PR changes the type of `num_records` from `i32` to `i64` in both builtin and intrinsic, and also adds the support for lowering the new format.
1 parent 446a490 commit b7922a4

File tree

20 files changed

+838
-186
lines changed

20 files changed

+838
-186
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
163163
BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
164164
BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc")
165165

166-
BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sii", "nc")
166+
BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sWii", "nc")
167167
BUILTIN(__builtin_amdgcn_raw_buffer_store_b8, "vUcQbiiIi", "n")
168168
BUILTIN(__builtin_amdgcn_raw_buffer_store_b16, "vUsQbiiIi", "n")
169169
BUILTIN(__builtin_amdgcn_raw_buffer_store_b32, "vUiQbiiIi", "n")

clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@
2424
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
2525
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
2626
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
27+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64
2728
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
28-
// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]])
29+
// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 [[CONV]], i32 [[TMP3]])
2930
// CHECK-NEXT: ret ptr addrspace(8) [[TMP4]]
3031
//
3132
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
@@ -48,8 +49,9 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short
4849
// CHECK-NEXT: store i32 [[FLAGS]], ptr [[FLAGS_ADDR_ASCAST]], align 4
4950
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
5051
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
52+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
5153
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
52-
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i32 [[TMP1]], i32 [[TMP2]])
54+
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i64 [[CONV]], i32 [[TMP2]])
5355
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
5456
//
5557
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
@@ -73,7 +75,7 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constan
7375
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
7476
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
7577
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
76-
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 1234, i32 [[TMP2]])
78+
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 1234, i32 [[TMP2]])
7779
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
7880
//
7981
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
@@ -97,7 +99,8 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(v
9799
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
98100
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
99101
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
100-
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 5678)
102+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64
103+
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 [[CONV]], i32 5678)
101104
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
102105
//
103106
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {

clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0(
66
// CHECK-NEXT: entry:
7-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
7+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
8+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
89
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
910
//
1011
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
@@ -13,7 +14,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, in
1314

1415
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_stride_constant(
1516
// CHECK-NEXT: entry:
16-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
17+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
18+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i64 [[CONV]], i32 [[FLAGS:%.*]])
1719
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
1820
//
1921
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
@@ -22,7 +24,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p,
2224

2325
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_num_constant(
2426
// CHECK-NEXT: entry:
25-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
27+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 1234, i32 [[FLAGS:%.*]])
2628
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
2729
//
2830
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
@@ -31,7 +33,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, sho
3133

3234
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_flags_constant(
3335
// CHECK-NEXT: entry:
34-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
36+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
37+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 5678)
3538
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
3639
//
3740
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
@@ -40,7 +43,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, s
4043

4144
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1(
4245
// CHECK-NEXT: entry:
43-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
46+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
47+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
4448
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
4549
//
4650
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short stride, int num, int flags) {
@@ -49,7 +53,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short str
4953

5054
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_stride_constant(
5155
// CHECK-NEXT: entry:
52-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
56+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
57+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i64 [[CONV]], i32 [[FLAGS:%.*]])
5358
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
5459
//
5560
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global void *p, int num, int flags) {
@@ -58,7 +63,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global vo
5863

5964
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_num_constant(
6065
// CHECK-NEXT: entry:
61-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
66+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 1234, i32 [[FLAGS:%.*]])
6267
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
6368
//
6469
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void *p, short stride, int flags) {
@@ -67,7 +72,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void
6772

6873
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_flags_constant(
6974
// CHECK-NEXT: entry:
70-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
75+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
76+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 5678)
7177
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
7278
//
7379
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global void *p, short stride, int num) {
@@ -76,7 +82,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global voi
7682

7783
// CHECK-LABEL: @test_amdgcn_make_buffer_p0_nullptr(
7884
// CHECK-NEXT: entry:
79-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
85+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
86+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
8087
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
8188
//
8289
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num, int flags) {
@@ -85,7 +92,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num,
8592

8693
// CHECK-LABEL: @test_amdgcn_make_buffer_p1_nullptr(
8794
// CHECK-NEXT: entry:
88-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
95+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
96+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
8997
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
9098
//
9199
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p1_nullptr(short stride, int num, int flags) {

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1431,7 +1431,7 @@ def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
14311431
[llvm_anyptr_ty],
14321432
[llvm_anyptr_ty, // base
14331433
llvm_i16_ty, // stride (and swizzle control)
1434-
llvm_i32_ty, // NumRecords / extent
1434+
llvm_i64_ty, // NumRecords / extent
14351435
llvm_i32_ty], // flags
14361436
// Attributes lifted from ptrmask + some extra argument attributes.
14371437
[IntrNoMem, ReadNone<ArgIndex<0>>,

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1443,6 +1443,12 @@ def FeatureLdsBarrierArriveAtomic : SubtargetFeature< "lds-barrier-arrive-atomic
14431443
"Has LDS barrier-arrive atomic instructions"
14441444
>;
14451445

1446+
def Feature45BitNumRecordsBufferResource : SubtargetFeature< "45-bit-num-records-buffer-resource",
1447+
"Has45BitNumRecordsBufferResource",
1448+
"true",
1449+
"The buffer resource (V#) supports 45-bit num_records"
1450+
>;
1451+
14461452
// Dummy feature used to disable assembler instructions.
14471453
def FeatureDisable : SubtargetFeature<"",
14481454
"FeatureDisable","true",
@@ -2106,6 +2112,7 @@ def FeatureISAVersion12_50 : FeatureSet<
21062112
FeatureMadU32Inst,
21072113
FeatureLdsBarrierArriveAtomic,
21082114
FeatureSetPrioIncWgInst,
2115+
Feature45BitNumRecordsBufferResource,
21092116
]>;
21102117

21112118
def FeatureISAVersion12_51 : FeatureSet<

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 51 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5905,33 +5905,64 @@ bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
59055905
Register Flags = MI.getOperand(5).getReg();
59065906

59075907
LLT S32 = LLT::scalar(32);
5908+
LLT S64 = LLT::scalar(64);
59085909

59095910
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5910-
auto Unmerge = B.buildUnmerge(S32, Pointer);
5911-
Register LowHalf = Unmerge.getReg(0);
5912-
Register HighHalf = Unmerge.getReg(1);
5913-
5914-
auto AndMask = B.buildConstant(S32, 0x0000ffff);
5915-
auto Masked = B.buildAnd(S32, HighHalf, AndMask);
59165911

5917-
MachineInstrBuilder NewHighHalf = Masked;
5918-
std::optional<ValueAndVReg> StrideConst =
5919-
getIConstantVRegValWithLookThrough(Stride, MRI);
5920-
if (!StrideConst || !StrideConst->Value.isZero()) {
5912+
if (ST.has45BitNumRecordsBufferResource()) {
5913+
// Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
5914+
// num_records.
5915+
LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
5916+
auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
5917+
auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
5918+
auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
5919+
Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
5920+
5921+
// Build the higher 64-bit value, which has the higher 38-bit num_records,
5922+
// 6-bit zero (omit), 14-bit stride and 6-bit zero (omit).
5923+
auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
59215924
MachineInstrBuilder ShiftedStride;
5922-
if (StrideConst) {
5923-
uint32_t StrideVal = StrideConst->Value.getZExtValue();
5924-
uint32_t ShiftedStrideVal = StrideVal << 16;
5925-
ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5925+
if (std::optional<ValueAndVReg> StrideConst =
5926+
getIConstantVRegValWithLookThrough(Stride, MRI)) {
5927+
ShiftedStride =
5928+
B.buildConstant(S64, StrideConst->Value.getZExtValue()
5929+
? StrideConst->Value.getZExtValue() << 44
5930+
: 0);
59265931
} else {
5927-
auto ExtStride = B.buildAnyExt(S32, Stride);
5928-
auto ShiftConst = B.buildConstant(S32, 16);
5929-
ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5932+
auto ExtStride = B.buildAnyExt(S64, Stride);
5933+
ShiftedStride = B.buildShl(S64, ExtStride, B.buildConstant(S32, 44));
59305934
}
5931-
NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5935+
Register HighHalf = B.buildOr(S64, NumRecordsRHS, ShiftedStride).getReg(0);
5936+
B.buildMergeValues(Result, {LowHalf, HighHalf});
5937+
} else {
5938+
NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
5939+
auto Unmerge = B.buildUnmerge(S32, Pointer);
5940+
auto LowHalf = Unmerge.getReg(0);
5941+
auto HighHalf = Unmerge.getReg(1);
5942+
5943+
auto AndMask = B.buildConstant(S32, 0x0000ffff);
5944+
auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5945+
5946+
MachineInstrBuilder NewHighHalf = Masked;
5947+
std::optional<ValueAndVReg> StrideConst =
5948+
getIConstantVRegValWithLookThrough(Stride, MRI);
5949+
if (!StrideConst || !StrideConst->Value.isZero()) {
5950+
MachineInstrBuilder ShiftedStride;
5951+
if (StrideConst) {
5952+
uint32_t StrideVal = StrideConst->Value.getZExtValue();
5953+
uint32_t ShiftedStrideVal = StrideVal << 16;
5954+
ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5955+
} else {
5956+
auto ExtStride = B.buildAnyExt(S32, Stride);
5957+
auto ShiftConst = B.buildConstant(S32, 16);
5958+
ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5959+
}
5960+
NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5961+
}
5962+
Register NewHighHalfReg = NewHighHalf.getReg(0);
5963+
B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
59325964
}
5933-
Register NewHighHalfReg = NewHighHalf.getReg(0);
5934-
B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5965+
59355966
MI.eraseFromParent();
59365967
return true;
59375968
}

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
285285
bool UseBlockVGPROpsForCSR = false;
286286
bool HasGloballyAddressableScratch = false;
287287

288+
bool Has45BitNumRecordsBufferResource = false;
289+
288290
// Dummy feature to use for assembler in tablegen.
289291
bool FeatureDisable = false;
290292

@@ -1849,6 +1851,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18491851
return 4;
18501852
return 3;
18511853
}
1854+
1855+
/// \returns true if the sub-target supports buffer resource (V#) with 45-bit
1856+
/// num_records.
1857+
bool has45BitNumRecordsBufferResource() const {
1858+
return Has45BitNumRecordsBufferResource;
1859+
}
18521860
};
18531861

18541862
class GCNUserSGPRUsageInfo {

0 commit comments

Comments
 (0)