[AMDGPU] Add support for 45-bit buffer resource #159702
@llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-clang @llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-llvm-globalisel

Author: Shilei Tian (shiltian)

Changes

On new targets such as gfx1250 (whose FeatureISAVersion12_50 feature set is extended below), the buffer resource (V#) supports a 45-bit num_records field. This PR changes the type of the NumRecords/extent operand of llvm.amdgcn.make.buffer.rsrc (and of the corresponding argument of __builtin_amdgcn_make_buffer_rsrc) from i32 to i64, and introduces a subtarget feature so that both the GlobalISel legalizer and the SelectionDAG lowering can build the wider descriptor layout.

Patch is 120.33 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159702.diff

20 Files Affected:
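Before the diff, a quick illustration of the source-level effect — a minimal HIP sketch (hypothetical function name, not part of the patch) passing an extent that no longer fits in 32 bits:

__device__ __amdgpu_buffer_rsrc_t make_large_rsrc(void *p) {
  // num_records is now a 64-bit operand; 1ull << 40 needs the 45-bit
  // field and would have been truncated by the old i32 operand.
  return __builtin_amdgcn_make_buffer_rsrc(p, /*stride=*/(short)0,
                                           /*num_records=*/1ull << 40,
                                           /*flags=*/0);
}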
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 32b5aa5ac1377..3e45c04687a64 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -163,7 +163,7 @@ BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc")
-BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sii", "nc")
+BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sWii", "nc")
BUILTIN(__builtin_amdgcn_raw_buffer_store_b8, "vUcQbiiIi", "n")
BUILTIN(__builtin_amdgcn_raw_buffer_store_b16, "vUsQbiiIi", "n")
BUILTIN(__builtin_amdgcn_raw_buffer_store_b32, "vUiQbiiIi", "n")
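In Clang's builtin signature encoding, 'W' widens the following integer type to 64 bits, so the effective prototype changes as sketched here (a reading of the signature string, not code from the patch):

// Old signature "Qbv*sii":
//   __amdgpu_buffer_rsrc_t __builtin_amdgcn_make_buffer_rsrc(
//       void *base, short stride, int num_records, int flags);
// New signature "Qbv*sWii" — 'Wi' makes num_records an int64_t:
//   __amdgpu_buffer_rsrc_t __builtin_amdgcn_make_buffer_rsrc(
//       void *base, short stride, long long num_records, int flags);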
diff --git a/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip b/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
index 2342fcefb5f89..e92105091712c 100644
--- a/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
+++ b/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
@@ -24,8 +24,9 @@
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 [[CONV]], i32 [[TMP3]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP4]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
@@ -48,8 +49,9 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short
// CHECK-NEXT: store i32 [[FLAGS]], ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i32 [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i64 [[CONV]], i32 [[TMP2]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
@@ -73,7 +75,7 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constan
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 1234, i32 [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 1234, i32 [[TMP2]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
@@ -97,7 +99,8 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(v
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 5678)
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 [[CONV]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
index 29093c09c39d0..4b5232c0010aa 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
@@ -4,7 +4,8 @@
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
@@ -13,7 +14,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, in
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_stride_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
@@ -22,7 +24,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p,
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_num_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
@@ -31,7 +33,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, sho
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_flags_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
@@ -40,7 +43,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, s
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short stride, int num, int flags) {
@@ -49,7 +53,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short str
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_stride_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global void *p, int num, int flags) {
@@ -58,7 +63,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global vo
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_num_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void *p, short stride, int flags) {
@@ -67,7 +72,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_flags_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global void *p, short stride, int num) {
@@ -76,7 +82,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global voi
// CHECK-LABEL: @test_amdgcn_make_buffer_p0_nullptr(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num, int flags) {
@@ -85,7 +92,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num,
// CHECK-LABEL: @test_amdgcn_make_buffer_p1_nullptr(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p1_nullptr(short stride, int num, int flags) {
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index afce1fe6af854..be965d8ead6fc 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1431,7 +1431,7 @@ def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
[llvm_anyptr_ty],
[llvm_anyptr_ty, // base
llvm_i16_ty, // stride (and swizzle control)
- llvm_i32_ty, // NumRecords / extent
+ llvm_i64_ty, // NumRecords / extent
llvm_i32_ty], // flags
// Attributes lifted from ptrmask + some extra argument attributes.
[IntrNoMem, ReadNone<ArgIndex<0>>,
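At the IR level, the third operand of the intrinsic becomes i64. A minimal example of the updated call (illustrative values):

declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr, i16, i64, i32)

define ptr addrspace(8) @example(ptr %base) {
  ; 1099511627776 == 1 << 40, an extent only representable with the i64 operand
  %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %base, i16 0, i64 1099511627776, i32 0)
  ret ptr addrspace(8) %rsrc
}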
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 23339b2ad228e..b2d1011eb506c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1443,6 +1443,12 @@ def FeatureLdsBarrierArriveAtomic : SubtargetFeature< "lds-barrier-arrive-atomic
"Has LDS barrier-arrive atomic instructions"
>;
+def Feature45BitNumRecordsBufferResource : SubtargetFeature< "45-bit-num-records-buffer-resource",
+ "Has45BitNumRecordsBufferResource",
+ "true",
+ "The buffer resource (V#) supports 45-bit num_records"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
@@ -2106,6 +2112,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureMadU32Inst,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
+ Feature45BitNumRecordsBufferResource,
]>;
def FeatureISAVersion12_51 : FeatureSet<
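Since SubtargetFeature names map directly to -mattr flags, the new feature can also be toggled explicitly when experimenting (a sketch; on gfx1250-class targets it is already implied by the feature set above):

llc -mtriple=amdgcn -mattr=+45-bit-num-records-buffer-resource input.ll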
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c690b2b7129b4..c588fcf1ff31d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5905,33 +5905,64 @@ bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
Register Flags = MI.getOperand(5).getReg();
LLT S32 = LLT::scalar(32);
+ LLT S64 = LLT::scalar(64);
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
- auto Unmerge = B.buildUnmerge(S32, Pointer);
- Register LowHalf = Unmerge.getReg(0);
- Register HighHalf = Unmerge.getReg(1);
-
- auto AndMask = B.buildConstant(S32, 0x0000ffff);
- auto Masked = B.buildAnd(S32, HighHalf, AndMask);
- MachineInstrBuilder NewHighHalf = Masked;
- std::optional<ValueAndVReg> StrideConst =
- getIConstantVRegValWithLookThrough(Stride, MRI);
- if (!StrideConst || !StrideConst->Value.isZero()) {
+ if (ST.has45BitNumRecordsBufferResource()) {
+ // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
+ // num_records.
+ LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
+ auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
+ auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
+ auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
+ Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
+
+ // Build the higher 64-bit value, which has the higher 38-bit num_records,
+ // 6-bit zero (omit), 14-bit stride and 6-bit zero (omit).
+ auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
MachineInstrBuilder ShiftedStride;
- if (StrideConst) {
- uint32_t StrideVal = StrideConst->Value.getZExtValue();
- uint32_t ShiftedStrideVal = StrideVal << 16;
- ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
+ if (std::optional<ValueAndVReg> StrideConst =
+ getIConstantVRegValWithLookThrough(Stride, MRI)) {
+ ShiftedStride =
+ B.buildConstant(S64, StrideConst->Value.getZExtValue()
+ ? StrideConst->Value.getZExtValue() << 44
+ : 0);
} else {
- auto ExtStride = B.buildAnyExt(S32, Stride);
- auto ShiftConst = B.buildConstant(S32, 16);
- ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
+ auto ExtStride = B.buildAnyExt(S64, Stride);
+ ShiftedStride = B.buildShl(S64, ExtStride, B.buildConstant(S32, 44));
}
- NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
+ Register HighHalf = B.buildOr(S64, NumRecordsRHS, ShiftedStride).getReg(0);
+ B.buildMergeValues(Result, {LowHalf, HighHalf});
+ } else {
+ NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
+ auto Unmerge = B.buildUnmerge(S32, Pointer);
+ auto LowHalf = Unmerge.getReg(0);
+ auto HighHalf = Unmerge.getReg(1);
+
+ auto AndMask = B.buildConstant(S32, 0x0000ffff);
+ auto Masked = B.buildAnd(S32, HighHalf, AndMask);
+
+ MachineInstrBuilder NewHighHalf = Masked;
+ std::optional<ValueAndVReg> StrideConst =
+ getIConstantVRegValWithLookThrough(Stride, MRI);
+ if (!StrideConst || !StrideConst->Value.isZero()) {
+ MachineInstrBuilder ShiftedStride;
+ if (StrideConst) {
+ uint32_t StrideVal = StrideConst->Value.getZExtValue();
+ uint32_t ShiftedStrideVal = StrideVal << 16;
+ ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
+ } else {
+ auto ExtStride = B.buildAnyExt(S32, Stride);
+ auto ShiftConst = B.buildConstant(S32, 16);
+ ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
+ }
+ NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
+ }
+ Register NewHighHalfReg = NewHighHalf.getReg(0);
+ B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
}
- Register NewHighHalfReg = NewHighHalf.getReg(0);
- B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
+
MI.eraseFromParent();
return true;
}
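The bit layout that this GlobalISel path and the SelectionDAG path below both implement can be summarized with a small host-side C++ sketch (illustrative only; it mirrors the shift amounts in the code above, and is not compiler code):

#include <cstdint>

struct RsrcWords { uint64_t Lo, Hi; };

// Low word: 57-bit base address | num_records[6:0] at bit 57.
// High word: num_records[44:7] at bit 0 | 14-bit stride at bit 44.
constexpr RsrcWords pack45BitRsrc(uint64_t Base, uint64_t NumRecords,
                                  uint16_t Stride) {
  return {Base | (NumRecords << 57),
          (NumRecords >> 7) | (uint64_t(Stride) << 44)};
}

// Example: a 2^40-record buffer keeps its full extent across the split.
static_assert(pack45BitRsrc(0, uint64_t(1) << 40, 0).Hi ==
                  (uint64_t(1) << 33),
              "num_records bits 44:7 land at bit 0 of the high word");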
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 920a47b5afe07..f5367f3b88920 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -285,6 +285,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool UseBlockVGPROpsForCSR = false;
bool HasGloballyAddressableScratch = false;
+ bool Has45BitNumRecordsBufferResource = false;
+
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;
@@ -1849,6 +1851,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return 4;
return 3;
}
+
+ /// \returns true if the sub-target supports buffer resource (V#) with 45-bit
+ /// num_records.
+ bool has45BitNumRecordsBufferResource() const {
+ return Has45BitNumRecordsBufferResource;
+ }
};
class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 363717b017ef0..f0cea5f225881 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11597,29 +11597,70 @@ SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
SDValue NumRecords = Op->getOperand(3);
SDValue Flags = Op->getOperand(4);
- auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
- SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
- SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
- std::optional<uint32_t> ConstStride = std::nullopt;
- if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
- ConstStride = ConstNode->getZExtValue();
-
- SDValue NewHighHalf = Masked;
- if (!ConstStride || *ConstStride != 0) {
+ auto GetConstStride = [Stride]() -> std::optional<uint32_t> {
+ if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
+ return ConstNode->getZExtValue();
+ return std::nullopt;
+ };
+
+ SDValue Rsrc;
+
+ if (Subtarget->has45BitNumRecordsBufferResource()) {
+ // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
+ // num_records.
+ SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
+ SDValue NumRecordsLHS =
+ DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
+ DAG.getShiftAmountConstant(57, MVT::i32, Loc));
+ SDValue LowHalf =
+ DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
+
+ // Build the higher 64-bit value, which has the higher 38-bit num_records,
+ // 6-bit zero (omit), 14-bit stride and 6-bit zero (omit).
+ SDValue NumRecordsRHS =
+ DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
+ DAG.getShiftAmountConstant(7, MVT::i32, Loc));
SDValue ShiftedStride;
- if (ConstStride) {
- ShiftedStride = DAG.getConstant(*ConstStride << 16, Lo...
[truncated]
@llvm/pr-subscribers-clang Author: Shilei Tian (shiltian) ChangesOn new targets like
This PR changes the type of Patch is 120.33 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159702.diff 20 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 32b5aa5ac1377..3e45c04687a64 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -163,7 +163,7 @@ BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc")
-BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sii", "nc")
+BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sWii", "nc")
BUILTIN(__builtin_amdgcn_raw_buffer_store_b8, "vUcQbiiIi", "n")
BUILTIN(__builtin_amdgcn_raw_buffer_store_b16, "vUsQbiiIi", "n")
BUILTIN(__builtin_amdgcn_raw_buffer_store_b32, "vUiQbiiIi", "n")
diff --git a/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip b/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
index 2342fcefb5f89..e92105091712c 100644
--- a/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
+++ b/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
@@ -24,8 +24,9 @@
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 [[CONV]], i32 [[TMP3]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP4]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
@@ -48,8 +49,9 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short
// CHECK-NEXT: store i32 [[FLAGS]], ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i32 [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i64 [[CONV]], i32 [[TMP2]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
@@ -73,7 +75,7 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constan
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 1234, i32 [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 1234, i32 [[TMP2]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
@@ -97,7 +99,8 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(v
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 5678)
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 [[CONV]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
index 29093c09c39d0..4b5232c0010aa 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
@@ -4,7 +4,8 @@
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
@@ -13,7 +14,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, in
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_stride_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
@@ -22,7 +24,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p,
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_num_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
@@ -31,7 +33,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, sho
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_flags_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
@@ -40,7 +43,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, s
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short stride, int num, int flags) {
@@ -49,7 +53,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short str
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_stride_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global void *p, int num, int flags) {
@@ -58,7 +63,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global vo
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_num_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void *p, short stride, int flags) {
@@ -67,7 +72,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_flags_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global void *p, short stride, int num) {
@@ -76,7 +82,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global voi
// CHECK-LABEL: @test_amdgcn_make_buffer_p0_nullptr(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num, int flags) {
@@ -85,7 +92,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num,
// CHECK-LABEL: @test_amdgcn_make_buffer_p1_nullptr(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p1_nullptr(short stride, int num, int flags) {
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index afce1fe6af854..be965d8ead6fc 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1431,7 +1431,7 @@ def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
[llvm_anyptr_ty],
[llvm_anyptr_ty, // base
llvm_i16_ty, // stride (and swizzle control)
- llvm_i32_ty, // NumRecords / extent
+ llvm_i64_ty, // NumRecords / extent
llvm_i32_ty], // flags
// Attributes lifted from ptrmask + some extra argument attributes.
[IntrNoMem, ReadNone<ArgIndex<0>>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 23339b2ad228e..b2d1011eb506c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1443,6 +1443,12 @@ def FeatureLdsBarrierArriveAtomic : SubtargetFeature< "lds-barrier-arrive-atomic
"Has LDS barrier-arrive atomic instructions"
>;
+def Feature45BitNumRecordsBufferResource : SubtargetFeature< "45-bit-num-records-buffer-resource",
+ "Has45BitNumRecordsBufferResource",
+ "true",
+ "The buffer resource (V#) supports 45-bit num_records"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
@@ -2106,6 +2112,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureMadU32Inst,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
+ Feature45BitNumRecordsBufferResource,
]>;
def FeatureISAVersion12_51 : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c690b2b7129b4..c588fcf1ff31d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5905,33 +5905,64 @@ bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
Register Flags = MI.getOperand(5).getReg();
LLT S32 = LLT::scalar(32);
+ LLT S64 = LLT::scalar(64);
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
- auto Unmerge = B.buildUnmerge(S32, Pointer);
- Register LowHalf = Unmerge.getReg(0);
- Register HighHalf = Unmerge.getReg(1);
-
- auto AndMask = B.buildConstant(S32, 0x0000ffff);
- auto Masked = B.buildAnd(S32, HighHalf, AndMask);
- MachineInstrBuilder NewHighHalf = Masked;
- std::optional<ValueAndVReg> StrideConst =
- getIConstantVRegValWithLookThrough(Stride, MRI);
- if (!StrideConst || !StrideConst->Value.isZero()) {
+ if (ST.has45BitNumRecordsBufferResource()) {
+ // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
+ // num_records.
+ LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
+ auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
+ auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
+ auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
+ Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
+
+ // Build the higher 64-bit value, which has the higher 38-bit num_records,
+ // 6-bit zero (omit), 14-bit stride and 6-bit zero (omit).
+ auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
MachineInstrBuilder ShiftedStride;
- if (StrideConst) {
- uint32_t StrideVal = StrideConst->Value.getZExtValue();
- uint32_t ShiftedStrideVal = StrideVal << 16;
- ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
+ if (std::optional<ValueAndVReg> StrideConst =
+ getIConstantVRegValWithLookThrough(Stride, MRI)) {
+ ShiftedStride =
+ B.buildConstant(S64, StrideConst->Value.getZExtValue()
+ ? StrideConst->Value.getZExtValue() << 44
+ : 0);
} else {
- auto ExtStride = B.buildAnyExt(S32, Stride);
- auto ShiftConst = B.buildConstant(S32, 16);
- ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
+ auto ExtStride = B.buildAnyExt(S64, Stride);
+ ShiftedStride = B.buildShl(S64, ExtStride, B.buildConstant(S32, 44));
}
- NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
+ Register HighHalf = B.buildOr(S64, NumRecordsRHS, ShiftedStride).getReg(0);
+ B.buildMergeValues(Result, {LowHalf, HighHalf});
+ } else {
+ NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
+ auto Unmerge = B.buildUnmerge(S32, Pointer);
+ auto LowHalf = Unmerge.getReg(0);
+ auto HighHalf = Unmerge.getReg(1);
+
+ auto AndMask = B.buildConstant(S32, 0x0000ffff);
+ auto Masked = B.buildAnd(S32, HighHalf, AndMask);
+
+ MachineInstrBuilder NewHighHalf = Masked;
+ std::optional<ValueAndVReg> StrideConst =
+ getIConstantVRegValWithLookThrough(Stride, MRI);
+ if (!StrideConst || !StrideConst->Value.isZero()) {
+ MachineInstrBuilder ShiftedStride;
+ if (StrideConst) {
+ uint32_t StrideVal = StrideConst->Value.getZExtValue();
+ uint32_t ShiftedStrideVal = StrideVal << 16;
+ ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
+ } else {
+ auto ExtStride = B.buildAnyExt(S32, Stride);
+ auto ShiftConst = B.buildConstant(S32, 16);
+ ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
+ }
+ NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
+ }
+ Register NewHighHalfReg = NewHighHalf.getReg(0);
+ B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
}
- Register NewHighHalfReg = NewHighHalf.getReg(0);
- B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
+
MI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 920a47b5afe07..f5367f3b88920 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -285,6 +285,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool UseBlockVGPROpsForCSR = false;
bool HasGloballyAddressableScratch = false;
+ bool Has45BitNumRecordsBufferResource = false;
+
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;
@@ -1849,6 +1851,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return 4;
return 3;
}
+
+ /// \returns true if the sub-target supports buffer resource (V#) with 45-bit
+ /// num_records.
+ bool has45BitNumRecordsBufferResource() const {
+ return Has45BitNumRecordsBufferResource;
+ }
};
class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 363717b017ef0..f0cea5f225881 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11597,29 +11597,70 @@ SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
SDValue NumRecords = Op->getOperand(3);
SDValue Flags = Op->getOperand(4);
- auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
- SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
- SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
- std::optional<uint32_t> ConstStride = std::nullopt;
- if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
- ConstStride = ConstNode->getZExtValue();
-
- SDValue NewHighHalf = Masked;
- if (!ConstStride || *ConstStride != 0) {
+ auto GetConstStride = [Stride]() -> std::optional<uint32_t> {
+ if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
+ return ConstNode->getZExtValue();
+ return std::nullopt;
+ };
+
+ SDValue Rsrc;
+
+ if (Subtarget->has45BitNumRecordsBufferResource()) {
+ // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
+ // num_records.
+ SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
+ SDValue NumRecordsLHS =
+ DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
+ DAG.getShiftAmountConstant(57, MVT::i32, Loc));
+ SDValue LowHalf =
+ DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
+
+ // Build the higher 64-bit value, which has the higher 38-bit num_records,
+ // 6-bit zero (omit), 14-bit stride and 6-bit zero (omit).
+ SDValue NumRecordsRHS =
+ DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
+ DAG.getShiftAmountConstant(7, MVT::i32, Loc));
SDValue ShiftedStride;
- if (ConstStride) {
- ShiftedStride = DAG.getConstant(*ConstStride << 16, Lo...
[truncated]
|
@llvm/pr-subscribers-llvm-transforms Author: Shilei Tian (shiltian) ChangesOn new targets like
This PR changes the type of Patch is 120.33 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159702.diff 20 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 32b5aa5ac1377..3e45c04687a64 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -163,7 +163,7 @@ BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc")
-BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sii", "nc")
+BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sWii", "nc")
BUILTIN(__builtin_amdgcn_raw_buffer_store_b8, "vUcQbiiIi", "n")
BUILTIN(__builtin_amdgcn_raw_buffer_store_b16, "vUsQbiiIi", "n")
BUILTIN(__builtin_amdgcn_raw_buffer_store_b32, "vUiQbiiIi", "n")
diff --git a/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip b/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
index 2342fcefb5f89..e92105091712c 100644
--- a/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
+++ b/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
@@ -24,8 +24,9 @@
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 [[CONV]], i32 [[TMP3]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP4]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
@@ -48,8 +49,9 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short
// CHECK-NEXT: store i32 [[FLAGS]], ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i32 [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i64 [[CONV]], i32 [[TMP2]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
@@ -73,7 +75,7 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constan
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 1234, i32 [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 1234, i32 [[TMP2]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
@@ -97,7 +99,8 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(v
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 5678)
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 [[CONV]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
index 29093c09c39d0..4b5232c0010aa 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
@@ -4,7 +4,8 @@
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
@@ -13,7 +14,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, in
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_stride_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
@@ -22,7 +24,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p,
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_num_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
@@ -31,7 +33,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, sho
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_flags_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
@@ -40,7 +43,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, s
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short stride, int num, int flags) {
@@ -49,7 +53,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short str
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_stride_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global void *p, int num, int flags) {
@@ -58,7 +63,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global vo
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_num_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void *p, short stride, int flags) {
@@ -67,7 +72,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_flags_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global void *p, short stride, int num) {
@@ -76,7 +82,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global voi
// CHECK-LABEL: @test_amdgcn_make_buffer_p0_nullptr(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num, int flags) {
@@ -85,7 +92,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num,
// CHECK-LABEL: @test_amdgcn_make_buffer_p1_nullptr(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p1_nullptr(short stride, int num, int flags) {
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index afce1fe6af854..be965d8ead6fc 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1431,7 +1431,7 @@ def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
[llvm_anyptr_ty],
[llvm_anyptr_ty, // base
llvm_i16_ty, // stride (and swizzle control)
- llvm_i32_ty, // NumRecords / extent
+ llvm_i64_ty, // NumRecords / extent
llvm_i32_ty], // flags
// Attributes lifted from ptrmask + some extra argument attributes.
[IntrNoMem, ReadNone<ArgIndex<0>>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 23339b2ad228e..b2d1011eb506c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1443,6 +1443,12 @@ def FeatureLdsBarrierArriveAtomic : SubtargetFeature< "lds-barrier-arrive-atomic
"Has LDS barrier-arrive atomic instructions"
>;
+def Feature45BitNumRecordsBufferResource : SubtargetFeature< "45-bit-num-records-buffer-resource",
+ "Has45BitNumRecordsBufferResource",
+ "true",
+ "The buffer resource (V#) supports 45-bit num_records"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
@@ -2106,6 +2112,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureMadU32Inst,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
+ Feature45BitNumRecordsBufferResource,
]>;
def FeatureISAVersion12_51 : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c690b2b7129b4..c588fcf1ff31d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5905,33 +5905,64 @@ bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
Register Flags = MI.getOperand(5).getReg();
LLT S32 = LLT::scalar(32);
+ LLT S64 = LLT::scalar(64);
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
- auto Unmerge = B.buildUnmerge(S32, Pointer);
- Register LowHalf = Unmerge.getReg(0);
- Register HighHalf = Unmerge.getReg(1);
-
- auto AndMask = B.buildConstant(S32, 0x0000ffff);
- auto Masked = B.buildAnd(S32, HighHalf, AndMask);
- MachineInstrBuilder NewHighHalf = Masked;
- std::optional<ValueAndVReg> StrideConst =
- getIConstantVRegValWithLookThrough(Stride, MRI);
- if (!StrideConst || !StrideConst->Value.isZero()) {
+ if (ST.has45BitNumRecordsBufferResource()) {
+ // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
+ // num_records.
+ LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
+ auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
+ auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
+ auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
+ Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
+
+ // Build the higher 64-bit value, which has the higher 38-bit num_records,
+ // 6-bit zero (omit), 14-bit stride and 6-bit zero (omit).
+ auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
MachineInstrBuilder ShiftedStride;
- if (StrideConst) {
- uint32_t StrideVal = StrideConst->Value.getZExtValue();
- uint32_t ShiftedStrideVal = StrideVal << 16;
- ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
+ if (std::optional<ValueAndVReg> StrideConst =
+ getIConstantVRegValWithLookThrough(Stride, MRI)) {
+ ShiftedStride =
+ B.buildConstant(S64, StrideConst->Value.getZExtValue()
+ ? StrideConst->Value.getZExtValue() << 44
+ : 0);
} else {
- auto ExtStride = B.buildAnyExt(S32, Stride);
- auto ShiftConst = B.buildConstant(S32, 16);
- ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
+ auto ExtStride = B.buildAnyExt(S64, Stride);
+ ShiftedStride = B.buildShl(S64, ExtStride, B.buildConstant(S32, 44));
}
- NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
+ Register HighHalf = B.buildOr(S64, NumRecordsRHS, ShiftedStride).getReg(0);
+ B.buildMergeValues(Result, {LowHalf, HighHalf});
+ } else {
+ NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
+ auto Unmerge = B.buildUnmerge(S32, Pointer);
+ auto LowHalf = Unmerge.getReg(0);
+ auto HighHalf = Unmerge.getReg(1);
+
+ auto AndMask = B.buildConstant(S32, 0x0000ffff);
+ auto Masked = B.buildAnd(S32, HighHalf, AndMask);
+
+ MachineInstrBuilder NewHighHalf = Masked;
+ std::optional<ValueAndVReg> StrideConst =
+ getIConstantVRegValWithLookThrough(Stride, MRI);
+ if (!StrideConst || !StrideConst->Value.isZero()) {
+ MachineInstrBuilder ShiftedStride;
+ if (StrideConst) {
+ uint32_t StrideVal = StrideConst->Value.getZExtValue();
+ uint32_t ShiftedStrideVal = StrideVal << 16;
+ ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
+ } else {
+ auto ExtStride = B.buildAnyExt(S32, Stride);
+ auto ShiftConst = B.buildConstant(S32, 16);
+ ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
+ }
+ NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
+ }
+ Register NewHighHalfReg = NewHighHalf.getReg(0);
+ B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
}
- Register NewHighHalfReg = NewHighHalf.getReg(0);
- B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
+
MI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 920a47b5afe07..f5367f3b88920 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -285,6 +285,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool UseBlockVGPROpsForCSR = false;
bool HasGloballyAddressableScratch = false;
+ bool Has45BitNumRecordsBufferResource = false;
+
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;
@@ -1849,6 +1851,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return 4;
return 3;
}
+
+ /// \returns true if the sub-target supports buffer resource (V#) with 45-bit
+ /// num_records.
+ bool has45BitNumRecordsBufferResource() const {
+ return Has45BitNumRecordsBufferResource;
+ }
};
class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 363717b017ef0..f0cea5f225881 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11597,29 +11597,70 @@ SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
SDValue NumRecords = Op->getOperand(3);
SDValue Flags = Op->getOperand(4);
- auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
- SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
- SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
- std::optional<uint32_t> ConstStride = std::nullopt;
- if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
- ConstStride = ConstNode->getZExtValue();
-
- SDValue NewHighHalf = Masked;
- if (!ConstStride || *ConstStride != 0) {
+ auto GetConstStride = [Stride]() -> std::optional<uint32_t> {
+ if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
+ return ConstNode->getZExtValue();
+ return std::nullopt;
+ };
+
+ SDValue Rsrc;
+
+ if (Subtarget->has45BitNumRecordsBufferResource()) {
+ // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
+ // num_records.
+ SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
+ SDValue NumRecordsLHS =
+ DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
+ DAG.getShiftAmountConstant(57, MVT::i32, Loc));
+ SDValue LowHalf =
+ DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
+
+ // Build the higher 64-bit value, which has the higher 38-bit num_records,
+ // 6-bit zero (omit), 14-bit stride and 6-bit zero (omit).
+ SDValue NumRecordsRHS =
+ DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
+ DAG.getShiftAmountConstant(7, MVT::i32, Loc));
SDValue ShiftedStride;
- if (ConstStride) {
- ShiftedStride = DAG.getConstant(*ConstStride << 16, Lo...
[truncated]
@llvm/pr-subscribers-llvm-globalisel
Force-pushed from b7922a4 to d488e62 (Compare)
I'm not strongly tied to variadic num_records, aside from the fact that it can lead to less-great codegen where you compute a 64-bit value and then have a really late truncation.
More fundamentally, we're not plumbing flags through on gfx1250 (for example, graphics needs those for swizzles). The hold is specifically on flags support.
Force-pushed from d488e62 to 837979b (Compare)
Minor questions about the final IR, otherwise lgtm here
On new targets like `gfx1250`, the buffer resource (V#) now uses this format:

```
base (57-bit):        resource[56:0]
num_records (45-bit): resource[101:57]
reserved (6-bit):     resource[107:102]
stride (14-bit):      resource[121:108]
```

This PR changes the type of `num_records` from `i32` to `i64` in both builtin and intrinsic, and also adds the support for lowering the new format.
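To make the split concrete, here is a minimal host-side sketch of how the lowering packs the two 64-bit halves of the 128-bit V#. The names (`VSharp`, `packVSharp`) are illustrative only; the actual patch builds these values with MIR/SelectionDAG nodes in the hunks above, and inputs are assumed to be pre-masked to their field widths.

```cpp
#include <cstdint>

// Two 64-bit halves of the 128-bit buffer resource (V#), using the
// 45-bit num_records layout described in the commit message.
struct VSharp {
  uint64_t lo; // base[56:0] | num_records[6:0] at bit 57
  uint64_t hi; // num_records[44:7] | stride[13:0] at bit 44 (= 108 - 64)
};

VSharp packVSharp(uint64_t base57, uint64_t numRecords45, uint64_t stride14) {
  VSharp v;
  // Low half: 57-bit base plus the low 7 bits of num_records (the shift by
  // 57 discards the upper bits of numRecords45 automatically).
  v.lo = base57 | (numRecords45 << 57);
  // High half: the remaining 38 bits of num_records, then the 14-bit stride
  // starting at overall bit 108, i.e. bit 44 of this half.
  v.hi = (numRecords45 >> 7) | (stride14 << 44);
  return v;
}
```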
Force-pushed from 837979b to 5c1026d (Compare)
I don't really know how to update the test case.
Re the failing test, that'll require surgery on …. Since this is a ….
That'd be great. Thanks! Just gave it some plumbing but it didn't quite work.
Approved here, though we might want another MLIR reviewer
Feel free to add one if you like.
-    assert(size < std::numeric_limits<uint32_t>::max() &&
-           "the memref buffer is too large");
-    return createI32Constant(rewriter, loc, static_cast<int32_t>(size));
+    return createI64Constant(rewriter, loc, static_cast<int32_t>(size));
Can we at least assert this is in range for int32?
Hm, yeah, we should fail for non-i32 on pre-gfx1250, yeah
I can do that
I removed the pointless static cast so we at least keep everything in int64_t
If I wanted to check for the "this'll get silently truncated to 32 bits" case, I'd need to plumb the chipset through. Should I do that?
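For illustration, here is a hypothetical shape such a guard could take; `numRecordsFits` and the `has45BitNumRecords` flag are assumed names, not an existing MLIR/LLVM API, and the actual check would need the chipset plumbed through as discussed above.

```cpp
#include <cstdint>

// Hypothetical pre-lowering check (sketch): on targets without the 45-bit
// num_records layout, the field is 32 bits wide, so larger memref sizes
// should be rejected up front rather than silently truncated.
bool numRecordsFits(uint64_t size, bool has45BitNumRecords) {
  const uint64_t limit =
      has45BitNumRecords ? (uint64_t(1) << 45) - 1 : UINT32_MAX;
  return size <= limit;
}
```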
Fixes SWDEV-554034.