diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index e438451697a0d..7f4b9817f7727 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -731,6 +731,7 @@ def __builtin_amdgcn_s_barrier_leave : AMDGPUBuiltin<"void(_Constant short)", [] def __builtin_amdgcn_s_get_barrier_state : AMDGPUBuiltin<"unsigned int(int)", [], "gfx12-insts">; def __builtin_amdgcn_s_get_named_barrier_state : AMDGPUBuiltin<"unsigned int(void *)", [], "gfx12-insts">; def __builtin_amdgcn_s_prefetch_data : AMDGPUBuiltin<"void(void const *, unsigned int)", [Const], "gfx12-insts">; +def __builtin_amdgcn_s_prefetch_inst : AMDGPUBuiltin<"void(void const *, unsigned int)", [Const], "gfx12-insts">; def __builtin_amdgcn_s_buffer_prefetch_data : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, _Constant int, unsigned int)", [Const], "gfx12-insts">; def __builtin_amdgcn_global_load_tr_b64_v2i32 : AMDGPUBuiltin<"_Vector<2, int>(_Vector<2, int address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize32">; diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp index 0a7ba0c194400..92b16372d262f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp @@ -1029,6 +1029,12 @@ CIRGenFunction::emitAMDGPUBuiltinExpr(unsigned builtinId, getContext().BuiltinInfo.getName(builtinId)); return mlir::Value{}; } + case AMDGPU::BI__builtin_amdgcn_s_prefetch_inst: { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AMDGPU builtin call: ") + + getContext().BuiltinInfo.getName(builtinId)); + return mlir::Value{}; + } case Builtin::BIlogbf: case Builtin::BI__builtin_logbf: return emitLogbBuiltin(*this, expr, llvm::APFloat::IEEEsingle()); diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 64d1aacfe6e29..0f633cb12ec93 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -2185,6 +2185,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_s_prefetch_data: return emitBuiltinWithOneOverloadedType<2>( *this, E, Intrinsic::amdgcn_s_prefetch_data); + case AMDGPU::BI__builtin_amdgcn_s_prefetch_inst: + return emitBuiltinWithOneOverloadedType<2>( + *this, E, Intrinsic::amdgcn_s_prefetch_inst); case Builtin::BIlogbf: case Builtin::BI__builtin_logbf: { Value *Src0 = EmitScalarExpr(E->getArg(0)); diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-prefetch.hip b/clang/test/CodeGenHIP/builtins-amdgcn-prefetch.hip new file mode 100644 index 0000000000000..4be0262b2e0ce --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-prefetch.hip @@ -0,0 +1,71 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +#define __device__ __attribute__((device)) +#define __shared__ __attribute__((shared)) +#define __constant__ __attribute__((constant)) + +extern "C" __device__ int bar(const char *s); + +// CHECK-GFX1200-LABEL: define dso_local noundef i32 @_Z4foo1v( +// CHECK-GFX1200-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[S:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-GFX1200-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S]] to ptr +// CHECK-GFX1200-NEXT: call void @llvm.amdgcn.s.prefetch.inst.p0(ptr @bar, i32 0) +// CHECK-GFX1200-NEXT: store ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr [[S_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: [[CALL:%.*]] = call i32 @bar(ptr noundef [[TMP0]]) #[[ATTR3:[0-9]+]] +// CHECK-GFX1200-NEXT: ret i32 [[CALL]] +// +__device__ int foo1() { + __builtin_amdgcn_s_prefetch_inst((const void *)bar, 0); + const char *s = "hello world"; + return bar(s); +} + +// CHECK-GFX1200-LABEL: define dso_local noundef i32 @_Z4foo2i( +// CHECK-GFX1200-SAME: i32 noundef [[ID:%.*]]) #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-GFX1200-NEXT: [[ID_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-GFX1200-NEXT: [[S:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-GFX1200-NEXT: [[S2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-GFX1200-NEXT: [[ID_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ID_ADDR]] to ptr +// CHECK-GFX1200-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S]] to ptr +// CHECK-GFX1200-NEXT: [[S2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S2]] to ptr +// CHECK-GFX1200-NEXT: store i32 [[ID]], ptr [[ID_ADDR_ASCAST]], align 4 +// CHECK-GFX1200-NEXT: call void @llvm.amdgcn.s.prefetch.inst.p0(ptr blockaddress(@_Z4foo2i, %[[NOBAR:.*]]), i32 0) +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = load i32, ptr [[ID_ADDR_ASCAST]], align 4 +// CHECK-GFX1200-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-GFX1200-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// CHECK-GFX1200: [[IF_THEN]]: +// CHECK-GFX1200-NEXT: store ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr [[S_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: [[CALL:%.*]] = call i32 @bar(ptr noundef [[TMP1]]) #[[ATTR3]] +// CHECK-GFX1200-NEXT: store i32 [[CALL]], ptr addrspace(5) [[RETVAL]], align 4 +// CHECK-GFX1200-NEXT: br label %[[RETURN:.*]] +// CHECK-GFX1200: [[IF_END]]: +// CHECK-GFX1200-NEXT: br label %[[NOBAR]] +// CHECK-GFX1200: [[NOBAR]]: +// CHECK-GFX1200-NEXT: store ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr), ptr [[S2_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S2_ASCAST]], align 8 +// CHECK-GFX1200-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr noundef [[TMP2]]) #[[ATTR3]] +// CHECK-GFX1200-NEXT: store i32 [[CALL1]], ptr addrspace(5) [[RETVAL]], align 4 +// CHECK-GFX1200-NEXT: br label %[[RETURN]] +// CHECK-GFX1200: [[RETURN]]: +// CHECK-GFX1200-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[RETVAL]], align 4 +// CHECK-GFX1200-NEXT: ret i32 [[TMP3]] +// CHECK-GFX1200: [[INDIRECTGOTO:.*:]] +// CHECK-GFX1200-NEXT: indirectbr ptr poison, [label %[[NOBAR]]] +// +__device__ int foo2(int id) { + __builtin_amdgcn_s_prefetch_inst(&&NOBAR, 0); + if (id == 0) { + const char *s = "hello world"; + return bar(s); + } +NOBAR: + const char *s2 = "skip hello"; + return bar(s2); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl index 332a2fa94ee92..ffd1b152d9ef9 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -247,6 +247,32 @@ void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned __builtin_amdgcn_s_prefetch_data(cp, 31); } +// CHECK-LABEL: @test_s_prefetch_inst( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[GP_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[CP_ADDR:%.*]] = alloca ptr addrspace(4), align 8, addrspace(5) +// CHECK-NEXT: [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store ptr [[FP:%.*]], ptr addrspace(5) [[FP_ADDR]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[GP:%.*]], ptr addrspace(5) [[GP_ADDR]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[CP_ADDR]], align 8 +// CHECK-NEXT: store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.inst.p0(ptr [[TMP0]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GP_ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.inst.p1(ptr addrspace(1) [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(5) [[CP_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.inst.p4(ptr addrspace(4) [[TMP3]], i32 31) +// CHECK-NEXT: ret void +// +void test_s_prefetch_inst(int *fp, global float *gp, constant char *cp, unsigned int len) +{ + __builtin_amdgcn_s_prefetch_inst(fp, 0); + __builtin_amdgcn_s_prefetch_inst(gp, len); + __builtin_amdgcn_s_prefetch_inst(cp, 31); +} + // CHECK-LABEL: @test_s_buffer_prefetch_data( // CHECK-NEXT: entry: // CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)