Commit b05fec9
[flang][cuda] Convert gpu.launch_func to CUFLaunchClusterKernel when cluster dims are present (#113959)
Kernel launches in CUF are converted to `gpu.launch_func`. When the kernel has `cluster_dims` specified, these are carried over to the `gpu.launch_func` operation. This patch updates the conversion of `gpu.launch_func` so that, when cluster dims are present, it targets the newly added `CUFLaunchClusterKernel` runtime entry point instead of failing.
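As an illustration, a minimal before/after sketch of the new conversion, modeled on the test added below (SSA value names such as %c2 and %kernel_ptr are illustrative, not taken from the pass output):

// Before: a CUF kernel launch carrying cluster dimensions.
gpu.launch_func @cuda_device_mod::@_QMmod1Psub1
    clusters in (%c2, %c2, %c1)
    blocks in (%c10, %c10, %c1)
    threads in (%c10, %c10, %c1) : i64
    dynamic_shared_memory_size %c0_i32

// After --cuf-gpu-convert-to-llvm: the cluster sizes come first, followed
// by the grid and block sizes, the dynamic shared memory size, and the
// kernel-argument and extra pointers.
llvm.call @_FortranACUFLaunchClusterKernel(
    %kernel_ptr, %c2, %c2, %c1, %c10, %c10, %c1, %c10, %c10, %c1,
    %c0_i32, %args, %null)
    : (!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32,
       !llvm.ptr, !llvm.ptr) -> ()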
1 parent: 0b700f2

2 files changed: +76 -31 lines

flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp

Lines changed: 53 additions & 30 deletions
@@ -76,11 +76,6 @@ struct GPULaunchKernelConversion
   mlir::LogicalResult
   matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
-
-    if (op.hasClusterSize()) {
-      return mlir::failure();
-    }
-
     mlir::Location loc = op.getLoc();
     auto *ctx = rewriter.getContext();
     mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>();
@@ -107,37 +102,65 @@ struct GPULaunchKernelConversion
           rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, kernel.getName());
     }
 
-    auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
-        RTNAME_STRING(CUFLaunchKernel));
-
     auto llvmIntPtrType = mlir::IntegerType::get(
         ctx, this->getTypeConverter()->getPointerBitwidth(0));
     auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx);
-    auto funcTy = mlir::LLVM::LLVMFunctionType::get(
-        voidTy,
-        {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
-         llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
-        /*isVarArg=*/false);
-
-    auto cufLaunchKernel = mlir::SymbolRefAttr::get(
-        mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
-    if (!funcOp) {
-      mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
-      rewriter.setInsertionPointToStart(mod.getBody());
-      auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
-          loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
-      launchKernelFuncOp.setVisibility(mlir::SymbolTable::Visibility::Private);
-    }
 
     mlir::Value nullPtr = rewriter.create<LLVM::ZeroOp>(loc, ptrTy);
 
-    rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
-        op, funcTy, cufLaunchKernel,
-        mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
-                         adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
-                         adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
-                         adaptor.getBlockSizeZ(), dynamicMemorySize, kernelArgs,
-                         nullPtr});
+    if (op.hasClusterSize()) {
+      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
+          RTNAME_STRING(CUFLaunchClusterKernel));
+      auto funcTy = mlir::LLVM::LLVMFunctionType::get(
+          voidTy,
+          {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+           llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+          /*isVarArg=*/false);
+      auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
+          mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
+      if (!funcOp) {
+        mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
+        rewriter.setInsertionPointToStart(mod.getBody());
+        auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
+            loc, RTNAME_STRING(CUFLaunchClusterKernel), funcTy);
+        launchKernelFuncOp.setVisibility(
+            mlir::SymbolTable::Visibility::Private);
+      }
+      rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
+          op, funcTy, cufLaunchClusterKernel,
+          mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
+                           adaptor.getClusterSizeY(), adaptor.getClusterSizeZ(),
+                           adaptor.getGridSizeX(), adaptor.getGridSizeY(),
+                           adaptor.getGridSizeZ(), adaptor.getBlockSizeX(),
+                           adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
+                           dynamicMemorySize, kernelArgs, nullPtr});
+    } else {
+      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
+          RTNAME_STRING(CUFLaunchKernel));
+      auto funcTy = mlir::LLVM::LLVMFunctionType::get(
+          voidTy,
+          {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+          /*isVarArg=*/false);
+      auto cufLaunchKernel = mlir::SymbolRefAttr::get(
+          mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
+      if (!funcOp) {
+        mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
+        rewriter.setInsertionPointToStart(mod.getBody());
+        auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
+            loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
+        launchKernelFuncOp.setVisibility(
+            mlir::SymbolTable::Visibility::Private);
+      }
+      rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
+          op, funcTy, cufLaunchKernel,
+          mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
+                           adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
+                           adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
+                           adaptor.getBlockSizeZ(), dynamicMemorySize,
+                           kernelArgs, nullPtr});
+    }
 
     return mlir::success();
   }
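For reference, the two runtime entry points the pattern now targets, written as LLVM dialect declarations. The cluster variant matches the declaration in the test below; i64 stands in for the intptr type on a 64-bit target (the pattern derives it from the pointer bitwidth), and the role comments are annotations, not part of the declarations:

llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, // kernel address
    i64, i64, i64,        // cluster x, y, z
    i64, i64, i64,        // grid x, y, z
    i64, i64, i64,        // block x, y, z
    i32,                  // dynamic shared memory size
    !llvm.ptr, !llvm.ptr) // kernel args, extra (null here)
llvm.func @_FortranACUFLaunchKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64,
    i32, !llvm.ptr, !llvm.ptr) // same layout, minus the three cluster sizes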

flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir

Lines changed: 23 additions & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: fir-opt --cuf-gpu-convert-to-llvm %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cuf-gpu-convert-to-llvm %s | FileCheck %s
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
   llvm.func @_QMmod1Phost_sub() {
@@ -102,3 +102,25 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve
 
 // CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1 : !llvm.ptr
 // CHECK: llvm.call @_FortranACUFLaunchKernel(%[[KERNEL_PTR]], {{.*}})
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
+  llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
+    llvm.return
+  }
+  llvm.func @_QQmain() attributes {fir.bindc_name = "test"} {
+    %0 = llvm.mlir.constant(1 : index) : i64
+    %1 = llvm.mlir.constant(2 : index) : i64
+    %2 = llvm.mlir.constant(0 : i32) : i32
+    %3 = llvm.mlir.constant(10 : index) : i64
+    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%1, %1, %0) blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2
+    llvm.return
+  }
+  gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
+// CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}})
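With --split-input-file, each module in the test file is processed by the pass independently, so the plain launch and the new cluster launch are checked in isolation. Assuming a flang build with fir-opt and FileCheck on PATH, the updated RUN line expands to:

fir-opt --split-input-file --cuf-gpu-convert-to-llvm flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir | FileCheck flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir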
