[NVPTX] Convert scalar function nvvm.annotations to attributes #125908

AlexMaclean · 2025-02-05T18:47:49Z

Replace some more nvvm.annotations with function attributes, auto-upgrading the annotations as needed. These new attributes will be more idiomatic and compile-time efficient than the annotations.

!"maxclusterrank" / !"cluster_max_blocks" -> "nvvm.maxclusterrank"
!"minctasm" -> "nvvm.minctasm"
!"maxnreg" -> "nvvm.maxnreg"

llvmbot · 2025-02-05T18:48:23Z

@llvm/pr-subscribers-flang-openmp
@llvm/pr-subscribers-clang-codegen
@llvm/pr-subscribers-mlir-llvm
@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-mlir

Author: Alex MacLean (AlexMaclean)

Changes

Replace some more nvvm.annotations with function attributes, auto-upgrading the annotations as needed. These new attributes will be more idiomatic and compile-time efficient than the annotations.

!"maxclusterrank" / !"cluster_max_blocks" -> "nvvm.maxclusterrank"
!"minctasm" -> "nvvm.minctasm"
!"maxnreg" -> "nvvm.maxnreg"

Patch is 24.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125908.diff

13 Files Affected:

(modified) clang/lib/CodeGen/Targets/NVPTX.cpp (+5-10)
(modified) clang/test/CodeGenCUDA/launch-bounds.cu (+19-13)
(modified) llvm/docs/NVPTXUsage.rst (+23-14)
(modified) llvm/lib/IR/AutoUpgrade.cpp (+16)
(modified) llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp (+2-7)
(modified) llvm/lib/Target/NVPTX/NVPTXUtilities.cpp (+10-3)
(modified) llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll (+2-2)
(modified) llvm/test/CodeGen/NVPTX/annotations.ll (+3-9)
(modified) llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll (+9-7)
(modified) llvm/test/CodeGen/NVPTX/maxclusterrank.ll (+3-5)
(modified) llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll (+52-12)
(modified) mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp (+4-3)
(modified) mlir/test/Target/LLVMIR/nvvmir.mlir (+9-12)

diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index b82e4ddb9f3f2b..f89d32d4e13fe9 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -375,11 +375,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
     if (MinBlocks > 0) {
       if (MinBlocksVal)
         *MinBlocksVal = MinBlocks.getExtValue();
-      if (F) {
-        // Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
-        NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "minctasm",
-                                                MinBlocks.getExtValue());
-      }
+      if (F)
+        F->addFnAttr("nvvm.minctasm", llvm::utostr(MinBlocks.getExtValue()));
     }
   }
   if (Attr->getMaxBlocks()) {
@@ -388,11 +385,9 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
     if (MaxBlocks > 0) {
       if (MaxClusterRankVal)
         *MaxClusterRankVal = MaxBlocks.getExtValue();
-      if (F) {
-        // Create !{<func-ref>, metadata !"maxclusterrank", i32 <val>} node
-        NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxclusterrank",
-                                                MaxBlocks.getExtValue());
-      }
+      if (F)
+        F->addFnAttr("nvvm.maxclusterrank",
+                     llvm::utostr(MaxBlocks.getExtValue()));
     }
   }
 }
diff --git a/clang/test/CodeGenCUDA/launch-bounds.cu b/clang/test/CodeGenCUDA/launch-bounds.cu
index 31ca9216b413e9..72f7857264f8cf 100644
--- a/clang/test/CodeGenCUDA/launch-bounds.cu
+++ b/clang/test/CodeGenCUDA/launch-bounds.cu
@@ -9,6 +9,25 @@
 #define MAX_BLOCKS_PER_MP     4
 #endif
 
+// CHECK: @Kernel1() #[[ATTR0:[0-9]+]]
+// CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]]
+// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]]
+// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]]
+// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]]
+
+// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
+// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}}
+// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}}
+
+// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]]
+
+// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}}
+// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}}
+// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}}
+
 // Test both max threads per block and Min cta per sm.
 extern "C" {
 __global__ void
@@ -19,7 +38,6 @@ Kernel1()
 }
 
 // CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
-// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"minctasm", i32 2}
 
 #ifdef USE_MAX_BLOCKS
 // Test max threads per block and min/max cta per sm.
@@ -32,8 +50,6 @@ Kernel1_sm_90()
 }
 
 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"minctasm", i32 2}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxclusterrank", i32 4}
 #endif // USE_MAX_BLOCKS
 
 // Test only max threads per block. Min cta per sm defaults to 0, and
@@ -67,7 +83,6 @@ Kernel4()
 template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
 
 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"minctasm", i32 2}
 
 #ifdef USE_MAX_BLOCKS
 template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@@ -79,8 +94,6 @@ Kernel4_sm_90()
 template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
 
 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"minctasm", i32 2}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxclusterrank", i32 4}
 #endif //USE_MAX_BLOCKS
 
 const int constint = 100;
@@ -94,7 +107,6 @@ Kernel5()
 template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
 
 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"minctasm", i32 258}
 
 #ifdef USE_MAX_BLOCKS
 
@@ -109,8 +121,6 @@ Kernel5_sm_90()
 template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
 
 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"minctasm", i32 258}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxclusterrank", i32 260}
 #endif //USE_MAX_BLOCKS
 
 // Make sure we don't emit negative launch bounds values.
@@ -120,7 +130,6 @@ Kernel6()
 {
 }
 // CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx",
-// CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"minctasm",
 
 __global__ void
 __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
@@ -144,12 +153,9 @@ Kernel7_sm_90()
 const char constchar = 12;
 __global__ void __launch_bounds__(constint, constchar) Kernel8() {}
 // CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
-// CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"minctasm", i32 12
 
 #ifdef USE_MAX_BLOCKS
 const char constchar_2 = 14;
 __global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
 // CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
-// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"minctasm", i32 12
-// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxclusterrank", i32 14
 #endif // USE_MAX_BLOCKS
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 64dd2b84a1763e..97844be05e0316 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -33,17 +33,12 @@ Marking Functions as Kernels
 
 In PTX, there are two types of functions: *device functions*, which are only
 callable by device code, and *kernel functions*, which are callable by host
-code. By default, the back-end will emit device functions. Metadata is used to
-declare a function as a kernel function. This metadata is attached to the
-``nvvm.annotations`` named metadata object, and has the following format:
+code. By default, the back-end will emit device functions. The ``ptx_kernel``
+calling convention is used to declare a function as a kernel function.
 
-.. code-block:: text
-
-   !0 = !{<function-ref>, metadata !"kernel", i32 1}
-
-The first parameter is a reference to the kernel function. The following
-example shows a kernel function calling a device function in LLVM IR. The
-function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is not.
+The following example shows a kernel function calling a device function in LLVM
+IR. The function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is
+not.
 
 .. code-block:: llvm
 
@@ -53,18 +48,32 @@ function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is not.
       ret float %add
     }
 
-    define void @my_kernel(ptr %ptr) {
+    define ptx_kernel void @my_kernel(ptr %ptr) {
       %val = load float, ptr %ptr
       %ret = call float @my_fmad(float %val, float %val, float %val)
       store float %ret, ptr %ptr
       ret void
     }
 
-    !nvvm.annotations = !{!1}
-    !1 = !{ptr @my_kernel, !"kernel", i32 1}
-
 When compiled, the PTX kernel functions are callable by host-side code.
 
+.. _fnattrs:
+
+Function Attributes
+-------------------
+
+``"nvvm.maxclusterrank"="<n>"``
+    This attribute specifies the maximum number of blocks per cluster. Must be 
+    non-zero. Only supported for Hopper+.
+
+``"nvvm.minctasm"="<n>"``
+    This indicates a hint/directive to the compiler/driver, asking it to put at
+    least these many CTAs on an SM.
+
+``"nvvm.maxnreg"="<n>"``
+    This attribute indicates the maximum number of registers to be used for the
+    kernel function.
+
 
 .. _address_spaces:
 
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index e886a6012b219a..57072715366c9c 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/AutoUpgrade.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/BinaryFormat/Dwarf.h"
@@ -5043,6 +5044,21 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
         Idx, Attribute::getWithStackAlignment(GV->getContext(), StackAlign));
     return true;
   }
+  if (K == "maxclusterrank" || K == "cluster_max_blocks") {
+    const auto CV = mdconst::extract<ConstantInt>(V)->getZExtValue();
+    cast<Function>(GV)->addFnAttr("nvvm.maxclusterrank", llvm::utostr(CV));
+    return true;
+  }
+  if (K == "minctasm") {
+    const auto CV = mdconst::extract<ConstantInt>(V)->getZExtValue();
+    cast<Function>(GV)->addFnAttr("nvvm.minctasm", llvm::utostr(CV));
+    return true;
+  }
+  if (K == "maxnreg") {
+    const auto CV = mdconst::extract<ConstantInt>(V)->getZExtValue();
+    cast<Function>(GV)->addFnAttr("nvvm.maxnreg", llvm::utostr(CV));
+    return true;
+  }
 
   return false;
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
index c03ef8d33220c1..ae5922cba4ce3b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
@@ -70,18 +70,13 @@ static void addKernelMetadata(Module &M, Function *F) {
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
 
-  llvm::Metadata *BlockMDVals[] = {
-      llvm::ConstantAsMetadata::get(F),
-      llvm::MDString::get(Ctx, "maxclusterrank"),
-      llvm::ConstantAsMetadata::get(
-          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
+  F->addFnAttr("nvvm.maxclusterrank", "1");
+  F->setCallingConv(CallingConv::PTX_Kernel);
 
   // Append metadata to nvvm.annotations.
-  F->setCallingConv(CallingConv::PTX_Kernel);
   MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals));
   MD->addOperand(llvm::MDNode::get(Ctx, ThreadYMDVals));
   MD->addOperand(llvm::MDNode::get(Ctx, ThreadZMDVals));
-  MD->addOperand(llvm::MDNode::get(Ctx, BlockMDVals));
 }
 
 static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index a41943880807c5..187b8905750129 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -179,6 +179,13 @@ static bool argHasNVVMAnnotation(const Value &Val,
   return false;
 }
 
+static std::optional<unsigned> getFnAttrParsedIntOrNull(const Function &F,
+                                                        StringRef Attr) {
+  if (F.hasFnAttribute(Attr))
+    return F.getFnAttributeAsParsedInteger(Attr);
+  return std::nullopt;
+}
+
 bool isParamGridConstant(const Value &V) {
   if (const Argument *Arg = dyn_cast<Argument>(&V)) {
     // "grid_constant" counts argument indices starting from 1
@@ -277,7 +284,7 @@ std::optional<unsigned> getClusterDimz(const Function &F) {
 }
 
 std::optional<unsigned> getMaxClusterRank(const Function &F) {
-  return findOneNVVMAnnotation(&F, "maxclusterrank");
+  return getFnAttrParsedIntOrNull(F, "nvvm.maxclusterrank");
 }
 
 std::optional<unsigned> getReqNTIDx(const Function &F) {
@@ -303,11 +310,11 @@ std::optional<unsigned> getReqNTID(const Function &F) {
 }
 
 std::optional<unsigned> getMinCTASm(const Function &F) {
-  return findOneNVVMAnnotation(&F, "minctasm");
+  return getFnAttrParsedIntOrNull(F, "nvvm.minctasm");
 }
 
 std::optional<unsigned> getMaxNReg(const Function &F) {
-  return findOneNVVMAnnotation(&F, "maxnreg");
+  return getFnAttrParsedIntOrNull(F, "nvvm.maxnreg");
 }
 
 MaybeAlign getAlign(const Function &F, unsigned Index) {
diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
index 7a055c7152ec85..a0c06083c270bc 100644
--- a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
+++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
@@ -23,11 +23,12 @@ entry:
 attributes #0 = {
   "omp_target_num_teams"="100"
   "omp_target_thread_limit"="101"
+  "nvvm.maxclusterrank"="200"
 }
 
 !llvm.module.flags = !{!0}
 !llvm.dbg.cu = !{!1}
-!nvvm.annotations = !{!6, !7, !8, !9, !10}
+!nvvm.annotations = !{!7, !8, !9, !10}
 
 !0 = !{i32 2, !"Debug Info Version", i32 3}
 !1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
@@ -35,7 +36,6 @@ attributes #0 = {
 !3 = !{}
 !4 = !DISubroutineType(types: !3)
 !5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
-!6 = !{ptr @test, !"maxclusterrank", i32 200}
 !7 = !{ptr @test, !"maxntidx", i32 210}
 !8 = !{ptr @test, !"maxntidy", i32 211}
 !9 = !{ptr @test, !"maxntidz", i32 212}
diff --git a/llvm/test/CodeGen/NVPTX/annotations.ll b/llvm/test/CodeGen/NVPTX/annotations.ll
index 3bd534bb0cf5d2..1f888d7fb21f1e 100644
--- a/llvm/test/CodeGen/NVPTX/annotations.ll
+++ b/llvm/test/CodeGen/NVPTX/annotations.ll
@@ -23,20 +23,20 @@ define void @kernel_func_reqntid(ptr %a) {
 }
 
 ; CHECK: .entry kernel_func_minctasm
-define void @kernel_func_minctasm(ptr %a) {
+define ptx_kernel void @kernel_func_minctasm(ptr %a) "nvvm.minctasm"="42" {
 ; CHECK: .minnctapersm 42
 ; CHECK: ret
   ret void
 }
 
 ; CHECK-LABEL: .entry kernel_func_maxnreg
-define void @kernel_func_maxnreg() {
+define ptx_kernel void @kernel_func_maxnreg() "nvvm.maxnreg"="1234" {
 ; CHECK: .maxnreg 1234
 ; CHECK: ret
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3, !4, !5, !6, !7, !8, !9, !10}
+!nvvm.annotations = !{!1, !2, !3, !4, !9, !10}
 
 !1 = !{ptr @kernel_func_maxntid, !"kernel", i32 1}
 !2 = !{ptr @kernel_func_maxntid, !"maxntidx", i32 10, !"maxntidy", i32 20, !"maxntidz", i32 30}
@@ -44,11 +44,5 @@ define void @kernel_func_maxnreg() {
 !3 = !{ptr @kernel_func_reqntid, !"kernel", i32 1}
 !4 = !{ptr @kernel_func_reqntid, !"reqntidx", i32 11, !"reqntidy", i32 22, !"reqntidz", i32 33}
 
-!5 = !{ptr @kernel_func_minctasm, !"kernel", i32 1}
-!6 = !{ptr @kernel_func_minctasm, !"minctasm", i32 42}
-
-!7 = !{ptr @kernel_func_maxnreg, !"kernel", i32 1}
-!8 = !{ptr @kernel_func_maxnreg, !"maxnreg", i32 1234}
-
 !9 = !{ptr addrspace(1) @texture, !"texture", i32 1}
 !10 = !{ptr addrspace(1) @surface, !"surface", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
index 4ee1ca3ad4b1f0..71daa8ccef2f05 100644
--- a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
@@ -43,7 +43,8 @@ define internal void @bar() {
   ret void
 }
 
-; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$init"() {
+; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$init"
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BEGIN:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__init_array_start, align 8
 ; CHECK-NEXT:    [[STOP:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__init_array_end, align 8
@@ -60,7 +61,8 @@ define internal void @bar() {
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$fini"() {
+; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$fini"
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BEGIN:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__fini_array_start, align 8
 ; CHECK-NEXT:    [[STOP:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__fini_array_end, align 8
@@ -82,11 +84,11 @@ define internal void @bar() {
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret void
 
+; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" }
+
 ; CHECK: [[META1:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidx", i32 1}
 ; CHECK: [[META2:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidy", i32 1}
 ; CHECK: [[META3:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidz", i32 1}
-; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxclusterrank", i32 1}
-; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1}
-; CHECK: [[META7:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1}
-; CHECK: [[META8:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1}
-; CHECK: [[META9:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxclusterrank", i32 1}
+; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1}
+; CHECK: [[META5:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1}
+; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
index c445c34c1842a5..51483296dd34fe 100644
--- a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
+++ b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
@@ -10,16 +10,14 @@ target triple = "nvptx64-unknown-unknown"
 ; CHECK_SM_80-NOT: .maxclusterrank 8
 
 ; Make sure that for SM version prior to 90 `.maxclusterrank` directive is
-; sielently ignored.
-define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() {
+; silently ignored.
+define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" {
 entry:
   %a = alloca i32, align 4
   store volatile i32 1, ptr %a, align 4
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3}
+!nvvm.annotations = !{!1}
 
 !1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128}
-!2 = !{ptr @_Z18TestMaxClusterRankv, !"minctasm", i32 2}
-!3 = !{ptr @_Z18TestMaxClusterRankv, !"maxclusterrank", i32 8}
diff --git a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
index a9f370a12a945a..3a1f59454493cb 100644
--- a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
+++ b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
@@ -1,28 +1,68 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5
-; RUN: opt < %s -mtriple=nvptx64-unknown-unknown -O0 -S | FileCheck %s
+; RUN: opt < %s -passes=verify -S | FileCheck %s
 
-define i32 @foo(i32 %a, i32 %b) {
-; CHECK-LABEL: define i32 @foo(
+define i32 @test_align(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @test_align(
 ; CHECK-SAME: i32 alignstack(8) [[A:%.*]], i32 alignstack(16) [[B:%.*]]) {
 ; CHECK-NEXT:    ret i32 0
 ;
   ret i32 0
 }
 
-define i32 @bar(i32 %a, i32 %b) {
-; CHECK-LABEL: define ptx_kernel i32 @bar(
-; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; CHECK-NEXT:    ret i32 0
+define void @test_kernel() {
+; CHECK-LABEL: define ptx_kernel void @test_kernel() {
+; CHECK-NEXT:    ret void
 ;
-  ret i32 0
+  ret void
+}
+
+define void @test_maxclusterrank() {
+; CHECK-LABEL: define void @test_maxclusterrank(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
 }
 
-!nvvm.annotations = !{!0, !1, !2}
+define void @test_cluster_max_blocks() {
+; CHECK-LABEL: define void @test_cluster_max_blocks(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
 
-!0 = !{ptr @foo, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020010}
+define void @test_minctasm() {
+; CHECK-LABEL: define void @test_minctasm(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_maxnreg() {
+; CHECK-LABEL: define void @test_maxnreg(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6}
+
+!0 = !{ptr @test_align, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"...
[truncated]

llvmbot · 2025-02-05T18:48:23Z

@llvm/pr-subscribers-clang

Author: Alex MacLean (AlexMaclean)

Changes

Replace some more nvvm.annotations with function attributes, auto-upgrading the annotations as needed. These new attributes will be more idiomatic and compile-time efficient than the annotations.

!"maxclusterrank" / !"cluster_max_blocks" -> "nvvm.maxclusterrank"
!"minctasm" -> "nvvm.minctasm"
!"maxnreg" -> "nvvm.maxnreg"

Patch is 24.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125908.diff

13 Files Affected:

(modified) clang/lib/CodeGen/Targets/NVPTX.cpp (+5-10)
(modified) clang/test/CodeGenCUDA/launch-bounds.cu (+19-13)
(modified) llvm/docs/NVPTXUsage.rst (+23-14)
(modified) llvm/lib/IR/AutoUpgrade.cpp (+16)
(modified) llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp (+2-7)
(modified) llvm/lib/Target/NVPTX/NVPTXUtilities.cpp (+10-3)
(modified) llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll (+2-2)
(modified) llvm/test/CodeGen/NVPTX/annotations.ll (+3-9)
(modified) llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll (+9-7)
(modified) llvm/test/CodeGen/NVPTX/maxclusterrank.ll (+3-5)
(modified) llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll (+52-12)
(modified) mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp (+4-3)
(modified) mlir/test/Target/LLVMIR/nvvmir.mlir (+9-12)

diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index b82e4ddb9f3f2b..f89d32d4e13fe9 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -375,11 +375,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
     if (MinBlocks > 0) {
       if (MinBlocksVal)
         *MinBlocksVal = MinBlocks.getExtValue();
-      if (F) {
-        // Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
-        NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "minctasm",
-                                                MinBlocks.getExtValue());
-      }
+      if (F)
+        F->addFnAttr("nvvm.minctasm", llvm::utostr(MinBlocks.getExtValue()));
     }
   }
   if (Attr->getMaxBlocks()) {
@@ -388,11 +385,9 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
     if (MaxBlocks > 0) {
       if (MaxClusterRankVal)
         *MaxClusterRankVal = MaxBlocks.getExtValue();
-      if (F) {
-        // Create !{<func-ref>, metadata !"maxclusterrank", i32 <val>} node
-        NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxclusterrank",
-                                                MaxBlocks.getExtValue());
-      }
+      if (F)
+        F->addFnAttr("nvvm.maxclusterrank",
+                     llvm::utostr(MaxBlocks.getExtValue()));
     }
   }
 }
diff --git a/clang/test/CodeGenCUDA/launch-bounds.cu b/clang/test/CodeGenCUDA/launch-bounds.cu
index 31ca9216b413e9..72f7857264f8cf 100644
--- a/clang/test/CodeGenCUDA/launch-bounds.cu
+++ b/clang/test/CodeGenCUDA/launch-bounds.cu
@@ -9,6 +9,25 @@
 #define MAX_BLOCKS_PER_MP     4
 #endif
 
+// CHECK: @Kernel1() #[[ATTR0:[0-9]+]]
+// CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]]
+// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]]
+// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]]
+// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]]
+
+// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
+// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}}
+// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}}
+
+// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]]
+
+// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}}
+// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}}
+// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}}
+
 // Test both max threads per block and Min cta per sm.
 extern "C" {
 __global__ void
@@ -19,7 +38,6 @@ Kernel1()
 }
 
 // CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
-// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"minctasm", i32 2}
 
 #ifdef USE_MAX_BLOCKS
 // Test max threads per block and min/max cta per sm.
@@ -32,8 +50,6 @@ Kernel1_sm_90()
 }
 
 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"minctasm", i32 2}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxclusterrank", i32 4}
 #endif // USE_MAX_BLOCKS
 
 // Test only max threads per block. Min cta per sm defaults to 0, and
@@ -67,7 +83,6 @@ Kernel4()
 template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
 
 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"minctasm", i32 2}
 
 #ifdef USE_MAX_BLOCKS
 template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@@ -79,8 +94,6 @@ Kernel4_sm_90()
 template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
 
 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"minctasm", i32 2}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxclusterrank", i32 4}
 #endif //USE_MAX_BLOCKS
 
 const int constint = 100;
@@ -94,7 +107,6 @@ Kernel5()
 template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
 
 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"minctasm", i32 258}
 
 #ifdef USE_MAX_BLOCKS
 
@@ -109,8 +121,6 @@ Kernel5_sm_90()
 template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
 
 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"minctasm", i32 258}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxclusterrank", i32 260}
 #endif //USE_MAX_BLOCKS
 
 // Make sure we don't emit negative launch bounds values.
@@ -120,7 +130,6 @@ Kernel6()
 {
 }
 // CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx",
-// CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"minctasm",
 
 __global__ void
 __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
@@ -144,12 +153,9 @@ Kernel7_sm_90()
 const char constchar = 12;
 __global__ void __launch_bounds__(constint, constchar) Kernel8() {}
 // CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
-// CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"minctasm", i32 12
 
 #ifdef USE_MAX_BLOCKS
 const char constchar_2 = 14;
 __global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
 // CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
-// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"minctasm", i32 12
-// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxclusterrank", i32 14
 #endif // USE_MAX_BLOCKS
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 64dd2b84a1763e..97844be05e0316 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -33,17 +33,12 @@ Marking Functions as Kernels
 
 In PTX, there are two types of functions: *device functions*, which are only
 callable by device code, and *kernel functions*, which are callable by host
-code. By default, the back-end will emit device functions. Metadata is used to
-declare a function as a kernel function. This metadata is attached to the
-``nvvm.annotations`` named metadata object, and has the following format:
+code. By default, the back-end will emit device functions. The ``ptx_kernel``
+calling convention is used to declare a function as a kernel function.
 
-.. code-block:: text
-
-   !0 = !{<function-ref>, metadata !"kernel", i32 1}
-
-The first parameter is a reference to the kernel function. The following
-example shows a kernel function calling a device function in LLVM IR. The
-function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is not.
+The following example shows a kernel function calling a device function in LLVM
+IR. The function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is
+not.
 
 .. code-block:: llvm
 
@@ -53,18 +48,32 @@ function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is not.
       ret float %add
     }
 
-    define void @my_kernel(ptr %ptr) {
+    define ptx_kernel void @my_kernel(ptr %ptr) {
       %val = load float, ptr %ptr
       %ret = call float @my_fmad(float %val, float %val, float %val)
       store float %ret, ptr %ptr
       ret void
     }
 
-    !nvvm.annotations = !{!1}
-    !1 = !{ptr @my_kernel, !"kernel", i32 1}
-
 When compiled, the PTX kernel functions are callable by host-side code.
 
+.. _fnattrs:
+
+Function Attributes
+-------------------
+
+``"nvvm.maxclusterrank"="<n>"``
+    This attribute specifies the maximum number of blocks per cluster. Must be 
+    non-zero. Only supported for Hopper+.
+
+``"nvvm.minctasm"="<n>"``
+    This indicates a hint/directive to the compiler/driver, asking it to put at
+    least these many CTAs on an SM.
+
+``"nvvm.maxnreg"="<n>"``
+    This attribute indicates the maximum number of registers to be used for the
+    kernel function.
+
 
 .. _address_spaces:
 
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index e886a6012b219a..57072715366c9c 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/AutoUpgrade.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/BinaryFormat/Dwarf.h"
@@ -5043,6 +5044,21 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
         Idx, Attribute::getWithStackAlignment(GV->getContext(), StackAlign));
     return true;
   }
+  if (K == "maxclusterrank" || K == "cluster_max_blocks") {
+    const auto CV = mdconst::extract<ConstantInt>(V)->getZExtValue();
+    cast<Function>(GV)->addFnAttr("nvvm.maxclusterrank", llvm::utostr(CV));
+    return true;
+  }
+  if (K == "minctasm") {
+    const auto CV = mdconst::extract<ConstantInt>(V)->getZExtValue();
+    cast<Function>(GV)->addFnAttr("nvvm.minctasm", llvm::utostr(CV));
+    return true;
+  }
+  if (K == "maxnreg") {
+    const auto CV = mdconst::extract<ConstantInt>(V)->getZExtValue();
+    cast<Function>(GV)->addFnAttr("nvvm.maxnreg", llvm::utostr(CV));
+    return true;
+  }
 
   return false;
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
index c03ef8d33220c1..ae5922cba4ce3b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
@@ -70,18 +70,13 @@ static void addKernelMetadata(Module &M, Function *F) {
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
 
-  llvm::Metadata *BlockMDVals[] = {
-      llvm::ConstantAsMetadata::get(F),
-      llvm::MDString::get(Ctx, "maxclusterrank"),
-      llvm::ConstantAsMetadata::get(
-          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
+  F->addFnAttr("nvvm.maxclusterrank", "1");
+  F->setCallingConv(CallingConv::PTX_Kernel);
 
   // Append metadata to nvvm.annotations.
-  F->setCallingConv(CallingConv::PTX_Kernel);
   MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals));
   MD->addOperand(llvm::MDNode::get(Ctx, ThreadYMDVals));
   MD->addOperand(llvm::MDNode::get(Ctx, ThreadZMDVals));
-  MD->addOperand(llvm::MDNode::get(Ctx, BlockMDVals));
 }
 
 static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index a41943880807c5..187b8905750129 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -179,6 +179,13 @@ static bool argHasNVVMAnnotation(const Value &Val,
   return false;
 }
 
+static std::optional<unsigned> getFnAttrParsedIntOrNull(const Function &F,
+                                                        StringRef Attr) {
+  if (F.hasFnAttribute(Attr))
+    return F.getFnAttributeAsParsedInteger(Attr);
+  return std::nullopt;
+}
+
 bool isParamGridConstant(const Value &V) {
   if (const Argument *Arg = dyn_cast<Argument>(&V)) {
     // "grid_constant" counts argument indices starting from 1
@@ -277,7 +284,7 @@ std::optional<unsigned> getClusterDimz(const Function &F) {
 }
 
 std::optional<unsigned> getMaxClusterRank(const Function &F) {
-  return findOneNVVMAnnotation(&F, "maxclusterrank");
+  return getFnAttrParsedIntOrNull(F, "nvvm.maxclusterrank");
 }
 
 std::optional<unsigned> getReqNTIDx(const Function &F) {
@@ -303,11 +310,11 @@ std::optional<unsigned> getReqNTID(const Function &F) {
 }
 
 std::optional<unsigned> getMinCTASm(const Function &F) {
-  return findOneNVVMAnnotation(&F, "minctasm");
+  return getFnAttrParsedIntOrNull(F, "nvvm.minctasm");
 }
 
 std::optional<unsigned> getMaxNReg(const Function &F) {
-  return findOneNVVMAnnotation(&F, "maxnreg");
+  return getFnAttrParsedIntOrNull(F, "nvvm.maxnreg");
 }
 
 MaybeAlign getAlign(const Function &F, unsigned Index) {
diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
index 7a055c7152ec85..a0c06083c270bc 100644
--- a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
+++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
@@ -23,11 +23,12 @@ entry:
 attributes #0 = {
   "omp_target_num_teams"="100"
   "omp_target_thread_limit"="101"
+  "nvvm.maxclusterrank"="200"
 }
 
 !llvm.module.flags = !{!0}
 !llvm.dbg.cu = !{!1}
-!nvvm.annotations = !{!6, !7, !8, !9, !10}
+!nvvm.annotations = !{!7, !8, !9, !10}
 
 !0 = !{i32 2, !"Debug Info Version", i32 3}
 !1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
@@ -35,7 +36,6 @@ attributes #0 = {
 !3 = !{}
 !4 = !DISubroutineType(types: !3)
 !5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
-!6 = !{ptr @test, !"maxclusterrank", i32 200}
 !7 = !{ptr @test, !"maxntidx", i32 210}
 !8 = !{ptr @test, !"maxntidy", i32 211}
 !9 = !{ptr @test, !"maxntidz", i32 212}
diff --git a/llvm/test/CodeGen/NVPTX/annotations.ll b/llvm/test/CodeGen/NVPTX/annotations.ll
index 3bd534bb0cf5d2..1f888d7fb21f1e 100644
--- a/llvm/test/CodeGen/NVPTX/annotations.ll
+++ b/llvm/test/CodeGen/NVPTX/annotations.ll
@@ -23,20 +23,20 @@ define void @kernel_func_reqntid(ptr %a) {
 }
 
 ; CHECK: .entry kernel_func_minctasm
-define void @kernel_func_minctasm(ptr %a) {
+define ptx_kernel void @kernel_func_minctasm(ptr %a) "nvvm.minctasm"="42" {
 ; CHECK: .minnctapersm 42
 ; CHECK: ret
   ret void
 }
 
 ; CHECK-LABEL: .entry kernel_func_maxnreg
-define void @kernel_func_maxnreg() {
+define ptx_kernel void @kernel_func_maxnreg() "nvvm.maxnreg"="1234" {
 ; CHECK: .maxnreg 1234
 ; CHECK: ret
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3, !4, !5, !6, !7, !8, !9, !10}
+!nvvm.annotations = !{!1, !2, !3, !4, !9, !10}
 
 !1 = !{ptr @kernel_func_maxntid, !"kernel", i32 1}
 !2 = !{ptr @kernel_func_maxntid, !"maxntidx", i32 10, !"maxntidy", i32 20, !"maxntidz", i32 30}
@@ -44,11 +44,5 @@ define void @kernel_func_maxnreg() {
 !3 = !{ptr @kernel_func_reqntid, !"kernel", i32 1}
 !4 = !{ptr @kernel_func_reqntid, !"reqntidx", i32 11, !"reqntidy", i32 22, !"reqntidz", i32 33}
 
-!5 = !{ptr @kernel_func_minctasm, !"kernel", i32 1}
-!6 = !{ptr @kernel_func_minctasm, !"minctasm", i32 42}
-
-!7 = !{ptr @kernel_func_maxnreg, !"kernel", i32 1}
-!8 = !{ptr @kernel_func_maxnreg, !"maxnreg", i32 1234}
-
 !9 = !{ptr addrspace(1) @texture, !"texture", i32 1}
 !10 = !{ptr addrspace(1) @surface, !"surface", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
index 4ee1ca3ad4b1f0..71daa8ccef2f05 100644
--- a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
@@ -43,7 +43,8 @@ define internal void @bar() {
   ret void
 }
 
-; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$init"() {
+; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$init"
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BEGIN:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__init_array_start, align 8
 ; CHECK-NEXT:    [[STOP:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__init_array_end, align 8
@@ -60,7 +61,8 @@ define internal void @bar() {
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$fini"() {
+; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$fini"
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BEGIN:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__fini_array_start, align 8
 ; CHECK-NEXT:    [[STOP:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__fini_array_end, align 8
@@ -82,11 +84,11 @@ define internal void @bar() {
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret void
 
+; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" }
+
 ; CHECK: [[META1:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidx", i32 1}
 ; CHECK: [[META2:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidy", i32 1}
 ; CHECK: [[META3:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidz", i32 1}
-; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxclusterrank", i32 1}
-; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1}
-; CHECK: [[META7:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1}
-; CHECK: [[META8:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1}
-; CHECK: [[META9:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxclusterrank", i32 1}
+; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1}
+; CHECK: [[META5:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1}
+; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
index c445c34c1842a5..51483296dd34fe 100644
--- a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
+++ b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
@@ -10,16 +10,14 @@ target triple = "nvptx64-unknown-unknown"
 ; CHECK_SM_80-NOT: .maxclusterrank 8
 
 ; Make sure that for SM version prior to 90 `.maxclusterrank` directive is
-; sielently ignored.
-define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() {
+; silently ignored.
+define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" {
 entry:
   %a = alloca i32, align 4
   store volatile i32 1, ptr %a, align 4
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3}
+!nvvm.annotations = !{!1}
 
 !1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128}
-!2 = !{ptr @_Z18TestMaxClusterRankv, !"minctasm", i32 2}
-!3 = !{ptr @_Z18TestMaxClusterRankv, !"maxclusterrank", i32 8}
diff --git a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
index a9f370a12a945a..3a1f59454493cb 100644
--- a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
+++ b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
@@ -1,28 +1,68 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5
-; RUN: opt < %s -mtriple=nvptx64-unknown-unknown -O0 -S | FileCheck %s
+; RUN: opt < %s -passes=verify -S | FileCheck %s
 
-define i32 @foo(i32 %a, i32 %b) {
-; CHECK-LABEL: define i32 @foo(
+define i32 @test_align(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @test_align(
 ; CHECK-SAME: i32 alignstack(8) [[A:%.*]], i32 alignstack(16) [[B:%.*]]) {
 ; CHECK-NEXT:    ret i32 0
 ;
   ret i32 0
 }
 
-define i32 @bar(i32 %a, i32 %b) {
-; CHECK-LABEL: define ptx_kernel i32 @bar(
-; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; CHECK-NEXT:    ret i32 0
+define void @test_kernel() {
+; CHECK-LABEL: define ptx_kernel void @test_kernel() {
+; CHECK-NEXT:    ret void
 ;
-  ret i32 0
+  ret void
+}
+
+define void @test_maxclusterrank() {
+; CHECK-LABEL: define void @test_maxclusterrank(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
 }
 
-!nvvm.annotations = !{!0, !1, !2}
+define void @test_cluster_max_blocks() {
+; CHECK-LABEL: define void @test_cluster_max_blocks(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
 
-!0 = !{ptr @foo, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020010}
+define void @test_minctasm() {
+; CHECK-LABEL: define void @test_minctasm(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_maxnreg() {
+; CHECK-LABEL: define void @test_maxnreg(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6}
+
+!0 = !{ptr @test_align, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"...
[truncated]

github-actions · 2025-02-05T18:51:11Z

✅ With the latest revision this PR passed the C/C++ code formatter.

Artem-B · 2025-02-05T19:11:00Z

llvm/lib/Target/NVPTX/NVPTXUtilities.cpp

@@ -179,6 +179,13 @@ static bool argHasNVVMAnnotation(const Value &Val,
  return false;
 }

+static std::optional<unsigned> getFnAttrParsedIntOrNull(const Function &F,


Nit: OrNull is kind of implied by return type being optional.

Artem-B · 2025-02-05T19:12:44Z

llvm/lib/Target/NVPTX/NVPTXUtilities.cpp

+  if (F.hasFnAttribute(Attr))
+    return F.getFnAttributeAsParsedInteger(Attr);
+  return std::nullopt;


Nit: Could be just return F.hasFnAttribute(Attr) ? F.getFnAttributeAsParsedInteger(Attr) : {};

Had to be a little more explicit to make the compiler happy but I've switched to a ternary as requested.

Artem-B · 2025-02-06T01:07:47Z

llvm/lib/Target/NVPTX/NVPTXUtilities.cpp

+  return F.hasFnAttribute(Attr)
+             ? std::optional(F.getFnAttributeAsParsedInteger(Attr))
+             : std::nullopt;


Ugh. {} can't be used in a ternary, and using std::nullopt forces explicit optional use in the other branch. :-/
That ended up being a wash. Sorry about the noise.

Just in case. Comments marked as "nit" are up to you. It includes ignoring them or pushing back when those suggestions don't make sense or turn out not being worth it.

No worries! I agree it is basically a wash and will leave it as it currently is.

jholewinski · 2025-02-10T16:27:40Z

clang/lib/CodeGen/Targets/NVPTX.cpp

-                                                MinBlocks.getExtValue());
-      }
+      if (F)
+        F->addFnAttr("nvvm.minctasm", llvm::utostr(MinBlocks.getExtValue()));


We should eventually create a list of strings for valid nvvm. function attributes and use them here instead of hard-coding strings. It would serve as a single-source-of-truth for the set of valid attributes. Not necessary for this PR, but something to consider for the future.

grypp · 2025-02-12T07:31:46Z

mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp

@@ -227,14 +228,14 @@ class NVVMDialectLLVMIRTranslationInterface
    } else if (attribute.getName() ==


What about the other attributes above? Do we plan to change them as well? If yes, it'd be better to do everything at once.

Yes, I plan to replace all !nvvm.annotations with attributes. This change is already fairly large and I would prefer to avoid a single monolithic PR to make debugging any issues easier and to prevent unnecessary churn if it needs to be reverted. Would it be alright to address these now and the others in separate follow ups?

Sure sounds great

…125908) Replace some more nvvm.annotations with function attributes, auto-upgrading the annotations as needed. These new attributes will be more idiomatic and compile-time efficient than the annotations. - !"maxclusterrank" / !"cluster_max_blocks" -> "nvvm.maxclusterrank" - !"minctasm" -> "nvvm.minctasm" - !"maxnreg" -> "nvvm.maxnreg"

AlexMaclean requested review from jholewinski, Artem-B and justinfargnoli February 5, 2025 18:47

AlexMaclean self-assigned this Feb 5, 2025

AlexMaclean requested a review from grypp as a code owner February 5, 2025 18:47

llvmbot added clang Clang issues not falling into any other category clang:codegen IR generation bugs: mangling, exceptions, etc. mlir:llvm mlir backend:NVPTX llvm:ir llvm:analysis Includes value tracking, cost tables and constant folding labels Feb 5, 2025

AlexMaclean force-pushed the dev/amaclean/upstream/more-auto-upgrade branch from 8dd9f3b to d66d8ad Compare February 5, 2025 18:56

Artem-B approved these changes Feb 5, 2025

View reviewed changes

AlexMaclean force-pushed the dev/amaclean/upstream/more-auto-upgrade branch from d66d8ad to 12bdf8b Compare February 5, 2025 23:40

llvmbot added flang:openmp clang:openmp OpenMP related changes to Clang labels Feb 5, 2025

Artem-B reviewed Feb 6, 2025

View reviewed changes

jholewinski reviewed Feb 10, 2025

View reviewed changes

jholewinski approved these changes Feb 10, 2025

View reviewed changes

AlexMaclean added 2 commits February 11, 2025 21:28

[NVPTX] Convert scalar function nvvm.annotations to attributes

cb6ac07

address comments

503b5da

AlexMaclean force-pushed the dev/amaclean/upstream/more-auto-upgrade branch from 717a01d to 503b5da Compare February 11, 2025 21:28

grypp reviewed Feb 12, 2025

View reviewed changes

AlexMaclean merged commit a282b6c into llvm:main Feb 12, 2025
9 checks passed

maleadt mentioned this pull request Feb 5, 2025

PTX: Switch from NVVM kernel annotation to ptx_kernel calling convention and function attributes JuliaGPU/GPUCompiler.jl#657

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[NVPTX] Convert scalar function nvvm.annotations to attributes #125908

[NVPTX] Convert scalar function nvvm.annotations to attributes #125908

Uh oh!

AlexMaclean commented Feb 5, 2025

Uh oh!

llvmbot commented Feb 5, 2025 •

edited

Loading

Uh oh!

llvmbot commented Feb 5, 2025

Uh oh!

github-actions bot commented Feb 5, 2025 •

edited

Loading

Uh oh!

Artem-B Feb 5, 2025

Uh oh!

AlexMaclean Feb 5, 2025

Uh oh!

Artem-B Feb 5, 2025

Uh oh!

AlexMaclean Feb 5, 2025

Uh oh!

Artem-B Feb 6, 2025

Uh oh!

AlexMaclean Feb 7, 2025

Uh oh!

jholewinski Feb 10, 2025

Uh oh!

grypp Feb 12, 2025

Uh oh!

AlexMaclean Feb 12, 2025

Uh oh!

grypp Feb 12, 2025

Uh oh!

Uh oh!

Uh oh!

		@@ -227,14 +228,14 @@ class NVVMDialectLLVMIRTranslationInterface
		} else if (attribute.getName() ==

[NVPTX] Convert scalar function nvvm.annotations to attributes #125908

[NVPTX] Convert scalar function nvvm.annotations to attributes #125908

Uh oh!

Conversation

AlexMaclean commented Feb 5, 2025

Uh oh!

llvmbot commented Feb 5, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Feb 5, 2025

Uh oh!

github-actions bot commented Feb 5, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

llvmbot commented Feb 5, 2025 •

edited

Loading

github-actions bot commented Feb 5, 2025 •

edited

Loading