AMDGPU/PromoteAlloca: Fix handling of users of multiple allocas (#172771)

macurtis-amd · web-flow · commit e741cd88a10b · 2025-12-18T08:41:21.000-06:00
With recent refactoring, LDS promotion worklists for all allocas are
populated upfront. In some cases, this results in a User in multiple
lists. Then as each list is processed, a User might get deleted via
removeFromParent, potentially leaving a dangling pointer in a subsequent
worklist.

Currently this only occurs for memcpy and memmove. Prior to refactoring,
these were handled by DeferredInstr, and were processed after the last
use of the then singular worklist.

This change moves processing of DeferredInstr to after all worklists
have be processed.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -159,7 +159,10 @@ class AMDGPUPromoteAllocaImpl {
   void analyzePromoteToVector(AllocaAnalysis &AA) const;
   void promoteAllocaToVector(AllocaAnalysis &AA);
   void analyzePromoteToLDS(AllocaAnalysis &AA) const;
-  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS);
+  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
+                             SetVector<IntrinsicInst *> &DeferredIntrs);
+  void
+  finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);
 
   void scoreAlloca(AllocaAnalysis &AA) const;
 
@@ -414,6 +417,7 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   // clang-format on
 
   bool Changed = false;
+  SetVector<IntrinsicInst *> DeferredIntrs;
   for (AllocaAnalysis &AA : Allocas) {
     if (AA.Vector.Ty) {
       const unsigned AllocaCost =
@@ -435,9 +439,11 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
       }
     }
 
-    if (AA.LDS.Enable && tryPromoteAllocaToLDS(AA, SufficientLDS))
+    if (AA.LDS.Enable &&
+        tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
       Changed = true;
   }
+  finishDeferredAllocaToLDSPromotion(DeferredIntrs);
 
   // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
   // dangling pointers. If we want to reuse it past this point, the loop above
@@ -1550,8 +1556,9 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
 }
 
 // FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
-                                                    bool SufficientLDS) {
+bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
+    AllocaAnalysis &AA, bool SufficientLDS,
+    SetVector<IntrinsicInst *> &DeferredIntrs) {
   LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
 
   // Not likely to have sufficient local memory for promotion.
@@ -1620,8 +1627,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
   AA.Alloca->replaceAllUsesWith(Offset);
   AA.Alloca->eraseFromParent();
 
-  SmallVector<IntrinsicInst *> DeferredIntrs;
-
   PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
 
   for (Value *V : AA.LDS.Worklist) {
@@ -1682,7 +1687,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
       // These have 2 pointer operands. In case if second pointer also needs
       // to be replaced we defer processing of these intrinsics until all
       // other values are processed.
-      DeferredIntrs.push_back(Intr);
+      DeferredIntrs.insert(Intr);
       continue;
     case Intrinsic::memset: {
       MemSetInst *MemSet = cast<MemSetInst>(Intr);
@@ -1730,7 +1735,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
     }
   }
 
+  return true;
+}
+
+void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
+    SetVector<IntrinsicInst *> &DeferredIntrs) {
+
   for (IntrinsicInst *Intr : DeferredIntrs) {
+    IRBuilder<> Builder(Intr);
     Builder.SetInsertPoint(Intr);
     Intrinsic::ID ID = Intr->getIntrinsicID();
     assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
@@ -1748,6 +1760,4 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
 
     Intr->eraseFromParent();
   }
-
-  return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-user-mult.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-user-mult.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; This tests the case where a memcpy has two pointer operands are promoted to LDS
+; See `@llvm.memcpy.p5.p5.i64(... %alloca1, ... %alloca, ...)` below.
+
+
+%struct.barney = type { i8, double }
+
+; Function Attrs: nofree norecurse noreturn nounwind memory(readwrite, target_mem0: none, target_mem1: none)
+define amdgpu_kernel void @zot() local_unnamed_addr #0 {
+; CHECK-LABEL: @zot(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [[STRUCT_BARNEY:%.*]]], ptr addrspace(3) @zot.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP15]], i64 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(4) [[TMP16]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP15]], i64 2
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr addrspace(4) [[TMP18]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP20:%.*]] = lshr i32 [[TMP17]], 16
+; CHECK-NEXT:    [[TMP21:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP22:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP23:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw nsw i32 [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP25:%.*]] = mul i32 [[TMP24]], [[TMP21]]
+; CHECK-NEXT:    [[TMP26:%.*]] = mul nuw nsw i32 [[TMP22]], [[TMP19]]
+; CHECK-NEXT:    [[TMP27:%.*]] = add i32 [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], [[TMP23]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [1024 x [[STRUCT_BARNEY]]], ptr addrspace(3) @zot.alloca1, i32 0, i32 [[TMP28]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(5) null, align 2147483648
+; CHECK-NEXT:    call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 16 dereferenceable(16) [[TMP29]], ptr addrspace(3) align 16 dereferenceable(16) [[TMP14]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 16 dereferenceable(16) [[TMP14]], ptr align 1 dereferenceable(16) poison, i64 16, i1 false)
+; CHECK-NEXT:    [[LOAD:%.*]] = load volatile ptr, ptr addrspace(5) null, align 2147483648
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p3.i64(ptr align 1 dereferenceable(16) @hoge, ptr addrspace(3) align 16 dereferenceable(16) [[TMP29]], i64 16, i1 false)
+; CHECK-NEXT:    br label [[BB2]]
+;
+bb:
+  %alloca = alloca %struct.barney, align 16, addrspace(5)
+  %alloca1 = alloca %struct.barney, align 16, addrspace(5)
+  store i32 0, ptr addrspace(5) null, align 2147483648
+  call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef align 16 dereferenceable(16) %alloca1, ptr addrspace(5) noundef align 16 dereferenceable(16) %alloca, i64 16, i1 false)
+  call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef align 16 dereferenceable(16) %alloca, ptr noundef nonnull align 1 dereferenceable(16) poison, i64 16, i1 false)
+  %load = load volatile ptr, ptr addrspace(5) null, align 2147483648
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  call void @llvm.memcpy.p0.p5.i64(ptr noundef nonnull align 1 dereferenceable(16) @hoge, ptr addrspace(5) noundef align 16 dereferenceable(16) %alloca1, i64 16, i1 false)
+  br label %bb2
+}
+
+declare ptr @hoge() local_unnamed_addr #1
+
+attributes #0 = { nofree norecurse noreturn nounwind memory(readwrite, target_mem0: none, target_mem1: none) "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+attributes #1 = { "uniform-work-group-size"="false" }