Skip to content

Commit e741cd8

Browse files
authored
AMDGPU/PromoteAlloca: Fix handling of users of multiple allocas (#172771)
With recent refactoring, LDS promotion worklists for all allocas are populated upfront. In some cases, this results in a User in multiple lists. Then as each list is processed, a User might get deleted via removeFromParent, potentially leaving a dangling pointer in a subsequent worklist. Currently this only occurs for memcpy and memmove. Prior to refactoring, these were handled by DeferredInstr, and were processed after the last use of the then singular worklist. This change moves processing of DeferredInstr to after all worklists have be processed.
1 parent 4e675a0 commit e741cd8

File tree

2 files changed

+89
-9
lines changed

2 files changed

+89
-9
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,10 @@ class AMDGPUPromoteAllocaImpl {
159159
void analyzePromoteToVector(AllocaAnalysis &AA) const;
160160
void promoteAllocaToVector(AllocaAnalysis &AA);
161161
void analyzePromoteToLDS(AllocaAnalysis &AA) const;
162-
bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS);
162+
bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
163+
SetVector<IntrinsicInst *> &DeferredIntrs);
164+
void
165+
finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);
163166

164167
void scoreAlloca(AllocaAnalysis &AA) const;
165168

@@ -414,6 +417,7 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
414417
// clang-format on
415418

416419
bool Changed = false;
420+
SetVector<IntrinsicInst *> DeferredIntrs;
417421
for (AllocaAnalysis &AA : Allocas) {
418422
if (AA.Vector.Ty) {
419423
const unsigned AllocaCost =
@@ -435,9 +439,11 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
435439
}
436440
}
437441

438-
if (AA.LDS.Enable && tryPromoteAllocaToLDS(AA, SufficientLDS))
442+
if (AA.LDS.Enable &&
443+
tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
439444
Changed = true;
440445
}
446+
finishDeferredAllocaToLDSPromotion(DeferredIntrs);
441447

442448
// NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
443449
// dangling pointers. If we want to reuse it past this point, the loop above
@@ -1550,8 +1556,9 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
15501556
}
15511557

15521558
// FIXME: Should try to pick the most likely to be profitable allocas first.
1553-
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
1554-
bool SufficientLDS) {
1559+
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
1560+
AllocaAnalysis &AA, bool SufficientLDS,
1561+
SetVector<IntrinsicInst *> &DeferredIntrs) {
15551562
LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
15561563

15571564
// Not likely to have sufficient local memory for promotion.
@@ -1620,8 +1627,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
16201627
AA.Alloca->replaceAllUsesWith(Offset);
16211628
AA.Alloca->eraseFromParent();
16221629

1623-
SmallVector<IntrinsicInst *> DeferredIntrs;
1624-
16251630
PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
16261631

16271632
for (Value *V : AA.LDS.Worklist) {
@@ -1682,7 +1687,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
16821687
// These have 2 pointer operands. In case if second pointer also needs
16831688
// to be replaced we defer processing of these intrinsics until all
16841689
// other values are processed.
1685-
DeferredIntrs.push_back(Intr);
1690+
DeferredIntrs.insert(Intr);
16861691
continue;
16871692
case Intrinsic::memset: {
16881693
MemSetInst *MemSet = cast<MemSetInst>(Intr);
@@ -1730,7 +1735,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
17301735
}
17311736
}
17321737

1738+
return true;
1739+
}
1740+
1741+
void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1742+
SetVector<IntrinsicInst *> &DeferredIntrs) {
1743+
17331744
for (IntrinsicInst *Intr : DeferredIntrs) {
1745+
IRBuilder<> Builder(Intr);
17341746
Builder.SetInsertPoint(Intr);
17351747
Intrinsic::ID ID = Intr->getIntrinsicID();
17361748
assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
@@ -1748,6 +1760,4 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
17481760

17491761
Intr->eraseFromParent();
17501762
}
1751-
1752-
return true;
17531763
}
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-promote-alloca < %s | FileCheck %s
3+
4+
; This tests the case where a memcpy has two pointer operands are promoted to LDS
5+
; See `@llvm.memcpy.p5.p5.i64(... %alloca1, ... %alloca, ...)` below.
6+
7+
8+
%struct.barney = type { i8, double }
9+
10+
; Function Attrs: nofree norecurse noreturn nounwind memory(readwrite, target_mem0: none, target_mem1: none)
11+
define amdgpu_kernel void @zot() local_unnamed_addr #0 {
12+
; CHECK-LABEL: @zot(
13+
; CHECK-NEXT: bb:
14+
; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
15+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
16+
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0:![0-9]+]]
17+
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
18+
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load [[META0]]
19+
; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
20+
; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
21+
; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
22+
; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
23+
; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
24+
; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
25+
; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
26+
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
27+
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
28+
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [[STRUCT_BARNEY:%.*]]], ptr addrspace(3) @zot.alloca, i32 0, i32 [[TMP13]]
29+
; CHECK-NEXT: [[TMP15:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
30+
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP15]], i64 1
31+
; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(4) [[TMP16]], align 4, !invariant.load [[META0]]
32+
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP15]], i64 2
33+
; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(4) [[TMP18]], align 4, !range [[RNG1]], !invariant.load [[META0]]
34+
; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP17]], 16
35+
; CHECK-NEXT: [[TMP21:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
36+
; CHECK-NEXT: [[TMP22:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
37+
; CHECK-NEXT: [[TMP23:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
38+
; CHECK-NEXT: [[TMP24:%.*]] = mul nuw nsw i32 [[TMP20]], [[TMP19]]
39+
; CHECK-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], [[TMP21]]
40+
; CHECK-NEXT: [[TMP26:%.*]] = mul nuw nsw i32 [[TMP22]], [[TMP19]]
41+
; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP25]], [[TMP26]]
42+
; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], [[TMP23]]
43+
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [1024 x [[STRUCT_BARNEY]]], ptr addrspace(3) @zot.alloca1, i32 0, i32 [[TMP28]]
44+
; CHECK-NEXT: store i32 0, ptr addrspace(5) null, align 2147483648
45+
; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 16 dereferenceable(16) [[TMP29]], ptr addrspace(3) align 16 dereferenceable(16) [[TMP14]], i64 16, i1 false)
46+
; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 16 dereferenceable(16) [[TMP14]], ptr align 1 dereferenceable(16) poison, i64 16, i1 false)
47+
; CHECK-NEXT: [[LOAD:%.*]] = load volatile ptr, ptr addrspace(5) null, align 2147483648
48+
; CHECK-NEXT: br label [[BB2:%.*]]
49+
; CHECK: bb2:
50+
; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 1 dereferenceable(16) @hoge, ptr addrspace(3) align 16 dereferenceable(16) [[TMP29]], i64 16, i1 false)
51+
; CHECK-NEXT: br label [[BB2]]
52+
;
53+
bb:
54+
%alloca = alloca %struct.barney, align 16, addrspace(5)
55+
%alloca1 = alloca %struct.barney, align 16, addrspace(5)
56+
store i32 0, ptr addrspace(5) null, align 2147483648
57+
call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef align 16 dereferenceable(16) %alloca1, ptr addrspace(5) noundef align 16 dereferenceable(16) %alloca, i64 16, i1 false)
58+
call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef align 16 dereferenceable(16) %alloca, ptr noundef nonnull align 1 dereferenceable(16) poison, i64 16, i1 false)
59+
%load = load volatile ptr, ptr addrspace(5) null, align 2147483648
60+
br label %bb2
61+
62+
bb2: ; preds = %bb2, %bb
63+
call void @llvm.memcpy.p0.p5.i64(ptr noundef nonnull align 1 dereferenceable(16) @hoge, ptr addrspace(5) noundef align 16 dereferenceable(16) %alloca1, i64 16, i1 false)
64+
br label %bb2
65+
}
66+
67+
declare ptr @hoge() local_unnamed_addr #1
68+
69+
attributes #0 = { nofree norecurse noreturn nounwind memory(readwrite, target_mem0: none, target_mem1: none) "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
70+
attributes #1 = { "uniform-work-group-size"="false" }

0 commit comments

Comments
 (0)