Skip to content

Commit 7c2efb4

Browse files
AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize
1 parent 57e9aec commit 7c2efb4

File tree

6 files changed

+246
-186
lines changed

6 files changed

+246
-186
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 213 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
#include "GCNSubtarget.h"
2424
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
2525
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
26+
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
27+
#include "llvm/CodeGen/GlobalISel/Utils.h"
2628
#include "llvm/CodeGen/MachineFunctionPass.h"
2729
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
2830
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -115,126 +117,233 @@ class AMDGPURegBankLegalizeCombiner {
115117
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
116118
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};
117119

118-
bool isLaneMask(Register Reg) {
119-
const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
120-
if (RB && RB->getID() == AMDGPU::VCCRegBankID)
121-
return true;
120+
bool isLaneMask(Register Reg);
121+
std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode);
122+
std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src);
123+
Register getReadAnyLaneSrc(Register Src);
124+
void replaceRegWithOrBuildCopy(Register Dst, Register Src);
122125

123-
const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
124-
return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
125-
}
126+
bool tryEliminateReadAnyLane(MachineInstr &Copy);
127+
void tryCombineCopy(MachineInstr &MI);
128+
void tryCombineS1AnyExt(MachineInstr &MI);
129+
};
126130

127-
void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
128-
MI.eraseFromParent();
129-
if (Optional0 && isTriviallyDead(*Optional0, MRI))
130-
Optional0->eraseFromParent();
131-
}
131+
bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) {
132+
const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
133+
if (RB && RB->getID() == AMDGPU::VCCRegBankID)
134+
return true;
132135

133-
std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
134-
MachineInstr *MatchMI = MRI.getVRegDef(Src);
135-
if (MatchMI->getOpcode() != Opcode)
136-
return {nullptr, Register()};
137-
return {MatchMI, MatchMI->getOperand(1).getReg()};
138-
}
136+
const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
137+
return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
138+
}
139139

140-
void tryCombineCopy(MachineInstr &MI) {
141-
Register Dst = MI.getOperand(0).getReg();
142-
Register Src = MI.getOperand(1).getReg();
143-
// Skip copies of physical registers.
144-
if (!Dst.isVirtual() || !Src.isVirtual())
145-
return;
146-
147-
// This is a cross bank copy, sgpr S1 to lane mask.
148-
//
149-
// %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
150-
// %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
151-
// ->
152-
// %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
153-
if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
154-
auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
155-
assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
156-
"sgpr S1 must be result of G_TRUNC of sgpr S32");
157-
158-
B.setInstr(MI);
159-
// Ensure that truncated bits in BoolSrc are 0.
160-
auto One = B.buildConstant({SgprRB, S32}, 1);
161-
auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
162-
B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
163-
cleanUpAfterCombine(MI, Trunc);
164-
return;
165-
}
140+
std::pair<MachineInstr *, Register>
141+
AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) {
142+
MachineInstr *MatchMI = MRI.getVRegDef(Src);
143+
if (MatchMI->getOpcode() != Opcode)
144+
return {nullptr, Register()};
145+
return {MatchMI, MatchMI->getOperand(1).getReg()};
146+
}
147+
148+
std::pair<GUnmerge *, int>
149+
AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
150+
MachineInstr *ReadAnyLane = MRI.getVRegDef(Src);
151+
if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE)
152+
return {nullptr, -1};
153+
154+
Register RALSrc = ReadAnyLane->getOperand(1).getReg();
155+
if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI))
156+
return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)};
166157

167-
// Src = G_AMDGPU_READANYLANE RALSrc
168-
// Dst = COPY Src
169-
// ->
170-
// Dst = RALSrc
171-
if (MRI.getRegBankOrNull(Dst) == VgprRB &&
172-
MRI.getRegBankOrNull(Src) == SgprRB) {
173-
auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
174-
if (!RAL)
175-
return;
176-
177-
assert(MRI.getRegBank(RALSrc) == VgprRB);
178-
MRI.replaceRegWith(Dst, RALSrc);
179-
cleanUpAfterCombine(MI, RAL);
180-
return;
158+
return {nullptr, -1};
159+
}
160+
161+
Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
162+
// Src = G_AMDGPU_READANYLANE RALSrc
163+
auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
164+
if (RAL)
165+
return RALSrc;
166+
167+
// LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
168+
// LoSgpr = G_AMDGPU_READANYLANE LoVgpr
169+
// HiSgpr = G_AMDGPU_READANYLANE HiVgpr
170+
// Src = G_MERGE_VALUES LoSgpr, HiSgpr
171+
auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
172+
if (Merge) {
173+
unsigned NumElts = Merge->getNumSources();
174+
auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
175+
if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
176+
return {};
177+
178+
// Check if all elements are from same unmerge and there is no shuffling.
179+
for (unsigned i = 1; i < NumElts; ++i) {
180+
auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
181+
if (UnmergeI != Unmerge || (unsigned)IdxI != i)
182+
return {};
181183
}
184+
return Unmerge->getSourceReg();
182185
}
183186

184-
void tryCombineS1AnyExt(MachineInstr &MI) {
185-
// %Src:sgpr(S1) = G_TRUNC %TruncSrc
186-
// %Dst = G_ANYEXT %Src:sgpr(S1)
187-
// ->
188-
// %Dst = G_... %TruncSrc
189-
Register Dst = MI.getOperand(0).getReg();
190-
Register Src = MI.getOperand(1).getReg();
191-
if (MRI.getType(Src) != S1)
192-
return;
193-
194-
auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
195-
if (!Trunc)
196-
return;
197-
198-
LLT DstTy = MRI.getType(Dst);
199-
LLT TruncSrcTy = MRI.getType(TruncSrc);
200-
201-
if (DstTy == TruncSrcTy) {
202-
MRI.replaceRegWith(Dst, TruncSrc);
203-
cleanUpAfterCombine(MI, Trunc);
204-
return;
205-
}
187+
// ..., VgprI, ... = G_UNMERGE_VALUES VgprLarge
188+
// SgprI = G_AMDGPU_READANYLANE VgprI
189+
// SgprLarge = G_MERGE_VALUES ..., SgprI, ...
190+
// ..., Src, ... = G_UNMERGE_VALUES SgprLarge
191+
auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
192+
if (!UnMerge)
193+
return {};
194+
195+
int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
196+
Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
197+
if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources())
198+
return {};
199+
200+
Register SrcRegIdx = Merge->getSourceReg(Idx);
201+
if (MRI.getType(Src) != MRI.getType(SrcRegIdx))
202+
return {};
203+
204+
auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE);
205+
if (RALEl)
206+
return RALElSrc;
207+
208+
return {};
209+
}
210+
211+
void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst,
212+
Register Src) {
213+
if (Dst.isVirtual())
214+
MRI.replaceRegWith(Dst, Src);
215+
else
216+
B.buildCopy(Dst, Src);
217+
}
218+
219+
bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane(
220+
MachineInstr &Copy) {
221+
Register Dst = Copy.getOperand(0).getReg();
222+
Register Src = Copy.getOperand(1).getReg();
223+
224+
// Skip non-vgpr Dst
225+
if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB)
226+
: !TRI.isVGPR(MRI, Dst))
227+
return false;
228+
229+
// Skip physical source registers and source registers with register class
230+
if (!Src.isVirtual() || MRI.getRegClassOrNull(Src))
231+
return false;
232+
233+
Register RALDst = Src;
234+
MachineInstr &SrcMI = *MRI.getVRegDef(Src);
235+
if (SrcMI.getOpcode() == AMDGPU::G_BITCAST)
236+
RALDst = SrcMI.getOperand(1).getReg();
237+
238+
Register RALSrc = getReadAnyLaneSrc(RALDst);
239+
if (!RALSrc)
240+
return false;
241+
242+
B.setInstr(Copy);
243+
if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
244+
// Src = READANYLANE RALSrc Src = READANYLANE RALSrc
245+
// Dst = Copy Src $Dst = Copy Src
246+
// -> ->
247+
// Dst = RALSrc $Dst = Copy RALSrc
248+
replaceRegWithOrBuildCopy(Dst, RALSrc);
249+
} else {
250+
// RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc
251+
// Src = G_BITCAST RALDst Src = G_BITCAST RALDst
252+
// Dst = Copy Src $Dst = Copy Src
253+
// -> ->
254+
// NewVgpr = G_BITCAST RALSrc NewVgpr = G_BITCAST RALSrc
255+
// Dst = NewVgpr $Dst = Copy NewVgpr
256+
auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
257+
replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0));
258+
}
259+
260+
eraseInstr(Copy, MRI);
261+
return true;
262+
}
263+
264+
void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) {
265+
if (tryEliminateReadAnyLane(MI))
266+
return;
267+
268+
Register Dst = MI.getOperand(0).getReg();
269+
Register Src = MI.getOperand(1).getReg();
270+
// Skip copies of physical registers.
271+
if (!Dst.isVirtual() || !Src.isVirtual())
272+
return;
273+
274+
// This is a cross bank copy, sgpr S1 to lane mask.
275+
//
276+
// %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
277+
// %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
278+
// ->
279+
// %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
280+
if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
281+
auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
282+
assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
283+
"sgpr S1 must be result of G_TRUNC of sgpr S32");
206284

207285
B.setInstr(MI);
286+
// Ensure that truncated bits in BoolSrc are 0.
287+
auto One = B.buildConstant({SgprRB, S32}, 1);
288+
auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
289+
B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
290+
eraseInstr(MI, MRI);
291+
}
292+
}
208293

209-
if (DstTy == S32 && TruncSrcTy == S64) {
210-
auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
211-
MRI.replaceRegWith(Dst, Unmerge.getReg(0));
212-
cleanUpAfterCombine(MI, Trunc);
213-
return;
214-
}
294+
void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) {
295+
// %Src:sgpr(S1) = G_TRUNC %TruncSrc
296+
// %Dst = G_ANYEXT %Src:sgpr(S1)
297+
// ->
298+
// %Dst = G_... %TruncSrc
299+
Register Dst = MI.getOperand(0).getReg();
300+
Register Src = MI.getOperand(1).getReg();
301+
if (MRI.getType(Src) != S1)
302+
return;
303+
304+
auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
305+
if (!Trunc)
306+
return;
307+
308+
LLT DstTy = MRI.getType(Dst);
309+
LLT TruncSrcTy = MRI.getType(TruncSrc);
310+
311+
if (DstTy == TruncSrcTy) {
312+
MRI.replaceRegWith(Dst, TruncSrc);
313+
eraseInstr(MI, MRI);
314+
return;
315+
}
215316

216-
if (DstTy == S64 && TruncSrcTy == S32) {
217-
B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
218-
{TruncSrc, B.buildUndef({SgprRB, S32})});
219-
cleanUpAfterCombine(MI, Trunc);
220-
return;
221-
}
317+
B.setInstr(MI);
222318

223-
if (DstTy == S32 && TruncSrcTy == S16) {
224-
B.buildAnyExt(Dst, TruncSrc);
225-
cleanUpAfterCombine(MI, Trunc);
226-
return;
227-
}
319+
if (DstTy == S32 && TruncSrcTy == S64) {
320+
auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
321+
MRI.replaceRegWith(Dst, Unmerge.getReg(0));
322+
eraseInstr(MI, MRI);
323+
return;
324+
}
228325

229-
if (DstTy == S16 && TruncSrcTy == S32) {
230-
B.buildTrunc(Dst, TruncSrc);
231-
cleanUpAfterCombine(MI, Trunc);
232-
return;
233-
}
326+
if (DstTy == S64 && TruncSrcTy == S32) {
327+
B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
328+
{TruncSrc, B.buildUndef({SgprRB, S32})});
329+
eraseInstr(MI, MRI);
330+
return;
331+
}
234332

235-
llvm_unreachable("missing anyext + trunc combine");
333+
if (DstTy == S32 && TruncSrcTy == S16) {
334+
B.buildAnyExt(Dst, TruncSrc);
335+
eraseInstr(MI, MRI);
336+
return;
236337
}
237-
};
338+
339+
if (DstTy == S16 && TruncSrcTy == S32) {
340+
B.buildTrunc(Dst, TruncSrc);
341+
eraseInstr(MI, MRI);
342+
return;
343+
}
344+
345+
llvm_unreachable("missing anyext + trunc combine");
346+
}
238347

239348
// Search through MRI for virtual registers with sgpr register bank and S1 LLT.
240349
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {

0 commit comments

Comments
 (0)