-
Notifications
You must be signed in to change notification settings - Fork 14.4k
AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize #145911
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/petar-avramovic/rbl-ral-tests
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,8 @@ | |
#include "GCNSubtarget.h" | ||
#include "llvm/CodeGen/GlobalISel/CSEInfo.h" | ||
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" | ||
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" | ||
#include "llvm/CodeGen/GlobalISel/Utils.h" | ||
#include "llvm/CodeGen/MachineFunctionPass.h" | ||
#include "llvm/CodeGen/MachineUniformityAnalysis.h" | ||
#include "llvm/CodeGen/TargetPassConfig.h" | ||
|
@@ -115,126 +117,233 @@ class AMDGPURegBankLegalizeCombiner { | |
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), | ||
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}; | ||
|
||
bool isLaneMask(Register Reg) { | ||
const RegisterBank *RB = MRI.getRegBankOrNull(Reg); | ||
if (RB && RB->getID() == AMDGPU::VCCRegBankID) | ||
return true; | ||
bool isLaneMask(Register Reg); | ||
std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode); | ||
std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src); | ||
Register getReadAnyLaneSrc(Register Src); | ||
void replaceRegWithOrBuildCopy(Register Dst, Register Src); | ||
|
||
const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); | ||
return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); | ||
} | ||
bool tryEliminateReadAnyLane(MachineInstr &Copy); | ||
void tryCombineCopy(MachineInstr &MI); | ||
void tryCombineS1AnyExt(MachineInstr &MI); | ||
}; | ||
|
||
void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) { | ||
MI.eraseFromParent(); | ||
if (Optional0 && isTriviallyDead(*Optional0, MRI)) | ||
Optional0->eraseFromParent(); | ||
} | ||
bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) { | ||
const RegisterBank *RB = MRI.getRegBankOrNull(Reg); | ||
if (RB && RB->getID() == AMDGPU::VCCRegBankID) | ||
return true; | ||
|
||
std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) { | ||
MachineInstr *MatchMI = MRI.getVRegDef(Src); | ||
if (MatchMI->getOpcode() != Opcode) | ||
return {nullptr, Register()}; | ||
return {MatchMI, MatchMI->getOperand(1).getReg()}; | ||
} | ||
const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); | ||
return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); | ||
} | ||
|
||
void tryCombineCopy(MachineInstr &MI) { | ||
Register Dst = MI.getOperand(0).getReg(); | ||
Register Src = MI.getOperand(1).getReg(); | ||
// Skip copies of physical registers. | ||
if (!Dst.isVirtual() || !Src.isVirtual()) | ||
return; | ||
|
||
// This is a cross bank copy, sgpr S1 to lane mask. | ||
// | ||
// %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) | ||
// %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) | ||
// -> | ||
// %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32) | ||
if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { | ||
auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); | ||
assert(Trunc && MRI.getType(TruncS32Src) == S32 && | ||
"sgpr S1 must be result of G_TRUNC of sgpr S32"); | ||
|
||
B.setInstr(MI); | ||
// Ensure that truncated bits in BoolSrc are 0. | ||
auto One = B.buildConstant({SgprRB, S32}, 1); | ||
auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); | ||
B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); | ||
cleanUpAfterCombine(MI, Trunc); | ||
return; | ||
} | ||
std::pair<MachineInstr *, Register> | ||
AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) { | ||
MachineInstr *MatchMI = MRI.getVRegDef(Src); | ||
if (MatchMI->getOpcode() != Opcode) | ||
return {nullptr, Register()}; | ||
return {MatchMI, MatchMI->getOperand(1).getReg()}; | ||
} | ||
|
||
std::pair<GUnmerge *, int> | ||
AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { | ||
MachineInstr *ReadAnyLane = MRI.getVRegDef(Src); | ||
if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE) | ||
return {nullptr, -1}; | ||
|
||
Register RALSrc = ReadAnyLane->getOperand(1).getReg(); | ||
if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI)) | ||
return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)}; | ||
|
||
// Src = G_AMDGPU_READANYLANE RALSrc | ||
// Dst = COPY Src | ||
// -> | ||
// Dst = RALSrc | ||
if (MRI.getRegBankOrNull(Dst) == VgprRB && | ||
MRI.getRegBankOrNull(Src) == SgprRB) { | ||
auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); | ||
if (!RAL) | ||
return; | ||
|
||
assert(MRI.getRegBank(RALSrc) == VgprRB); | ||
MRI.replaceRegWith(Dst, RALSrc); | ||
cleanUpAfterCombine(MI, RAL); | ||
return; | ||
return {nullptr, -1}; | ||
} | ||
|
||
Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { | ||
// Src = G_AMDGPU_READANYLANE RALSrc | ||
auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); | ||
if (RAL) | ||
return RALSrc; | ||
|
||
// LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc | ||
// LoSgpr = G_AMDGPU_READANYLANE LoVgpr | ||
// HiSgpr = G_AMDGPU_READANYLANE HiVgpr | ||
// Src G_MERGE_VALUES LoSgpr, HiSgpr | ||
auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI); | ||
if (Merge) { | ||
unsigned NumElts = Merge->getNumSources(); | ||
auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0)); | ||
if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0) | ||
return {}; | ||
|
||
// Check if all elements are from same unmerge and there is no shuffling. | ||
for (unsigned i = 1; i < NumElts; ++i) { | ||
auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i)); | ||
if (UnmergeI != Unmerge || (unsigned)IdxI != i) | ||
return {}; | ||
} | ||
return Unmerge->getSourceReg(); | ||
} | ||
|
||
void tryCombineS1AnyExt(MachineInstr &MI) { | ||
// %Src:sgpr(S1) = G_TRUNC %TruncSrc | ||
// %Dst = G_ANYEXT %Src:sgpr(S1) | ||
// -> | ||
// %Dst = G_... %TruncSrc | ||
Register Dst = MI.getOperand(0).getReg(); | ||
Register Src = MI.getOperand(1).getReg(); | ||
if (MRI.getType(Src) != S1) | ||
return; | ||
|
||
auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC); | ||
if (!Trunc) | ||
return; | ||
|
||
LLT DstTy = MRI.getType(Dst); | ||
LLT TruncSrcTy = MRI.getType(TruncSrc); | ||
|
||
if (DstTy == TruncSrcTy) { | ||
MRI.replaceRegWith(Dst, TruncSrc); | ||
cleanUpAfterCombine(MI, Trunc); | ||
return; | ||
} | ||
// SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc | ||
// SourceReg G_MERGE_VALUES ..., SrcRegIdx, ... | ||
// ..., Src, ... = G_UNMERGE_VALUES SourceReg | ||
auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI); | ||
if (!UnMerge) | ||
return {}; | ||
|
||
int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr); | ||
Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI); | ||
if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources()) | ||
return {}; | ||
|
||
Register SrcRegIdx = Merge->getSourceReg(Idx); | ||
if (MRI.getType(Src) != MRI.getType(SrcRegIdx)) | ||
return {}; | ||
|
||
auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE); | ||
if (RALEl) | ||
return RALElSrc; | ||
|
||
return {}; | ||
} | ||
|
||
/// Make \p Dst take its value from \p Src. A non-virtual \p Dst cannot be
/// substituted, so a copy into it is emitted through the builder B; a virtual
/// \p Dst is handed to MRI.replaceRegWith together with \p Src.
void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst,
                                                              Register Src) {
  if (!Dst.isVirtual()) {
    B.buildCopy(Dst, Src);
    return;
  }
  MRI.replaceRegWith(Dst, Src);
}
|
||
// Try to bypass a G_AMDGPU_READANYLANE that feeds Copy when the copy | ||
// destination is a vgpr, so that Dst is fed from the register found by | ||
// getReadAnyLaneSrc instead (optionally through a single G_BITCAST). | ||
// Returns true when Copy was rewritten and erased, false when nothing | ||
// was changed. | ||
bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane( | ||
MachineInstr &Copy) { | ||
Register Dst = Copy.getOperand(0).getReg(); | ||
Register Src = Copy.getOperand(1).getReg(); | ||
|
||
// Skip non-vgpr Dst: a virtual Dst is checked by register bank, any other | ||
// Dst through TRI.isVGPR. | ||
if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB) | ||
: !TRI.isVGPR(MRI, Dst)) | ||
return false; | ||
|
||
// Skip physical source registers and source registers with register class | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It is quite common |
||
if (!Src.isVirtual() || MRI.getRegClassOrNull(Src)) | ||
return false; | ||
|
||
// Look through one G_BITCAST defining Src; RALDst is the register that is | ||
// then searched for a READANYLANE pattern. | ||
Register RALDst = Src; | ||
MachineInstr &SrcMI = *MRI.getVRegDef(Src); | ||
if (SrcMI.getOpcode() == AMDGPU::G_BITCAST) | ||
RALDst = SrcMI.getOperand(1).getReg(); | ||
|
||
// An invalid Register means no supported READANYLANE pattern was found. | ||
Register RALSrc = getReadAnyLaneSrc(RALDst); | ||
if (!RALSrc) | ||
return false; | ||
|
||
// New instructions (if any) are inserted at Copy. | ||
B.setInstr(Copy); | ||
if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { | ||
// Virtual Dst:                     Non-virtual $Dst: | ||
//   Src = READANYLANE RALSrc         Src = READANYLANE RALSrc | ||
//   Dst = Copy Src                   $Dst = Copy Src | ||
//   ->                               -> | ||
//   Dst = RALSrc                     $Dst = Copy RALSrc | ||
replaceRegWithOrBuildCopy(Dst, RALSrc); | ||
} else { | ||
// Virtual Dst:                     Non-virtual $Dst: | ||
//   RALDst = READANYLANE RALSrc      RALDst = READANYLANE RALSrc | ||
//   Src = G_BITCAST RALDst           Src = G_BITCAST RALDst | ||
//   Dst = Copy Src                   $Dst = Copy Src | ||
//   ->                               -> | ||
//   NewVgpr = G_BITCAST RALSrc       NewVgpr = G_BITCAST RALSrc | ||
//   Dst = NewVgpr                    $Dst = Copy NewVgpr | ||
// The new bitcast result is on the vgpr bank and has the type of Src. | ||
auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); | ||
replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0)); | ||
} | ||
|
||
eraseInstr(Copy, MRI); | ||
return true; | ||
} | ||
|
||
// Combine a COPY. First tries tryEliminateReadAnyLane; if that does not | ||
// apply, rewrites a cross bank copy of an sgpr S1 into a lane mask as | ||
// G_AMDGPU_COPY_VCC_SCC. Copies involving non-virtual registers are | ||
// otherwise left alone. | ||
void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) { | ||
if (tryEliminateReadAnyLane(MI)) | ||
return; | ||
|
||
Register Dst = MI.getOperand(0).getReg(); | ||
Register Src = MI.getOperand(1).getReg(); | ||
// Skip copies of physical registers. | ||
if (!Dst.isVirtual() || !Src.isVirtual()) | ||
return; | ||
|
||
// This is a cross bank copy, sgpr S1 to lane mask. | ||
// | ||
// %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) | ||
// %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) | ||
// -> | ||
// %BoolSrc:sgpr(s32) = G_AND %TruncS32Src:sgpr(s32), 1 | ||
// %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %BoolSrc:sgpr(s32) | ||
if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { | ||
auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); | ||
// NOTE(review): per the review discussion on this change, this assert is | ||
// not guaranteed to hold and a test reaches it. When the match fails, | ||
// tryMatch yields an invalid Register, which the code below would then | ||
// pass to buildAnd if assertions are compiled out. | ||
assert(Trunc && MRI.getType(TruncS32Src) == S32 && | ||
"sgpr S1 must be result of G_TRUNC of sgpr S32"); | ||
Comment on lines
+281
to
+283
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this really guaranteed? Couldn't the source program have an LLVM IR There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are correct, not guaranteed. Was trying to make one with trunc from i64 to i1 but did not find one yet. Here is one test that actually hits that assert.
Original idea where that assert comes from trunc created regbanklegalize. Legal i1 used by something that is lowered by divergence lowering. Uniform i1 is lowered as sgpr S32 that is truncated to S1. |
||
|
||
B.setInstr(MI); | ||
// Ensure that truncated bits in BoolSrc are 0. | ||
auto One = B.buildConstant({SgprRB, S32}, 1); | ||
auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); | ||
B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); | ||
eraseInstr(MI, MRI); | ||
} | ||
} | ||
|
||
if (DstTy == S32 && TruncSrcTy == S64) { | ||
auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); | ||
MRI.replaceRegWith(Dst, Unmerge.getReg(0)); | ||
cleanUpAfterCombine(MI, Trunc); | ||
return; | ||
} | ||
void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) { | ||
// %Src:sgpr(S1) = G_TRUNC %TruncSrc | ||
// %Dst = G_ANYEXT %Src:sgpr(S1) | ||
// -> | ||
// %Dst = G_... %TruncSrc | ||
Register Dst = MI.getOperand(0).getReg(); | ||
Register Src = MI.getOperand(1).getReg(); | ||
if (MRI.getType(Src) != S1) | ||
return; | ||
|
||
auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC); | ||
if (!Trunc) | ||
return; | ||
|
||
LLT DstTy = MRI.getType(Dst); | ||
LLT TruncSrcTy = MRI.getType(TruncSrc); | ||
|
||
if (DstTy == TruncSrcTy) { | ||
MRI.replaceRegWith(Dst, TruncSrc); | ||
eraseInstr(MI, MRI); | ||
return; | ||
} | ||
|
||
if (DstTy == S64 && TruncSrcTy == S32) { | ||
B.buildMergeLikeInstr(MI.getOperand(0).getReg(), | ||
{TruncSrc, B.buildUndef({SgprRB, S32})}); | ||
cleanUpAfterCombine(MI, Trunc); | ||
return; | ||
} | ||
B.setInstr(MI); | ||
|
||
if (DstTy == S32 && TruncSrcTy == S16) { | ||
B.buildAnyExt(Dst, TruncSrc); | ||
cleanUpAfterCombine(MI, Trunc); | ||
return; | ||
} | ||
if (DstTy == S32 && TruncSrcTy == S64) { | ||
auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); | ||
MRI.replaceRegWith(Dst, Unmerge.getReg(0)); | ||
eraseInstr(MI, MRI); | ||
return; | ||
} | ||
|
||
if (DstTy == S16 && TruncSrcTy == S32) { | ||
B.buildTrunc(Dst, TruncSrc); | ||
cleanUpAfterCombine(MI, Trunc); | ||
return; | ||
} | ||
if (DstTy == S64 && TruncSrcTy == S32) { | ||
B.buildMergeLikeInstr(MI.getOperand(0).getReg(), | ||
{TruncSrc, B.buildUndef({SgprRB, S32})}); | ||
eraseInstr(MI, MRI); | ||
return; | ||
} | ||
|
||
llvm_unreachable("missing anyext + trunc combine"); | ||
if (DstTy == S32 && TruncSrcTy == S16) { | ||
B.buildAnyExt(Dst, TruncSrc); | ||
eraseInstr(MI, MRI); | ||
return; | ||
} | ||
}; | ||
|
||
if (DstTy == S16 && TruncSrcTy == S32) { | ||
B.buildTrunc(Dst, TruncSrc); | ||
eraseInstr(MI, MRI); | ||
return; | ||
} | ||
|
||
llvm_unreachable("missing anyext + trunc combine"); | ||
} | ||
|
||
// Search through MRI for virtual registers with sgpr register bank and S1 LLT. | ||
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a better name for this than `tryMatch`? Come to think of it, can the generic matching infrastructure be used for this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can use mi_match, but this is shorter because we use auto instead of declaring what we want to capture. To me at least, this has nicer formatting.
How about matchInstAndGetSrc?