Commit 0bba171

vg0204 and cdevadas authored
[AMDGPU][WaveTransform] Migrate VcndmaskVcmpExecMask fold into SIFoldOperands pass (#369)
In the late AMDGPUWaveTransform pipeline, SIOptimizeExecMaskingPreRA is moved to just before SGPR allocation, after per-lane VGPR allocation has been handled. Any optimization that manipulates the EXEC mask around VGPRs therefore has to happen much earlier, shortly after instruction selection. Thus, optimizeVcndVcmpPair is migrated from SIOptimizeExecMaskingPreRA into the SIFoldOperands pass, which runs as part of the MachineSSAOptimization pipeline.

Co-authored-by: Christudasan Devadasan <[email protected]>
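For orientation, this is the negation pattern the fold rewrites, reproduced from the pass comment in the diff below (shown in wave64 form; the wave32 equivalents, VCC_LO and the *_B32 opcodes, are obtained through AMDGPU::LaneMaskConstants):

```
%sel = V_CNDMASK_B32_e64 0, 1, %cc
%cmp = V_CMP_NE_U32 1, %sel
$vcc = S_AND_B64 $exec, %cmp
S_CBRANCH_VCC[N]Z
=>
$vcc = S_ANDN2_B64 $exec, %cc
S_CBRANCH_VCC[N]Z
```

The S_AND with $exec is a required part of the pattern, since V_CNDMASK_B32 writes zeroes for inactive lanes.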
1 parent d2eb2cf commit 0bba171

32 files changed (+8855, -10425 lines)

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 141 additions & 0 deletions
@@ -10,6 +10,7 @@

 #include "SIFoldOperands.h"
 #include "AMDGPU.h"
+#include "AMDGPULaneMaskUtils.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
@@ -265,6 +266,7 @@ class SIFoldOperandsImpl {
   bool tryFoldRegSequence(MachineInstr &MI);
   bool tryFoldPhiAGPR(MachineInstr &MI);
   bool tryFoldLoad(MachineInstr &MI);
+  bool tryOptimizeVcndVcmpPair(MachineInstr &MI);

   bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);

@@ -2784,6 +2786,140 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
   return Changed;
 }

+
+static bool isPhysRegDefBetween(const SIRegisterInfo *TRI, Register Reg,
+                                const MachineInstr &Start,
+                                const MachineInstr &End) {
+  if (Start.getParent() != End.getParent())
+    return false;
+
+  auto Itr = ++Start.getIterator();
+  auto EndItr = End.getIterator();
+  while (Itr != EndItr) {
+    if ((*Itr).modifiesRegister(Reg, TRI))
+      return true;
+    Itr++;
+  }
+
+  return false;
+}
+
+// Optimize sequence
+//   %sel = V_CNDMASK_B32_e64 0, 1, %cc
+//   %cmp = V_CMP_NE_U32 1, %sel
+//   $vcc = S_AND_B64 $exec, %cmp
+//   S_CBRANCH_VCC[N]Z
+// =>
+//   $vcc = S_ANDN2_B64 $exec, %cc
+//   S_CBRANCH_VCC[N]Z
+//
+// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in
+// rebuildSetCC(). S_AND_B64 with exec is a required part of the pattern since
+// V_CNDMASK_B32 writes zeroes for inactive lanes.
+bool SIFoldOperandsImpl::tryOptimizeVcndVcmpPair(MachineInstr &MI) {
+  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(*ST);
+  Register ExecReg = LMC.ExecReg;
+
+  if (MI.getOpcode() != LMC.AndOpc || !MI.getOperand(1).isReg() ||
+      !MI.getOperand(2).isReg())
+    return false;
+
+  if (!MI.getOperand(0).isReg() || !MI.getOperand(0).getReg().isPhysical() ||
+      MI.getOperand(0).getReg() != LMC.VccReg)
+    return false;
+
+  auto I =
+      llvm::find_if(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
+                    [](const MachineInstr &MI) {
+                      unsigned Opc = MI.getOpcode();
+                      return Opc == AMDGPU::S_CBRANCH_VCCZ ||
+                             Opc == AMDGPU::S_CBRANCH_VCCNZ;
+                    });
+  if (I == MRI->use_instr_nodbg_end())
+    return false;
+
+  MachineOperand *AndCmp = &MI.getOperand(1);
+  Register CmpReg = AndCmp->getReg();
+  if (CmpReg == ExecReg) {
+    AndCmp = &MI.getOperand(2);
+    CmpReg = AndCmp->getReg();
+  } else if (MI.getOperand(2).getReg() != ExecReg) {
+    return false;
+  }
+
+  auto CmpIt = llvm::find_if(MRI->def_instructions(CmpReg),
+                             [&MI](const MachineInstr &DefMI) {
+                               unsigned Opc = DefMI.getOpcode();
+                               return ((Opc == AMDGPU::V_CMP_NE_U32_e32 ||
+                                        Opc == AMDGPU::V_CMP_NE_U32_e64) &&
+                                       DefMI.getParent() == MI.getParent());
+                             });
+  if (CmpIt == MRI->def_instr_end())
+    return false;
+  MachineInstr &Cmp = *CmpIt;
+
+  // If CmpReg is physical (only VCC/VCC_LO is possible), check that it is not
+  // redefined between the compare and the S_AND.
+  if (CmpReg.isPhysical()) {
+    assert(CmpReg == LMC.VccReg && "CmpReg should be VCC or VCC_LO.");
+    if (isPhysRegDefBetween(TRI, CmpReg, Cmp, MI))
+      return false;
+  }
+
+  MachineOperand *Op1 = TII->getNamedOperand(Cmp, AMDGPU::OpName::src0);
+  MachineOperand *Op2 = TII->getNamedOperand(Cmp, AMDGPU::OpName::src1);
+  if (Op1->isImm() && Op2->isReg())
+    std::swap(Op1, Op2);
+  if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
+    return false;
+
+  Register SelReg = Op1->getReg();
+  if (SelReg.isPhysical())
+    return false;
+
+  MachineInstr *Sel = MRI->getVRegDef(SelReg);
+  if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+    return false;
+
+  if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
+      TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers))
+    return false;
+
+  Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
+  Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
+  MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
+  if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
+      Op1->getImm() != 0 || Op2->getImm() != 1)
+    return false;
+
+  if (CmpReg.isPhysical()) {
+    bool UsedByAndOnly = true;
+    auto Itr = ++Cmp.getIterator();
+    auto AndItr = MI.getIterator();
+    while (Itr != AndItr) {
+      if ((*Itr).readsRegister(CmpReg, TRI)) {
+        UsedByAndOnly = false;
+        break;
+      }
+      Itr++;
+    }
+
+    if (UsedByAndOnly)
+      Cmp.eraseFromParent();
+  } else if (CmpReg.isVirtual() && MRI->hasOneNonDBGUse(CmpReg)) {
+    Cmp.eraseFromParent();
+  }
+
+  MI.setDesc(TII->get(LMC.AndN2Opc));
+  AndCmp->setReg(CC->getReg());
+  AndCmp->setSubReg(CC->getSubReg());
+  AndCmp->setIsUndef(CC->isUndef());
+  AndCmp->setIsKill(CC->isKill());
+
+  if (MRI->use_nodbg_empty(SelReg))
+    Sel->eraseFromParent();
+
+  return true;
+}
+
 bool SIFoldOperandsImpl::run(MachineFunction &MF) {
   this->MF = &MF;
   MRI = &MF.getRegInfo();
@@ -2825,6 +2961,11 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
         continue;
       }

+      if (tryOptimizeVcndVcmpPair(MI)) {
+        Changed = true;
+        continue;
+      }
+
       if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
        continue;
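With the fold now living in SIFoldOperands, it can be exercised on its own over a .mir reproducer. A minimal invocation sketch, assuming the usual legacy pass name si-fold-operands; the input file name and CPU below are placeholders, not part of this commit:

```
# Hypothetical reproducer run; vcnd-vcmp-pair.mir and gfx900 are placeholders.
llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-fold-operands \
    -verify-machineinstrs -o - vcnd-vcmp-pair.mir
```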

llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp

Lines changed: 0 additions & 197 deletions
@@ -36,10 +36,8 @@ class SIOptimizeExecMaskingPreRA {
   LiveIntervals *LIS;
   const AMDGPU::LaneMaskConstants &LMC;

-  MCRegister CondReg;
   MCRegister ExecReg;

-  bool optimizeVcndVcmpPair(MachineBasicBlock &MBB);
   bool optimizeElseBranch(MachineBasicBlock &MBB);

 public:
@@ -88,193 +86,6 @@ FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
   return new SIOptimizeExecMaskingPreRALegacy();
 }

-// See if there is a def between \p AndIdx and \p SelIdx that needs to live
-// beyond \p AndIdx.
-static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx,
-                         SlotIndex SelIdx) {
-  LiveQueryResult AndLRQ = LR.Query(AndIdx);
-  return (!AndLRQ.isKill() && AndLRQ.valueIn() != LR.Query(SelIdx).valueOut());
-}
-
-// FIXME: Why do we bother trying to handle physical registers here?
-static bool isDefBetween(const SIRegisterInfo &TRI,
-                         LiveIntervals *LIS, Register Reg,
-                         const MachineInstr &Sel, const MachineInstr &And) {
-  SlotIndex AndIdx = LIS->getInstructionIndex(And).getRegSlot();
-  SlotIndex SelIdx = LIS->getInstructionIndex(Sel).getRegSlot();
-
-  if (Reg.isVirtual())
-    return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx);
-
-  for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg())) {
-    if (isDefBetween(LIS->getRegUnit(Unit), AndIdx, SelIdx))
-      return true;
-  }
-
-  return false;
-}
-
-// Optimize sequence
-//   %sel = V_CNDMASK_B32_e64 0, 1, %cc
-//   %cmp = V_CMP_NE_U32 1, %sel
-//   $vcc = S_AND_B64 $exec, %cmp
-//   S_CBRANCH_VCC[N]Z
-// =>
-//   $vcc = S_ANDN2_B64 $exec, %cc
-//   S_CBRANCH_VCC[N]Z
-//
-// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
-// rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
-// only 3 first instructions are really needed. S_AND_B64 with exec is a
-// required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
-// lanes.
-//
-// Returns true on success.
-bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
-  auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
-    unsigned Opc = MI.getOpcode();
-    return Opc == AMDGPU::S_CBRANCH_VCCZ ||
-           Opc == AMDGPU::S_CBRANCH_VCCNZ; });
-  if (I == MBB.terminators().end())
-    return false;
-
-  auto *And =
-      TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, *I, *MRI, LIS);
-  if (!And || And->getOpcode() != LMC.AndOpc || !And->getOperand(1).isReg() ||
-      !And->getOperand(2).isReg())
-    return false;
-
-  MachineOperand *AndCC = &And->getOperand(1);
-  Register CmpReg = AndCC->getReg();
-  unsigned CmpSubReg = AndCC->getSubReg();
-  if (CmpReg == Register(ExecReg)) {
-    AndCC = &And->getOperand(2);
-    CmpReg = AndCC->getReg();
-    CmpSubReg = AndCC->getSubReg();
-  } else if (And->getOperand(2).getReg() != Register(ExecReg)) {
-    return false;
-  }
-
-  auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, *MRI, LIS);
-  if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
-                Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
-      Cmp->getParent() != And->getParent())
-    return false;
-
-  MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
-  MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
-  if (Op1->isImm() && Op2->isReg())
-    std::swap(Op1, Op2);
-  if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
-    return false;
-
-  Register SelReg = Op1->getReg();
-  if (SelReg.isPhysical())
-    return false;
-
-  auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS);
-  if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
-    return false;
-
-  if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
-      TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers))
-    return false;
-
-  Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
-  Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
-  MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
-  if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
-      Op1->getImm() != 0 || Op2->getImm() != 1)
-    return false;
-
-  Register CCReg = CC->getReg();
-
-  // If there was a def between the select and the and, we would need to move it
-  // to fold this.
-  if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And))
-    return false;
-
-  // Cannot safely mirror live intervals with PHI nodes, so check for these
-  // before optimization.
-  SlotIndex SelIdx = LIS->getInstructionIndex(*Sel);
-  LiveInterval *SelLI = &LIS->getInterval(SelReg);
-  if (llvm::any_of(SelLI->vnis(),
-                   [](const VNInfo *VNI) {
-                     return VNI->isPHIDef();
-                   }))
-    return false;
-
-  // TODO: Guard against implicit def operands?
-  LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
-                    << *And);
-
-  MachineInstr *Andn2 =
-      BuildMI(MBB, *And, And->getDebugLoc(), TII->get(LMC.AndN2Opc),
-              And->getOperand(0).getReg())
-          .addReg(ExecReg)
-          .addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg());
-  MachineOperand &AndSCC = And->getOperand(3);
-  assert(AndSCC.getReg() == AMDGPU::SCC);
-  MachineOperand &Andn2SCC = Andn2->getOperand(3);
-  assert(Andn2SCC.getReg() == AMDGPU::SCC);
-  Andn2SCC.setIsDead(AndSCC.isDead());
-
-  SlotIndex AndIdx = LIS->ReplaceMachineInstrInMaps(*And, *Andn2);
-  And->eraseFromParent();
-
-  LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');
-
-  // Update live intervals for CCReg before potentially removing CmpReg/SelReg,
-  // and their associated liveness information.
-  SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp);
-  if (CCReg.isVirtual()) {
-    LiveInterval &CCLI = LIS->getInterval(CCReg);
-    auto CCQ = CCLI.Query(SelIdx.getRegSlot());
-    if (CCQ.valueIn()) {
-      LIS->removeInterval(CCReg);
-      LIS->createAndComputeVirtRegInterval(CCReg);
-    }
-  } else
-    LIS->removeAllRegUnitsForPhysReg(CCReg);
-
-  // Try to remove compare. Cmp value should not used in between of cmp
-  // and s_and_b64 if VCC or just unused if any other register.
-  LiveInterval *CmpLI = CmpReg.isVirtual() ? &LIS->getInterval(CmpReg) : nullptr;
-  if ((CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) ||
-      (CmpReg == Register(CondReg) &&
-       std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
-                    [&](const MachineInstr &MI) {
-                      return MI.readsRegister(CondReg, TRI);
-                    }))) {
-    LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
-    if (CmpLI)
-      LIS->removeVRegDefAt(*CmpLI, CmpIdx.getRegSlot());
-    LIS->RemoveMachineInstrFromMaps(*Cmp);
-    Cmp->eraseFromParent();
-
-    // Try to remove v_cndmask_b32.
-    // Kill status must be checked before shrinking the live range.
-    bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill();
-    LIS->shrinkToUses(SelLI);
-    bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
-    if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) {
-      LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
-
-      LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
-      LIS->RemoveMachineInstrFromMaps(*Sel);
-      bool ShrinkSel = Sel->getOperand(0).readsReg();
-      Sel->eraseFromParent();
-      if (ShrinkSel) {
-        // The result of the V_CNDMASK was a subreg def which counted as a read
-        // from the other parts of the reg. Shrink their live ranges.
-        LIS->shrinkToUses(SelLI);
-      }
-    }
-  }
-
-  return true;
-}
-
 // Optimize sequence
 //   %dst = S_OR_SAVEEXEC %src
 //   ... instructions not modifying exec ...
@@ -368,7 +179,6 @@ bool SIOptimizeExecMaskingPreRALegacy::runOnMachineFunction(
 }

 bool SIOptimizeExecMaskingPreRA::run(MachineFunction &MF) {
-  CondReg = MCRegister::from(LMC.VccReg);
   ExecReg = MCRegister::from(LMC.ExecReg);

   DenseSet<Register> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
@@ -381,13 +191,6 @@ bool SIOptimizeExecMaskingPreRA::run(MachineFunction &MF) {
       Changed = true;
     }

-    if (optimizeVcndVcmpPair(MBB)) {
-      RecalcRegs.insert(AMDGPU::VCC_LO);
-      RecalcRegs.insert(AMDGPU::VCC_HI);
-      RecalcRegs.insert(AMDGPU::SCC);
-      Changed = true;
-    }
-
     // Try to remove unneeded instructions before s_endpgm.
     if (MBB.succ_empty()) {
       if (MBB.empty())