Commit 0bba171

vg0204 and cdevadas authored
[AMDGPU][WaveTransform] Migrate VcndmaskVcmpExecMask fold into SIFoldOperands pass (#369)
In the late AMDGPUWaveTransform pipeline, SIOptimizeExecMaskingPreRA is moved to just before SGPR allocation, after per-lane VGPR allocation has been handled. Any optimization that manipulates the EXEC mask around VGPRs therefore has to happen much earlier, shortly after instruction selection. Thus, optimizeVcndVcmpPair is migrated from SIOptimizeExecMaskingPreRA into the SIFoldOperands pass, which runs as part of the MachineSSAOptimization pipeline.

Co-authored-by: Christudasan Devadasan <[email protected]>
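For orientation, this is the negation pattern the fold rewrites, reproduced from the pass comment in the diff below (shown in wave64 form; the wave32 equivalents, VCC_LO and the *_B32 opcodes, are obtained through AMDGPU::LaneMaskConstants):

```
%sel = V_CNDMASK_B32_e64 0, 1, %cc
%cmp = V_CMP_NE_U32 1, %sel
$vcc = S_AND_B64 $exec, %cmp
S_CBRANCH_VCC[N]Z
=>
$vcc = S_ANDN2_B64 $exec, %cc
S_CBRANCH_VCC[N]Z
```

The S_AND with $exec is a required part of the pattern, since V_CNDMASK_B32 writes zeroes for inactive lanes.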
1 parent d2eb2cf commit 0bba171

32 files changed (+8855, -10425 lines)

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 141 additions & 0 deletions
@@ -10,6 +10,7 @@

 #include "SIFoldOperands.h"
 #include "AMDGPU.h"
+#include "AMDGPULaneMaskUtils.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
@@ -265,6 +266,7 @@ class SIFoldOperandsImpl {
   bool tryFoldRegSequence(MachineInstr &MI);
   bool tryFoldPhiAGPR(MachineInstr &MI);
   bool tryFoldLoad(MachineInstr &MI);
+  bool tryOptimizeVcndVcmpPair(MachineInstr &MI);

   bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);

@@ -2784,6 +2786,140 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
   return Changed;
 }

+
+static bool isPhysRegDefBetween(const SIRegisterInfo *TRI, Register Reg,
+                                const MachineInstr &Start,
+                                const MachineInstr &End) {
+  if (Start.getParent() != End.getParent())
+    return false;
+
+  auto Itr = ++Start.getIterator();
+  auto EndItr = End.getIterator();
+  while (Itr != EndItr) {
+    if ((*Itr).modifiesRegister(Reg, TRI))
+      return true;
+    Itr++;
+  }
+
+  return false;
+}
+
+// Optimize sequence
+//   %sel = V_CNDMASK_B32_e64 0, 1, %cc
+//   %cmp = V_CMP_NE_U32 1, %sel
+//   $vcc = S_AND_B64 $exec, %cmp
+//   S_CBRANCH_VCC[N]Z
+// =>
+//   $vcc = S_ANDN2_B64 $exec, %cc
+//   S_CBRANCH_VCC[N]Z
+//
+// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in
+// rebuildSetCC(). S_AND_B64 with exec is a required part of the pattern since
+// V_CNDMASK_B32 writes zeroes for inactive lanes.
+bool SIFoldOperandsImpl::tryOptimizeVcndVcmpPair(MachineInstr &MI) {
+  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(*ST);
+  Register ExecReg = LMC.ExecReg;
+
+  if (MI.getOpcode() != LMC.AndOpc || !MI.getOperand(1).isReg() ||
+      !MI.getOperand(2).isReg())
+    return false;
+
+  if (!MI.getOperand(0).isReg() || !MI.getOperand(0).getReg().isPhysical() ||
+      MI.getOperand(0).getReg() != LMC.VccReg)
+    return false;
+
+  auto I =
+      llvm::find_if(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
+                    [](const MachineInstr &MI) {
+                      unsigned Opc = MI.getOpcode();
+                      return Opc == AMDGPU::S_CBRANCH_VCCZ ||
+                             Opc == AMDGPU::S_CBRANCH_VCCNZ;
+                    });
+  if (I == MRI->use_instr_nodbg_end())
+    return false;
+
+  MachineOperand *AndCmp = &MI.getOperand(1);
+  Register CmpReg = AndCmp->getReg();
+  if (CmpReg == ExecReg) {
+    AndCmp = &MI.getOperand(2);
+    CmpReg = AndCmp->getReg();
+  } else if (MI.getOperand(2).getReg() != ExecReg) {
+    return false;
+  }
+
+  auto CmpIt = llvm::find_if(MRI->def_instructions(CmpReg),
+                             [&MI](const MachineInstr &DefMI) {
+                               unsigned Opc = DefMI.getOpcode();
+                               return ((Opc == AMDGPU::V_CMP_NE_U32_e32 ||
+                                        Opc == AMDGPU::V_CMP_NE_U32_e64) &&
+                                       DefMI.getParent() == MI.getParent());
+                             });
+  if (CmpIt == MRI->def_instr_end())
+    return false;
+  MachineInstr &Cmp = *CmpIt;
+
+  // If CmpReg is physical (only VCC/VCC_LO is possible), check that it is not
+  // redefined between the compare and the S_AND.
+  if (CmpReg.isPhysical()) {
+    assert(CmpReg == LMC.VccReg && "CmpReg should be VCC or VCC_LO.");
+    if (isPhysRegDefBetween(TRI, CmpReg, Cmp, MI))
+      return false;
+  }
+
+  MachineOperand *Op1 = TII->getNamedOperand(Cmp, AMDGPU::OpName::src0);
+  MachineOperand *Op2 = TII->getNamedOperand(Cmp, AMDGPU::OpName::src1);
+  if (Op1->isImm() && Op2->isReg())
+    std::swap(Op1, Op2);
+  if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
+    return false;
+
+  Register SelReg = Op1->getReg();
+  if (SelReg.isPhysical())
+    return false;
+
+  MachineInstr *Sel = MRI->getVRegDef(SelReg);
+  if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+    return false;
+
+  if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
+      TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers))
+    return false;
+
+  Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
+  Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
+  MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
+  if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
+      Op1->getImm() != 0 || Op2->getImm() != 1)
+    return false;
+
+  if (CmpReg.isPhysical()) {
+    bool UsedByAndOnly = true;
+    auto Itr = ++Cmp.getIterator();
+    auto AndItr = MI.getIterator();
+    while (Itr != AndItr) {
+      if ((*Itr).readsRegister(CmpReg, TRI)) {
+        UsedByAndOnly = false;
+        break;
+      }
+      Itr++;
+    }
+
+    if (UsedByAndOnly)
+      Cmp.eraseFromParent();
+  } else if (CmpReg.isVirtual() && MRI->hasOneNonDBGUse(CmpReg)) {
+    Cmp.eraseFromParent();
+  }
+
+  MI.setDesc(TII->get(LMC.AndN2Opc));
+  AndCmp->setReg(CC->getReg());
+  AndCmp->setSubReg(CC->getSubReg());
+  AndCmp->setIsUndef(CC->isUndef());
+  AndCmp->setIsKill(CC->isKill());
+
+  if (MRI->use_nodbg_empty(SelReg))
+    Sel->eraseFromParent();
+
+  return true;
+}
+
 bool SIFoldOperandsImpl::run(MachineFunction &MF) {
   this->MF = &MF;
   MRI = &MF.getRegInfo();
@@ -2825,6 +2961,11 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
         continue;
       }

+      if (tryOptimizeVcndVcmpPair(MI)) {
+        Changed = true;
+        continue;
+      }
+
       if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
        continue;
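With the fold now living in SIFoldOperands, it can be exercised on its own over a .mir reproducer. A minimal invocation sketch, assuming the usual legacy pass name si-fold-operands; the input file name and CPU below are placeholders, not part of this commit:

```
# Hypothetical reproducer run; vcnd-vcmp-pair.mir and gfx900 are placeholders.
llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-fold-operands \
    -verify-machineinstrs -o - vcnd-vcmp-pair.mir
```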

llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp

Lines changed: 0 additions & 197 deletions
@@ -36,10 +36,8 @@ class SIOptimizeExecMaskingPreRA {
   LiveIntervals *LIS;
   const AMDGPU::LaneMaskConstants &LMC;

-  MCRegister CondReg;
   MCRegister ExecReg;

-  bool optimizeVcndVcmpPair(MachineBasicBlock &MBB);
   bool optimizeElseBranch(MachineBasicBlock &MBB);

 public:
@@ -88,193 +86,6 @@ FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
   return new SIOptimizeExecMaskingPreRALegacy();
 }

-// See if there is a def between \p AndIdx and \p SelIdx that needs to live
-// beyond \p AndIdx.
-static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx,
-                         SlotIndex SelIdx) {
-  LiveQueryResult AndLRQ = LR.Query(AndIdx);
-  return (!AndLRQ.isKill() && AndLRQ.valueIn() != LR.Query(SelIdx).valueOut());
-}
-
-// FIXME: Why do we bother trying to handle physical registers here?
-static bool isDefBetween(const SIRegisterInfo &TRI,
-                         LiveIntervals *LIS, Register Reg,
-                         const MachineInstr &Sel, const MachineInstr &And) {
-  SlotIndex AndIdx = LIS->getInstructionIndex(And).getRegSlot();
-  SlotIndex SelIdx = LIS->getInstructionIndex(Sel).getRegSlot();
-
-  if (Reg.isVirtual())
-    return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx);
-
-  for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg())) {
-    if (isDefBetween(LIS->getRegUnit(Unit), AndIdx, SelIdx))
-      return true;
-  }
-
-  return false;
-}
-
-// Optimize sequence
-//   %sel = V_CNDMASK_B32_e64 0, 1, %cc
-//   %cmp = V_CMP_NE_U32 1, %sel
-//   $vcc = S_AND_B64 $exec, %cmp
-//   S_CBRANCH_VCC[N]Z
-// =>
-//   $vcc = S_ANDN2_B64 $exec, %cc
-//   S_CBRANCH_VCC[N]Z
-//
-// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
-// rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
-// only 3 first instructions are really needed. S_AND_B64 with exec is a
-// required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
-// lanes.
-//
-// Returns true on success.
-bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
-  auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
-    unsigned Opc = MI.getOpcode();
-    return Opc == AMDGPU::S_CBRANCH_VCCZ ||
-           Opc == AMDGPU::S_CBRANCH_VCCNZ; });
-  if (I == MBB.terminators().end())
-    return false;
-
-  auto *And =
-      TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, *I, *MRI, LIS);
-  if (!And || And->getOpcode() != LMC.AndOpc || !And->getOperand(1).isReg() ||
-      !And->getOperand(2).isReg())
-    return false;
-
-  MachineOperand *AndCC = &And->getOperand(1);
-  Register CmpReg = AndCC->getReg();
-  unsigned CmpSubReg = AndCC->getSubReg();
-  if (CmpReg == Register(ExecReg)) {
-    AndCC = &And->getOperand(2);
-    CmpReg = AndCC->getReg();
-    CmpSubReg = AndCC->getSubReg();
-  } else if (And->getOperand(2).getReg() != Register(ExecReg)) {
-    return false;
-  }
-
-  auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, *MRI, LIS);
-  if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
-                Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
-      Cmp->getParent() != And->getParent())
-    return false;
-
-  MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
-  MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
-  if (Op1->isImm() && Op2->isReg())
-    std::swap(Op1, Op2);
-  if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
-    return false;
-
-  Register SelReg = Op1->getReg();
-  if (SelReg.isPhysical())
-    return false;
-
-  auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS);
-  if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
-    return false;
-
-  if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
-      TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers))
-    return false;
-
-  Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
-  Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
-  MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
-  if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
-      Op1->getImm() != 0 || Op2->getImm() != 1)
-    return false;
-
-  Register CCReg = CC->getReg();
-
-  // If there was a def between the select and the and, we would need to move it
-  // to fold this.
-  if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And))
-    return false;
-
-  // Cannot safely mirror live intervals with PHI nodes, so check for these
-  // before optimization.
-  SlotIndex SelIdx = LIS->getInstructionIndex(*Sel);
-  LiveInterval *SelLI = &LIS->getInterval(SelReg);
-  if (llvm::any_of(SelLI->vnis(),
-                   [](const VNInfo *VNI) {
-                     return VNI->isPHIDef();
-                   }))
-    return false;
-
-  // TODO: Guard against implicit def operands?
-  LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
-                    << *And);
-
-  MachineInstr *Andn2 =
-      BuildMI(MBB, *And, And->getDebugLoc(), TII->get(LMC.AndN2Opc),
-              And->getOperand(0).getReg())
-          .addReg(ExecReg)
-          .addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg());
-  MachineOperand &AndSCC = And->getOperand(3);
-  assert(AndSCC.getReg() == AMDGPU::SCC);
-  MachineOperand &Andn2SCC = Andn2->getOperand(3);
-  assert(Andn2SCC.getReg() == AMDGPU::SCC);
-  Andn2SCC.setIsDead(AndSCC.isDead());
-
-  SlotIndex AndIdx = LIS->ReplaceMachineInstrInMaps(*And, *Andn2);
-  And->eraseFromParent();
-
-  LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');
-
-  // Update live intervals for CCReg before potentially removing CmpReg/SelReg,
-  // and their associated liveness information.
-  SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp);
-  if (CCReg.isVirtual()) {
-    LiveInterval &CCLI = LIS->getInterval(CCReg);
-    auto CCQ = CCLI.Query(SelIdx.getRegSlot());
-    if (CCQ.valueIn()) {
-      LIS->removeInterval(CCReg);
-      LIS->createAndComputeVirtRegInterval(CCReg);
-    }
-  } else
-    LIS->removeAllRegUnitsForPhysReg(CCReg);
-
-  // Try to remove compare. Cmp value should not used in between of cmp
-  // and s_and_b64 if VCC or just unused if any other register.
-  LiveInterval *CmpLI = CmpReg.isVirtual() ? &LIS->getInterval(CmpReg) : nullptr;
-  if ((CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) ||
-      (CmpReg == Register(CondReg) &&
-       std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
-                    [&](const MachineInstr &MI) {
-                      return MI.readsRegister(CondReg, TRI);
-                    }))) {
-    LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
-    if (CmpLI)
-      LIS->removeVRegDefAt(*CmpLI, CmpIdx.getRegSlot());
-    LIS->RemoveMachineInstrFromMaps(*Cmp);
-    Cmp->eraseFromParent();
-
-    // Try to remove v_cndmask_b32.
-    // Kill status must be checked before shrinking the live range.
-    bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill();
-    LIS->shrinkToUses(SelLI);
-    bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
-    if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) {
-      LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
-
-      LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
-      LIS->RemoveMachineInstrFromMaps(*Sel);
-      bool ShrinkSel = Sel->getOperand(0).readsReg();
-      Sel->eraseFromParent();
-      if (ShrinkSel) {
-        // The result of the V_CNDMASK was a subreg def which counted as a read
-        // from the other parts of the reg. Shrink their live ranges.
-        LIS->shrinkToUses(SelLI);
-      }
-    }
-  }
-
-  return true;
-}
-
 // Optimize sequence
 //   %dst = S_OR_SAVEEXEC %src
 //   ... instructions not modifying exec ...
@@ -368,7 +179,6 @@ bool SIOptimizeExecMaskingPreRALegacy::runOnMachineFunction(
 }

 bool SIOptimizeExecMaskingPreRA::run(MachineFunction &MF) {
-  CondReg = MCRegister::from(LMC.VccReg);
   ExecReg = MCRegister::from(LMC.ExecReg);

   DenseSet<Register> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
@@ -381,13 +191,6 @@ bool SIOptimizeExecMaskingPreRA::run(MachineFunction &MF) {
       Changed = true;
     }

-    if (optimizeVcndVcmpPair(MBB)) {
-      RecalcRegs.insert(AMDGPU::VCC_LO);
-      RecalcRegs.insert(AMDGPU::VCC_HI);
-      RecalcRegs.insert(AMDGPU::SCC);
-      Changed = true;
-    }
-
     // Try to remove unneeded instructions before s_endpgm.
     if (MBB.succ_empty()) {
       if (MBB.empty())