diff --git a/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/llvm/include/llvm/CodeGen/LiveRangeEdit.h
index d0ed3ff660d9b..658231c31481c 100644
--- a/llvm/include/llvm/CodeGen/LiveRangeEdit.h
+++ b/llvm/include/llvm/CodeGen/LiveRangeEdit.h
@@ -20,6 +20,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -58,6 +59,11 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate {
     /// Called before shrinking the live range of a virtual register.
     virtual void LRE_WillShrinkVirtReg(Register) {}

+    /// Called when a virtual register's LiveInterval is about to become
+    /// empty, i.e. when removeVRegDefAt removes its last definition.
+    /// Implementations should unassign the register from LiveRegMatrix
+    /// before the interval is cleared.
+    virtual void LRE_WillClearVirtReg(Register, LiveInterval &) {}
+
     /// Called after cloning a virtual register.
     /// This is used for new registers representing connected components of Old.
     virtual void LRE_DidCloneVirtReg(Register New, Register Old) {}
@@ -75,6 +81,11 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate {
   /// FirstNew - Index of the first register added to NewRegs.
   const unsigned FirstNew;

+  /// Track which virtual registers are new (created during this edit).
+  /// Used to avoid calling Matrix->unassign on registers that were never
+  /// added to LiveRegMatrix.
+  SmallSet<Register, 16> NewVirtRegs;
+
   /// DeadRemats - The saved instructions which have already been dead after
   /// rematerialization but not deleted yet -- to be done in postOptimization.
   SmallPtrSet<MachineInstr *, 32> *DeadRemats;
@@ -142,6 +153,12 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate {
   bool empty() const { return size() == 0; }
   Register get(unsigned idx) const { return NewRegs[idx + FirstNew]; }

+  /// Check if a virtual register was created during this edit.
+  /// This is used to identify registers that were never added to
+  /// LiveRegMatrix.
+  bool isNewVirtualRegister(Register VReg) const {
+    return NewVirtRegs.contains(VReg);
+  }
+
   /// pop_back - It allows LiveRangeEdit users to drop new registers.
   /// The context is when an original def instruction of a register is
   /// dead after rematerialization, we still want to keep it for following
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index c3e0964594bd5..006a2a4bff0f3 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -86,6 +86,7 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate {
   const TargetInstrInfo &TII;
   const TargetRegisterInfo &TRI;
   const MachineBlockFrequencyInfo &MBFI;
+  LiveRegMatrix *Matrix;

   InsertPointAnalysis IPA;
@@ -129,16 +130,18 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate {
 public:
   HoistSpillHelper(const Spiller::RequiredAnalyses &Analyses,
-                   MachineFunction &mf, VirtRegMap &vrm)
+                   MachineFunction &mf, VirtRegMap &vrm,
+                   LiveRegMatrix *matrix = nullptr)
       : MF(mf), LIS(Analyses.LIS), LSS(Analyses.LSS), MDT(Analyses.MDT),
        VRM(vrm), MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()),
        TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(Analyses.MBFI),
-       IPA(LIS, mf.getNumBlockIDs()) {}
+       Matrix(matrix), IPA(LIS, mf.getNumBlockIDs()) {}

   void addToMergeableSpills(MachineInstr &Spill, int StackSlot,
                             Register Original);
   bool rmFromMergeableSpills(MachineInstr &Spill, int StackSlot);
   void hoistAllSpills();
+  void LRE_WillClearVirtReg(Register, LiveInterval &) override;
   void LRE_DidCloneVirtReg(Register, Register) override;
 };
@@ -191,7 +194,7 @@ class InlineSpiller : public Spiller {
       : MF(MF), LIS(Analyses.LIS), LSS(Analyses.LSS), VRM(VRM),
         MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()),
         TRI(*MF.getSubtarget().getRegisterInfo()), Matrix(Matrix),
-        HSpiller(Analyses, MF, VRM), VRAI(VRAI) {}
+        HSpiller(Analyses, MF, VRM, Matrix), VRAI(VRAI) {}

   void spill(LiveRangeEdit &, AllocationOrder *Order = nullptr) override;
   ArrayRef<Register> getSpilledRegs() override { return RegsToSpill; }
@@ -1750,6 +1753,17 @@ void HoistSpillHelper::hoistAllSpills() {
   }
 }

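+// Illustrative failure mode the new callback prevents (a sketch, not code
+// from this patch):
+//
+//   LIS.removeVRegDefAt(LI, Idx); // LI is now empty
+//   ...
+//   Matrix->unassign(LI);         // too late: the LiveIntervalUnion lookup
+//                                 // needs LI's already-deleted segments
+//
+// leaves stale LiveInterval pointers behind in the LiveIntervalUnion.
+// Calling LRE_WillClearVirtReg while LI still holds its last segment lets
+// unassign remove every union entry cleanly.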
+/// Called when a LiveInterval is about to be cleared by removeVRegDefAt.
+/// Unassign it from LiveRegMatrix to prevent dangling pointers (fixes LLVM
+/// bug #48911).
+void HoistSpillHelper::LRE_WillClearVirtReg(Register VirtReg,
+                                            LiveInterval &LI) {
+  // If this virtual register is assigned to a physical register, unassign it
+  // from LiveRegMatrix before the interval is cleared. Otherwise, the
+  // LiveIntervalUnion will contain dangling pointers.
+  if (Matrix && VRM.hasPhys(VirtReg))
+    Matrix->unassign(LI);
+}
+
 /// For VirtReg clone, the \p New register should have the same physreg or
 /// stackslot as the \p old register.
 void HoistSpillHelper::LRE_DidCloneVirtReg(Register New, Register Old) {
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 5b0365da4e8c6..fbf5da3c58366 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -268,6 +268,11 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
     if (MO.isDef()) {
       if (TheDelegate && LI.getVNInfoAt(Idx) != nullptr)
         TheDelegate->LRE_WillShrinkVirtReg(LI.reg());
+      // Notify the delegate BEFORE clearing if this will make the interval
+      // empty, so Matrix->unassign still sees a non-empty LiveRange. Skip
+      // new registers (clones): they were never added to LiveRegMatrix.
+      if (TheDelegate && LI.size() == 1 && !isNewVirtualRegister(Reg))
+        TheDelegate->LRE_WillClearVirtReg(Reg, LI);
       LIS.removeVRegDefAt(LI, Idx);
       if (LI.empty())
         RegsToErase.push_back(Reg);
@@ -398,6 +403,7 @@ LiveRangeEdit::MRI_NoteNewVirtualRegister(Register VReg) {
     VRM->grow();

   NewRegs.push_back(VReg);
+  NewVirtRegs.insert(VReg);
 }

 void LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 8fe324728405f..0784098f3326b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -545,6 +545,9 @@ extern char &AMDGPUExportKernelRuntimeHandlesLegacyID;
 void initializeGCNNSAReassignLegacyPass(PassRegistry &);
 extern char &GCNNSAReassignID;

+void initializeAMDGPUHotBlockRegisterRenamingLegacyPass(PassRegistry &);
+extern char &AMDGPUHotBlockRegisterRenamingID;
+
 void initializeGCNPreRALongBranchRegLegacyPass(PassRegistry &);
 extern char &GCNPreRALongBranchRegID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
new file mode 100644
index 0000000000000..c4c16c56f17c8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
@@ -0,0 +1,632 @@
+//===-- AMDGPUHotBlockRegisterRenaming.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Reduces value density in hot basic blocks by remapping local values
+/// from overused physical registers to free physical registers.
+///
+/// This gives the post-RA scheduler more flexibility to reorder instructions
+/// by reducing false dependencies created by register reuse.
+///
+/// Algorithm:
+/// 1. Sort basic blocks by frequency (hottest first)
+/// 2. For each BB:
+///    a. Calculate value density (count of local values per PhysReg)
+///    b. Identify free PhysRegs (completely unused in this BB)
+///    c. Iteratively move local values from dense to free registers
+/// 3. VirtRegRewriter applies the updated VirtRegMap
+///
+/// Constraints (conservative):
+/// - Only move 32-bit VGPRs
+/// - Only move local values (single segment, entirely within BB)
+/// - Only move to completely free registers
+/// - Skip values with allocation hints
+/// - Skip reserved registers
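+///
+/// Illustrative example (hypothetical MIR; register numbers are invented):
+///
+///   before:  $vgpr3 = V_ADD_F32 ...   ; value A
+///            ... uses of $vgpr3 ...
+///            $vgpr3 = V_MUL_F32 ...   ; value B reuses $vgpr3
+///
+///   after:   $vgpr3 = V_ADD_F32 ...   ; value A
+///            ... uses of $vgpr3 ...
+///            $vgpr7 = V_MUL_F32 ...   ; value B moved to the free $vgpr7
+///
+/// After the rename, the V_MUL no longer carries an anti-dependency on the
+/// last use of value A, so the post-RA scheduler is free to hoist it.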
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUHotBlockRegisterRenaming.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include <queue>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-hot-block-reg-renaming"
+
+STATISTIC(NumBlocksProcessed, "Number of hot blocks processed");
+STATISTIC(NumValuesRemapped, "Number of values remapped to reduce density");
+STATISTIC(NumBlocksSkipped,
+          "Number of blocks skipped (no dense regs or no free regs)");
+STATISTIC(NumNonKernelsSkipped,
+          "Number of non-kernel functions skipped for safety");
+
+namespace {
+
+class AMDGPUHotBlockRegisterRenamingImpl {
+public:
+  AMDGPUHotBlockRegisterRenamingImpl(VirtRegMap *VRM, LiveRegMatrix *LRM,
+                                     LiveIntervals *LIS,
+                                     MachineBlockFrequencyInfo *MBFI,
+                                     const GCNSubtarget *ST,
+                                     const SIMachineFunctionInfo &MFI)
+      : VRM(VRM), LRM(LRM), LIS(LIS), MBFI(MBFI), ST(ST), MFI(MFI) {}
+
+  bool run(MachineFunction &MF);
+
+private:
+  VirtRegMap *VRM;
+  LiveRegMatrix *LRM;
+  LiveIntervals *LIS;
+  MachineBlockFrequencyInfo *MBFI;
+  const GCNSubtarget *ST;
+  const SIMachineFunctionInfo &MFI;
+  const SIRegisterInfo *TRI = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  unsigned VGPRLimit = 0; // Register limit based on occupancy.
+
+  /// Cache of VirtRegs that cannot be moved (e.g. tied operands).
+  DenseSet<Register> UnmovableVRegs;
+
+  /// Process a single basic block.
+  bool processBasicBlock(MachineBasicBlock *MBB);
+
+  /// Calculate the value density map for a basic block.
+  void calculateValueDensity(MachineBasicBlock *MBB,
+                             DenseMap<MCRegister, unsigned> &ValueDensity);
+
+  /// Find free physical registers in a basic block.
+  void findFreeRegisters(MachineBasicBlock *MBB,
+                         SmallVectorImpl<MCRegister> &FreeRegs);
+
+  /// Check if a segment is local to a basic block.
+  bool isLocalSegment(const LiveInterval::Segment &Seg, SlotIndex BBStart,
+                      SlotIndex BBEnd) const;
+
+  /// Check if a register is suitable for our optimization.
+  bool isSuitableRegister(MCRegister PhysReg) const;
+
+  /// Check if a virtual register can be safely moved.
+  bool isVirtRegMovable(Register VirtReg, MCRegister CurrentPhysReg,
+                        MCRegister TargetPhysReg, SlotIndex BBStart,
+                        SlotIndex BBEnd);
+
+  /// Try to move a value from DenseReg to FreeReg.
+  bool tryMoveValue(MCRegister DenseReg, MCRegister FreeReg,
+                    MachineBasicBlock *MBB, SlotIndex BBStart, SlotIndex BBEnd,
+                    const DenseMap<MCRegister, SmallVector<SlotIndex, 4>>
+                        &PhysRegDefs);
+};
+
+class AMDGPUHotBlockRegisterRenamingLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUHotBlockRegisterRenamingLegacy() : MachineFunctionPass(ID) {
+    initializeAMDGPUHotBlockRegisterRenamingLegacyPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Hot Block Register Renaming";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervalsWrapperPass>();
+    AU.addRequired<VirtRegMapWrapperLegacy>();
+    AU.addRequired<LiveRegMatrixWrapperLegacy>();
+    AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+char AMDGPUHotBlockRegisterRenamingLegacy::ID = 0;
+
+char &llvm::AMDGPUHotBlockRegisterRenamingID =
+    AMDGPUHotBlockRegisterRenamingLegacy::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRegisterRenamingLegacy, DEBUG_TYPE,
+                      "AMDGPU Hot Block Register Renaming", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUHotBlockRegisterRenamingLegacy, DEBUG_TYPE,
+                    "AMDGPU Hot Block Register Renaming", false, false)
+
+bool AMDGPUHotBlockRegisterRenamingLegacy::runOnMachineFunction(
+    MachineFunction &MF) {
+  VirtRegMap *VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
+  LiveRegMatrix *LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
+  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+  MachineBlockFrequencyInfo *MBFI =
+      &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
+
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+  AMDGPUHotBlockRegisterRenamingImpl Impl(VRM, LRM, LIS, MBFI, ST, MFI);
+  return Impl.run(MF);
+}
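+
+// Sketch of the per-block worklist run() drives (illustrative numbers): with
+// densities {v3: 4, v5: 2} and free list [v8, v9, v10], processBasicBlock
+// pops v3, moves one value to v8 (density 4 -> 3), re-pushes v3, pops it
+// again for v9, and so on until every register has density <= 1 or the free
+// list is exhausted.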
+
+bool AMDGPUHotBlockRegisterRenamingImpl::run(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "AMDGPUHotBlockRegisterRenaming: Processing "
+                    << MF.getName() << "\n");
+
+  // Fix #0: Skip non-kernel functions to avoid RegMask corruption issues.
+  // A post-RA pass cannot update the RegMask operands on call instructions
+  // in this function's callers, so callers would be left with incorrect
+  // assumptions about which registers this function clobbers.
+  CallingConv::ID CC = MF.getFunction().getCallingConv();
+  if (CC != CallingConv::AMDGPU_KERNEL) {
+    LLVM_DEBUG(dbgs() << "  Skipping non-kernel function (CC=" << CC
+                      << "): cannot safely update callers' RegMasks\n");
+    ++NumNonKernelsSkipped;
+    return false;
+  }
+
+  TRI = ST->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  // Calculate the VGPR limit based on occupancy.
+  unsigned Occupancy = MFI.getOccupancy();
+  VGPRLimit = ST->getMaxNumVGPRs(Occupancy, MFI.getDynamicVGPRBlockSize());
+
+  LLVM_DEBUG(dbgs() << "  Occupancy: " << Occupancy
+                    << ", VGPR Limit: " << VGPRLimit << "\n");
+
+  // Sort basic blocks by frequency (hottest first).
+  SmallVector<MachineBasicBlock *, 16> SortedBBs;
+  for (MachineBasicBlock &MBB : MF) {
+    SortedBBs.push_back(&MBB);
+  }
+
+  llvm::sort(SortedBBs, [this](MachineBasicBlock *A, MachineBasicBlock *B) {
+    return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
+  });
+
+  bool Changed = false;
+  for (MachineBasicBlock *MBB : SortedBBs) {
+    Changed |= processBasicBlock(MBB);
+  }
+
+  return Changed;
+}
+
+bool AMDGPUHotBlockRegisterRenamingImpl::processBasicBlock(
+    MachineBasicBlock *MBB) {
+  LLVM_DEBUG(dbgs() << "  Processing BB#" << MBB->getNumber() << " (freq="
+                    << MBFI->getBlockFreq(MBB).getFrequency() << ")\n");
+
+  // Clear the unmovable cache for each BB (tied operands are BB-specific).
+  UnmovableVRegs.clear();
+
+  SlotIndex BBStart = LIS->getMBBStartIdx(MBB);
+  SlotIndex BBEnd = LIS->getMBBEndIdx(MBB);
+
+  // Step 1: Calculate value density.
+  DenseMap<MCRegister, unsigned> ValueDensity;
+  calculateValueDensity(MBB, ValueDensity);
+
+  if (ValueDensity.empty()) {
+    LLVM_DEBUG(dbgs() << "    No values found, skipping\n");
+    ++NumBlocksSkipped;
+    return false;
+  }
+
+  // Step 2: Find free registers.
+  SmallVector<MCRegister, 8> FreeRegs;
+  findFreeRegisters(MBB, FreeRegs);
+
+  if (FreeRegs.empty()) {
+    LLVM_DEBUG(dbgs() << "    No free registers, skipping\n");
+    ++NumBlocksSkipped;
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "    Found " << ValueDensity.size()
+                    << " registers with values, " << FreeRegs.size()
+                    << " free registers\n");
+
+  // Step 2a: Build the PhysReg definition cache (Fix #1a).
+  // Track all SlotIndexes where each physical register is defined.
+  const TargetRegisterClass *VGPR_32_RC =
+      TRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+  DenseMap<MCRegister, SmallVector<SlotIndex, 4>> PhysRegDefs;
+
+  for (MachineInstr &MI : *MBB) {
+    SlotIndex Idx = LIS->getInstructionIndex(MI);
+    for (const MachineOperand &MO : MI.operands()) {
+      if (MO.isReg() && MO.isDef() && MO.getReg().isPhysical()) {
+        MCRegister PhysReg = MO.getReg();
+        if (VGPR_32_RC->contains(PhysReg)) {
+          PhysRegDefs[PhysReg].push_back(Idx);
+          // Also track superregs for aliasing.
+          for (MCRegister Super : TRI->superregs(PhysReg)) {
+            PhysRegDefs[Super].push_back(Idx);
+          }
+        }
+      }
+    }
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "    Built PhysRegDefs cache: " << PhysRegDefs.size()
+           << " registers have definitions in this BB\n";
+  });
+
+  // Step 3: Create a max-heap of dense registers.
+  auto Comparator = [&ValueDensity](MCRegister A, MCRegister B) {
+    return ValueDensity[A] < ValueDensity[B]; // max heap
+  };
+  std::priority_queue<MCRegister, std::vector<MCRegister>,
+                      decltype(Comparator)>
+      DenseRegsHeap(Comparator);
+
+  for (auto &Entry : ValueDensity) {
+    if (Entry.second > 1) { // Only interested in registers with density > 1.
+      DenseRegsHeap.push(Entry.first);
+    }
+  }
+
+  if (DenseRegsHeap.empty()) {
+    LLVM_DEBUG(
+        dbgs() << "    No dense registers (all density <= 1), skipping\n");
+    ++NumBlocksSkipped;
+    return false;
+  }
+
+  // Step 4: Iteratively move values.
+  bool Changed = false;
+  size_t FreeRegIdx = 0;
+
+  while (!DenseRegsHeap.empty() && FreeRegIdx < FreeRegs.size()) {
+    MCRegister DenseReg = DenseRegsHeap.top();
+    DenseRegsHeap.pop();
+
+    MCRegister FreeReg = FreeRegs[FreeRegIdx++];
+
+    if (tryMoveValue(DenseReg, FreeReg, MBB, BBStart, BBEnd, PhysRegDefs)) {
+      Changed = true;
+      ++NumValuesRemapped;
+
+      // Update the density.
+      ValueDensity[DenseReg]--;
+
+      // If still dense, put it back in the heap.
+      if (ValueDensity[DenseReg] > 1) {
+        DenseRegsHeap.push(DenseReg);
+      }
+    }
+  }
+
+  if (Changed) {
+    ++NumBlocksProcessed;
+  } else {
+    ++NumBlocksSkipped;
+  }
+
+  return Changed;
+}
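+
+// Worked example (hypothetical SlotIndexes): if $vgpr3 hosts three
+// block-local values,
+//   [16r,32r) -> %10,  [48r,64r) -> %11,  [80r,96r) -> %12
+// then ValueDensity[$vgpr3] == 3. A register holding a single local value
+// has density 1 and is never pushed onto the dense-register heap.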
+
+void AMDGPUHotBlockRegisterRenamingImpl::calculateValueDensity(
+    MachineBasicBlock *MBB, DenseMap<MCRegister, unsigned> &ValueDensity) {
+  SlotIndex BBStart = LIS->getMBBStartIdx(MBB);
+  SlotIndex BBEnd = LIS->getMBBEndIdx(MBB);
+
+  // Iterate over the VGPR_32 register class.
+  const TargetRegisterClass *VGPR_32_RC =
+      TRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+
+  for (MCRegister PhysReg : *VGPR_32_RC) {
+    if (MRI->isReserved(PhysReg))
+      continue;
+
+    unsigned LocalValueCount = 0;
+
+    // Access the LiveIntervalUnion for this PhysReg.
+    for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+      LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit];
+
+      for (LiveIntervalUnion::SegmentIter SI = LIU.begin(); SI.valid(); ++SI) {
+        SlotIndex SegStart = SI.start();
+        SlotIndex SegEnd = SI.stop();
+
+        // Check if the segment is entirely within this BB.
+        if (SegStart >= BBStart && SegEnd < BBEnd) {
+          LocalValueCount++;
+        }
+      }
+    }
+
+    if (LocalValueCount > 0) {
+      ValueDensity[PhysReg] = LocalValueCount;
+    }
+  }
+}
+
+void AMDGPUHotBlockRegisterRenamingImpl::findFreeRegisters(
+    MachineBasicBlock *MBB, SmallVectorImpl<MCRegister> &FreeRegs) {
+  SlotIndex BBStart = LIS->getMBBStartIdx(MBB);
+  SlotIndex BBEnd = LIS->getMBBEndIdx(MBB);
+
+  const TargetRegisterClass *VGPR_32_RC =
+      TRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+
+  unsigned RegIdx = 0;
+  for (MCRegister PhysReg : *VGPR_32_RC) {
+    // Only consider registers up to VGPRLimit (based on occupancy).
+    if (RegIdx >= VGPRLimit)
+      break;
+    RegIdx++;
+
+    if (MRI->isReserved(PhysReg))
+      continue;
+
+    bool IsFree = true;
+
+    // Check all register units.
+    for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+      LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit];
+
+      // Check if anything is live in this BB.
+      LiveIntervalUnion::SegmentIter SI = LIU.find(BBStart);
+      if (SI.valid() && SI.start() < BBEnd) {
+        IsFree = false;
+        break;
+      }
+    }
+
+    if (IsFree) {
+      FreeRegs.push_back(PhysReg);
+    }
+  }
+}
+
+bool AMDGPUHotBlockRegisterRenamingImpl::isVirtRegMovable(
+    Register VirtReg, MCRegister CurrentPhysReg, MCRegister TargetPhysReg,
+    SlotIndex BBStart, SlotIndex BBEnd) {
+  LiveInterval &VirtRegLI = LIS->getInterval(VirtReg);
+
+  // Verify the precondition: a single value with a single segment in the BB.
+  unsigned SegmentCount = 0;
+  for (const LiveRange::Segment &S : VirtRegLI) {
+    if (S.start >= BBStart && S.end <= BBEnd)
+      SegmentCount++;
+  }
+
+  // Cannot move registers with multiple segments in the BB (e.g., PHIs).
+  if (SegmentCount != 1) {
+    LLVM_DEBUG(dbgs() << "      Cannot move " << printReg(VirtReg, TRI)
+                      << ": has " << SegmentCount << " segments in BB\n");
+    return false;
+  }
+
+  // Cannot move registers with multiple definitions (e.g., from a PHI merge).
+  if (VirtRegLI.getNumValNums() != 1) {
+    LLVM_DEBUG(dbgs() << "      Cannot move " << printReg(VirtReg, TRI)
+                      << ": has " << VirtRegLI.getNumValNums()
+                      << " value definitions\n");
+    return false;
+  }
+
+  // Check for tied operands. A tied operand means the instruction requires
+  // source and destination to be the same physical register. Moving such a
+  // value would break this constraint.
+  for (const LiveRange::Segment &S : VirtRegLI) {
+    // Only check segments within this BB.
+    if (S.start < BBStart || S.end > BBEnd)
+      continue;
+
+    // Check if this segment starts at a tied def point (meaning it is the
+    // destination of a tied-operand instruction).
+    MachineInstr *DefMI = LIS->getInstructionFromIndex(S.start);
+    if (!DefMI)
+      continue;
+
+    for (unsigned OpIdx = 0, E = DefMI->getNumOperands(); OpIdx < E; ++OpIdx) {
+      const MachineOperand &MO = DefMI->getOperand(OpIdx);
+      if (MO.isReg() && MO.getReg() == VirtReg && MO.isDef() && MO.isTied()) {
+        // Found a tied def - check the source operand it is tied to.
+        unsigned TiedIdx = DefMI->findTiedOperandIdx(OpIdx);
+        const MachineOperand &TiedMO = DefMI->getOperand(TiedIdx);
+
+        // If the tied source is a register, verify it won't conflict.
+        if (TiedMO.isReg()) {
+          Register TiedReg = TiedMO.getReg();
+          if (TiedReg.isVirtual()) {
+            MCRegister TiedPhysReg = VRM->getPhys(TiedReg);
+            // Cannot move if it would violate the tied constraint (source
+            // and dest must be in the same physical register).
+            if (TiedPhysReg != CurrentPhysReg) {
+              LLVM_DEBUG(dbgs()
+                         << "      Cannot move " << printReg(VirtReg, TRI)
+                         << ": tied to " << printReg(TiedReg, TRI)
+                         << " which is in different PhysReg "
+                         << printReg(TiedPhysReg, TRI) << " at " << S.start
+                         << " in " << *DefMI);
+              return false;
+            }
+          }
+        }
+
+        LLVM_DEBUG(dbgs() << "      Cannot move " << printReg(VirtReg, TRI)
+                          << ": has tied def at " << S.start << " in "
+                          << *DefMI);
+        return false;
+      }
+    }
+  }
+
+  // Future checks can be added here:
+  // - Register class constraints
+  // - Special register restrictions
+  // - Architecture-specific constraints
+
+  return true;
+}
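+
+// Example of the hazard the tied-operand check above guards against
+// (hypothetical MIR): V_MAC-style instructions require dst and src2 to share
+// a register,
+//   $vgpr3 = V_MAC_F32_e32 $vgpr0, $vgpr1, $vgpr3(tied)
+// so renaming only the def to a free register would silently break the
+// constraint; such values are cached in UnmovableVRegs and left in place.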
+
+bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(
+    MCRegister DenseReg, MCRegister FreeReg, MachineBasicBlock *MBB,
+    SlotIndex BBStart, SlotIndex BBEnd,
+    const DenseMap<MCRegister, SmallVector<SlotIndex, 4>> &PhysRegDefs) {
+  // Find a movable local value in DenseReg.
+  for (MCRegUnit Unit : TRI->regunits(DenseReg)) {
+    LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit];
+
+    for (LiveIntervalUnion::SegmentIter SI = LIU.begin(); SI.valid(); ++SI) {
+      Register VirtReg = SI.value()->reg();
+
+      // Skip physical registers (LiveIntervalUnion can contain both).
+      if (!VirtReg.isVirtual())
+        continue;
+
+      // Skip virtual registers that haven't been allocated yet.
+      if (!VRM->hasPhys(VirtReg))
+        continue;
+
+      // Check if this VirtReg is mapped to DenseReg.
+      // NOTE: This is NOT redundant! We iterate per register unit, and units
+      // can be shared between aliased registers (e.g., VGPR0 and
+      // VGPR0_VGPR1). This check filters out VirtRegs mapped to aliased
+      // registers.
+      if (VRM->getPhys(VirtReg) != DenseReg)
+        continue;
+
+      // Get the proper LiveInterval from LiveIntervals.
+      LiveInterval &VirtRegLI = LIS->getInterval(VirtReg);
+
+      // Check: the segment is local (entirely within the BB).
+      SlotIndex SegStart = SI.start();
+      SlotIndex SegEnd = SI.stop();
+      if (SegStart < BBStart || SegEnd >= BBEnd)
+        continue;
+
+      // Check: the LiveInterval has only one segment (conservative).
+      if (VirtRegLI.size() != 1)
+        continue;
+
+      // Check: no subranges (conservative - avoid complex cases).
+      if (VirtRegLI.hasSubRanges())
+        continue;
+
+      // Check: no allocation hints.
+      if (VRM->hasKnownPreference(VirtReg))
+        continue;
+
+      // Check: cached unmovable VirtRegs.
+      if (UnmovableVRegs.contains(VirtReg)) {
+        LLVM_DEBUG(dbgs() << "      Skipping " << printReg(VirtReg, TRI)
+                          << " (cached as unmovable)\n");
+        continue;
+      }
+
+      // Check: can this value be safely moved?
+      if (!isVirtRegMovable(VirtReg, DenseReg, FreeReg, BBStart, BBEnd)) {
+        // Cache the result to avoid checking again.
+        UnmovableVRegs.insert(VirtReg);
+        continue;
+      }
+
+      // Fix #1a: Check that FreeReg is not redefined in VirtReg's live range.
+      auto DefIt = PhysRegDefs.find(FreeReg);
+      if (DefIt != PhysRegDefs.end()) {
+        bool HasConflict = false;
+        for (SlotIndex DefIdx : DefIt->second) {
+          // A definition strictly inside the live range (not at an endpoint)
+          // conflicts.
+          if (DefIdx > SegStart && DefIdx < SegEnd) {
+            LLVM_DEBUG(dbgs()
+                       << "      Cannot move to " << printReg(FreeReg, TRI)
+                       << ": redefined at " << DefIdx
+                       << " inside live range [" << SegStart << ", " << SegEnd
+                       << ")\n");
+            HasConflict = true;
+            break;
+          }
+        }
+        if (HasConflict)
+          continue; // Try the next VirtReg.
+      }
+
+      // Fix #1b: Check that FreeReg is not clobbered by any call in the live
+      // range.
+      BitVector UsableRegs;
+      if (LIS->checkRegMaskInterference(VirtRegLI, UsableRegs)) {
+        // checkRegMaskInterference returns true if LI crosses RegMask
+        // instructions; UsableRegs then contains the registers NOT clobbered
+        // by any RegMask.
+        if (!UsableRegs.test(FreeReg)) {
+          LLVM_DEBUG(dbgs()
+                     << "      Cannot move to " << printReg(FreeReg, TRI)
+                     << ": clobbered by call RegMask in live range\n");
+          continue; // Try the next VirtReg.
+        }
+      }
+
+      // This VirtReg is movable! Perform the remap.
+      LLVM_DEBUG(dbgs() << "      Moving " << printReg(VirtReg, TRI)
+                        << " from " << printReg(DenseReg, TRI) << " to "
+                        << printReg(FreeReg, TRI) << "\n");
+
+      // Safety check: must be assigned before unassign.
+      if (!VRM->hasPhys(VirtReg)) {
+        LLVM_DEBUG(dbgs() << "      WARNING: VirtReg not assigned, skipping\n");
+        continue;
+      }
+
+      LRM->unassign(VirtRegLI);      // Remove from LiveRegMatrix.
+      LRM->assign(VirtRegLI, FreeReg); // Assign to the new physreg (this
+                                       // updates VirtRegMap too).
+
+      // Sanity check: verify VirtReg is now mapped to FreeReg.
+      assert(VRM->getPhys(VirtReg) == FreeReg &&
+             "VirtRegMap not updated correctly");
+
+      return true; // Successfully moved one value.
+    }
+  }
+
+  return false; // No movable value found.
+}
+
+PreservedAnalyses
+AMDGPUHotBlockRegisterRenamingPass::run(MachineFunction &MF,
+                                        MachineFunctionAnalysisManager &MFAM) {
+  VirtRegMap *VRM = &MFAM.getResult<VirtRegMapAnalysis>(MF);
+  LiveRegMatrix *LRM = &MFAM.getResult<LiveRegMatrixAnalysis>(MF);
+  LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
+  MachineBlockFrequencyInfo *MBFI =
+      &MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
+
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+  AMDGPUHotBlockRegisterRenamingImpl Impl(VRM, LRM, LIS, MBFI, ST, MFI);
+  if (!Impl.run(MF))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h
new file mode 100644
index 0000000000000..6dfdd1bec72ef
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h
@@ -0,0 +1,34 @@
+//===-- AMDGPUHotBlockRegisterRenaming.h -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Reduces value density in hot basic blocks by remapping local values
+/// from overused physical registers to free physical registers.
+///
+/// This gives the post-RA scheduler more flexibility to reorder instructions
+/// by reducing false dependencies created by register reuse.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUHOTBLOCKREGISTERRENAMING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUHOTBLOCKREGISTERRENAMING_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class AMDGPUHotBlockRegisterRenamingPass
+    : public PassInfoMixin<AMDGPUHotBlockRegisterRenamingPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUHOTBLOCKREGISTERRENAMING_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 5cd9bcf4a96fd..a299b239cd020 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -122,6 +122,7 @@ MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
 MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass())
 MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass())
 MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass())
+MACHINE_FUNCTION_PASS("amdgpu-hot-block-reg-renaming", AMDGPUHotBlockRegisterRenamingPass())
 MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass())
 MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass())
 MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a9a0a4a0cc3e4..d3c2e0642cf9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -21,6 +21,7 @@
 #include "AMDGPUCtorDtorLowering.h"
 #include "AMDGPUExportClustering.h"
 #include "AMDGPUExportKernelRuntimeHandles.h"
+#include "AMDGPUHotBlockRegisterRenaming.h"
 #include "AMDGPUIGroupLP.h"
 #include "AMDGPUISelDAGToDAG.h"
 #include "AMDGPULowerVGPREncoding.h"
@@ -169,13 +170,13 @@ class AMDGPUCodeGenPassBuilder
 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
 public:
   SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
-    : RegisterRegAllocBase(N, D, C) {}
+      : RegisterRegAllocBase(N, D, C) {}
 };

 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
 public:
   VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
-    : RegisterRegAllocBase(N, D, C) {}
+      : RegisterRegAllocBase(N, D, C) {}
 };

 class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
@@ -218,19 +219,21 @@ static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
 static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;

 static SGPRRegisterRegAlloc
-defaultSGPRRegAlloc("default",
-                    "pick SGPR register allocator based on -O option",
-                    useDefaultRegisterAllocator);
+    defaultSGPRRegAlloc("default",
+                        "pick SGPR register allocator based on -O option",
+                        useDefaultRegisterAllocator);

 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
                RegisterPassParser<SGPRRegisterRegAlloc>>
-SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
-             cl::desc("Register allocator to use for SGPRs"));
+    SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
+                 cl::init(&useDefaultRegisterAllocator),
+                 cl::desc("Register allocator to use for SGPRs"));

 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
                RegisterPassParser<VGPRRegisterRegAlloc>>
-VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
-             cl::desc("Register allocator to use for VGPRs"));
+    VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
+                 cl::init(&useDefaultRegisterAllocator),
+                 cl::desc("Register allocator to use for VGPRs"));

 static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
                RegisterPassParser<WWMRegisterRegAlloc>>
@@ -301,22 +304,25 @@ static FunctionPass *createFastWWMRegisterAllocator() {
   return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
 }

-static SGPRRegisterRegAlloc basicRegAllocSGPR(
-    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
-static SGPRRegisterRegAlloc greedyRegAllocSGPR(
-    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
-
-static SGPRRegisterRegAlloc fastRegAllocSGPR(
-    "fast", "fast register allocator", createFastSGPRRegisterAllocator);
+static SGPRRegisterRegAlloc basicRegAllocSGPR("basic",
+                                              "basic register allocator",
+                                              createBasicSGPRRegisterAllocator);
+static SGPRRegisterRegAlloc
+    greedyRegAllocSGPR("greedy", "greedy register allocator",
+                       createGreedySGPRRegisterAllocator);
+static SGPRRegisterRegAlloc fastRegAllocSGPR("fast", "fast register allocator",
+                                             createFastSGPRRegisterAllocator);

-static VGPRRegisterRegAlloc basicRegAllocVGPR(
-    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
-static VGPRRegisterRegAlloc greedyRegAllocVGPR(
-    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc basicRegAllocVGPR("basic",
+                                              "basic register allocator",
+                                              createBasicVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc
+    greedyRegAllocVGPR("greedy", "greedy register allocator",
+                       createGreedyVGPRRegisterAllocator);

-static VGPRRegisterRegAlloc fastRegAllocVGPR(
-    "fast", "fast register allocator", createFastVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc fastRegAllocVGPR("fast", "fast register allocator",
+                                             createFastVGPRRegisterAllocator);

 static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
                                                "basic register allocator",
@@ -334,14 +340,14 @@ static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
 } // anonymous namespace

 static cl::opt<bool>
-EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
-                        cl::desc("Run early if-conversion"),
-                        cl::init(false));
+    EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
+                            cl::desc("Run early if-conversion"),
+                            cl::init(false));

 static cl::opt<bool>
-OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
-                 cl::desc("Run pre-RA exec mask optimizations"),
-                 cl::init(true));
+    OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
+                     cl::desc("Run pre-RA exec mask optimizations"),
+                     cl::init(true));

 static cl::opt<bool>
     LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
@@ -349,32 +355,27 @@ static cl::opt<bool>
                   cl::init(true), cl::Hidden);

 // Option to disable vectorizer for tests.
-static cl::opt<bool> EnableLoadStoreVectorizer(
-    "amdgpu-load-store-vectorizer",
-    cl::desc("Enable load store vectorizer"),
-    cl::init(true),
-    cl::Hidden);
+static cl::opt<bool>
+    EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer",
+                              cl::desc("Enable load store vectorizer"),
+                              cl::init(true), cl::Hidden);

 // Option to control global loads scalarization
-static cl::opt<bool> ScalarizeGlobal(
-    "amdgpu-scalarize-global-loads",
-    cl::desc("Enable global load scalarization"),
-    cl::init(true),
-    cl::Hidden);
+static cl::opt<bool>
+    ScalarizeGlobal("amdgpu-scalarize-global-loads",
+                    cl::desc("Enable global load scalarization"),
+                    cl::init(true), cl::Hidden);

 // Option to run internalize pass.
 static cl::opt<bool> InternalizeSymbols(
-    "amdgpu-internalize-symbols",
-    cl::desc("Enable elimination of non-kernel functions and unused globals"),
-    cl::init(false),
-    cl::Hidden);
+    "amdgpu-internalize-symbols",
+    cl::desc("Enable elimination of non-kernel functions and unused globals"),
+    cl::init(false), cl::Hidden);

 // Option to inline all early.
-static cl::opt<bool> EarlyInlineAll(
-    "amdgpu-early-inline-all",
-    cl::desc("Inline all functions early"),
-    cl::init(false),
-    cl::Hidden);
+static cl::opt<bool> EarlyInlineAll("amdgpu-early-inline-all",
+                                    cl::desc("Inline all functions early"),
+                                    cl::init(false), cl::Hidden);

 static cl::opt<bool> RemoveIncompatibleFunctions(
     "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
@@ -382,39 +383,40 @@ static cl::opt<bool> RemoveIncompatibleFunctions(
         "use features not supported by the target GPU"),
     cl::init(true));

-static cl::opt<bool> EnableSDWAPeephole(
-    "amdgpu-sdwa-peephole",
-    cl::desc("Enable SDWA peepholer"),
-    cl::init(true));
+static cl::opt<bool> EnableSDWAPeephole("amdgpu-sdwa-peephole",
+                                        cl::desc("Enable SDWA peepholer"),
+                                        cl::init(true));

-static cl::opt<bool> EnableDPPCombine(
-    "amdgpu-dpp-combine",
-    cl::desc("Enable DPP combiner"),
-    cl::init(true));
+static cl::opt<bool> EnableDPPCombine("amdgpu-dpp-combine",
+                                      cl::desc("Enable DPP combiner"),
+                                      cl::init(true));

 // Enable address space based alias analysis
-static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
-    cl::desc("Enable AMDGPU Alias Analysis"),
-    cl::init(true));
+static cl::opt<bool>
+    EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
+                              cl::desc("Enable AMDGPU Alias Analysis"),
+                              cl::init(true));

 // Enable lib calls simplifications
-static cl::opt<bool> EnableLibCallSimplify(
-    "amdgpu-simplify-libcall",
-    cl::desc("Enable amdgpu library simplifications"),
-    cl::init(true),
-    cl::Hidden);
+static cl::opt<bool>
+    EnableLibCallSimplify("amdgpu-simplify-libcall",
+                          cl::desc("Enable amdgpu library simplifications"),
+                          cl::init(true), cl::Hidden);

 static cl::opt<bool> EnableLowerKernelArguments(
-    "amdgpu-ir-lower-kernel-arguments",
-    cl::desc("Lower kernel argument loads in IR pass"),
-    cl::init(true),
-    cl::Hidden);
+    "amdgpu-ir-lower-kernel-arguments",
+    cl::desc("Lower kernel argument loads in IR pass"), cl::init(true),
+    cl::Hidden);

 static cl::opt<bool> EnableRegReassign(
-    "amdgpu-reassign-regs",
-    cl::desc("Enable register reassign optimizations on gfx10+"),
-    cl::init(true),
-    cl::Hidden);
+    "amdgpu-reassign-regs",
+    cl::desc("Enable register reassign optimizations on gfx10+"),
+    cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableHotBlockRegRenaming(
+    "amdgpu-enable-hot-block-reg-renaming",
+    cl::desc("Enable hot block register renaming to reduce value density"),
+    cl::init(true), cl::Hidden);

 static cl::opt<bool> OptVGPRLiveRange(
     "amdgpu-opt-vgpr-liverange",
@@ -432,11 +434,10 @@ static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
         clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));

 // Enable Mode register optimization
-static cl::opt<bool> EnableSIModeRegisterPass(
-    "amdgpu-mode-register",
-    cl::desc("Enable mode register pass"),
-    cl::init(true),
-    cl::Hidden);
+static cl::opt<bool>
+    EnableSIModeRegisterPass("amdgpu-mode-register",
+                             cl::desc("Enable mode register pass"),
+                             cl::init(true), cl::Hidden);

 // Enable GFX11+ s_delay_alu insertion
 static cl::opt<bool>
@@ -452,19 +453,16 @@ static cl::opt<bool>

 // Option is used in lit tests to prevent deadcoding of patterns inspected.
 static cl::opt<bool>
-EnableDCEInRA("amdgpu-dce-in-ra",
-    cl::init(true), cl::Hidden,
-    cl::desc("Enable machine DCE inside regalloc"));
+    EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden,
+                  cl::desc("Enable machine DCE inside regalloc"));

 static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                            cl::desc("Adjust wave priority"),
                                            cl::init(false), cl::Hidden);

-static cl::opt<bool> EnableScalarIRPasses(
-    "amdgpu-scalar-ir-passes",
-    cl::desc("Enable scalar IR passes"),
-    cl::init(true),
-    cl::Hidden);
+static cl::opt<bool> EnableScalarIRPasses("amdgpu-scalar-ir-passes",
+                                          cl::desc("Enable scalar IR passes"),
+                                          cl::init(true), cl::Hidden);

 static cl::opt<bool>
     EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
@@ -477,10 +475,10 @@ static cl::opt<bool> EnableLowerModuleLDS(
     cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
     cl::Hidden);

-static cl::opt<bool> EnablePreRAOptimizations(
-    "amdgpu-enable-pre-ra-optimizations",
-    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
-    cl::Hidden);
+static cl::opt<bool>
+    EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations",
+                             cl::desc("Enable Pre-RA optimizations pass"),
+                             cl::init(true), cl::Hidden);

 static cl::opt<bool> EnablePromoteKernelArguments(
     "amdgpu-enable-promote-kernel-arguments",
@@ -507,10 +505,10 @@ static cl::opt<bool> EnableRewritePartialRegUses(
     cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
     cl::Hidden);

-static cl::opt<bool> EnableHipStdPar(
-    "amdgpu-enable-hipstdpar",
-    cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
-    cl::Hidden);
+static cl::opt<bool>
+    EnableHipStdPar("amdgpu-enable-hipstdpar",
+                    cl::desc("Enable HIP Standard Parallelism Offload support"),
+                    cl::init(false), cl::Hidden);

 static cl::opt<bool>
     EnableAMDGPUAttributor("amdgpu-attributor-enable",
@@ -613,6 +611,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
   initializeAMDGPUResourceUsageAnalysisWrapperPassPass(*PR);
   initializeGCNNSAReassignLegacyPass(*PR);
+  initializeAMDGPUHotBlockRegisterRenamingLegacyPass(*PR);
   initializeGCNPreRAOptimizationsLegacyPass(*PR);
   initializeGCNPreRALongBranchRegLegacyPass(*PR);
   initializeGCNRewritePartialRegUsesLegacyPass(*PR);
@@ -634,8 +633,8 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
 static ScheduleDAGInstrs *
 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
-  ScheduleDAGMILive *DAG =
-      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
+  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
+      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
@@ -698,14 +697,13 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) {
   return DAG;
 }

-static MachineSchedRegistry
-SISchedRegistry("si", "Run SI's custom scheduler",
-                createSIMachineScheduler);
+static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler",
+                                            createSIMachineScheduler);

 static MachineSchedRegistry
-GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
-                             "Run GCN scheduler to maximize occupancy",
-                             createGCNMaxOccupancyMachineScheduler);
+    GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
+                                 "Run GCN scheduler to maximize occupancy",
+                                 createGCNMaxOccupancyMachineScheduler);

 static MachineSchedRegistry
     GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
@@ -962,7 +960,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
   PB.registerFullLinkTimeOptimizationLastEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
-
         // Promote kernel arguments to global address space for LLVM IR
         // generated by flang compiler
         FunctionPassManager FPM;
@@ -1395,7 +1392,7 @@ void AMDGPUPassConfig::addIRPasses() {
                                    AAResults &AAR) {
         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUExternalAAWrapper>())
           AAR.addAAResult(WrapperPass->getResult());
-        }));
+      }));
   }

   if (TM.getTargetTriple().isAMDGCN()) {
@@ -1655,6 +1652,10 @@ void GCNPassConfig::addOptimizedRegAlloc() {
 }

 bool GCNPassConfig::addPreRewrite() {
+  // Hot block register renaming to reduce value density.
+  if (TM->getOptLevel() > CodeGenOptLevel::None && EnableHotBlockRegRenaming)
+    addPass(&AMDGPUHotBlockRegisterRenamingID);
+
   if (EnableRegReassign)
     addPass(&GCNNSAReassignID);

@@ -2013,8 +2014,8 @@
                                 AMDGPU::SGPR_32RegClass,
                                 MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
         parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
-                              AMDGPU::SGPR_32RegClass,
-                              MFI->ArgInfo.LDSKernelId, 0, 1) ||
+                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.LDSKernelId,
+                              0, 1) ||
         parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                               AMDGPU::SGPR_32RegClass,
                               MFI->ArgInfo.WorkGroupIDX, 0, 1) ||
@@ -2037,14 +2038,14 @@
                               AMDGPU::SReg_64RegClass,
                               MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
         parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
-                              AMDGPU::VGPR_32RegClass,
-                              MFI->ArgInfo.WorkItemIDX, 0, 0) ||
+                              AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDX,
+                              0, 0) ||
         parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
-                              AMDGPU::VGPR_32RegClass,
-                              MFI->ArgInfo.WorkItemIDY, 0, 0) ||
+                              AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDY,
+                              0, 0) ||
         parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
-                              AMDGPU::VGPR_32RegClass,
-                              MFI->ArgInfo.WorkItemIDZ, 0, 0)))
+                              AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDZ,
+                              0, 0)))
     return true;

   if (ST.hasIEEEMode())
@@ -2251,6 +2252,11 @@ Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
 }

 void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const {
+  // Hot block register renaming to reduce value density.
+  if (TM.getOptLevel() > CodeGenOptLevel::None && EnableHotBlockRegRenaming) {
+    addPass(AMDGPUHotBlockRegisterRenamingPass());
+  }
+
   if (EnableRegReassign) {
     addPass(GCNNSAReassignPass());
   }
@@ -2353,7 +2359,6 @@ Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
   // For allocating per-thread VGPRs.
   addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}));
-
   addPreRewrite(addPass);
   addPass(VirtRegRewriterPass(true));
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 8c0f9d01a7d30..b2d45438021ce 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -63,6 +63,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPUGlobalISelUtils.cpp
+  AMDGPUHotBlockRegisterRenaming.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir b/llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir
new file mode 100644
index 0000000000000..28c9c16f248d1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir
@@ -0,0 +1,146 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-enable-hot-block-reg-renaming -verify-machineinstrs -run-pass=greedy,amdgpu-hot-block-reg-renaming,virtregrewriter -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @test_basic_move() { ret void }
+  define amdgpu_kernel void @test_tied_operand() { ret void }
+  define amdgpu_kernel void @test_no_free_registers() #0 { ret void }
+
+  attributes #0 = { "amdgpu-num-vgpr"="8" }
+...
+
+---
+# Test 1: Verify that we correctly move a value when it is safe to do so.
+# Multiple values are allocated to vgpr3; one should be moved to a free
+# register.
+# CHECK-LABEL: name: test_basic_move
+# CHECK: bb.1:
+# CHECK-NOT: renamable $vgpr3 = V_ADD_F32_e64 0, renamable $vgpr0, 0, renamable $vgpr1
+# CHECK-NOT: renamable $vgpr3 = V_MUL_F32_e64 0, killed renamable $vgpr3, 0, renamable $vgpr2
+# CHECK: renamable $vgpr{{[4-9]|[1-5][0-9]}} = V_MUL_F32_e64
+name: test_basic_move
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr2' }
+  - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 4, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 5, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 6, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 7, class: vgpr_32, preferred-register: '$vgpr3' }
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr2
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    ; High value density: %3, %4, %5, %6, %7 all prefer vgpr3.
+    ; Some should be moved to free registers.
+    %3:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %5:vgpr_32 = V_MUL_F32_e64 0, %3, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %6:vgpr_32 = V_MUL_F32_e64 0, %3, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %7:vgpr_32 = V_FMA_F32_e64 0, %3, 0, %2, 0, %4, 0, 0, implicit $mode, implicit $exec
+    S_CBRANCH_EXECNZ %bb.1, implicit $exec

+  bb.2:
+    S_ENDPGM 0, implicit %7
+
+...
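+# Note: every value checked above is block-local to bb.1 (defined and last
+# used inside the loop body), which is the only kind of value the pass will
+# move; values live across the backedge, like %0-%2, are never candidates.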
+---
+# Test 2: Verify that we do NOT move values with tied operands.
+# V_MAC_F32 has a tied def-use, so it should not be moved.
+# CHECK-LABEL: name: test_tied_operand
+# CHECK: bb.1:
+# CHECK: renamable $vgpr3 = V_ADD_F32_e32
+# CHECK-NEXT: renamable $vgpr3 = V_MAC_F32_e32
+# CHECK-NEXT: renamable $vgpr3 = V_MUL_F32_e64
+name: test_tied_operand
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 4, class: vgpr_32, preferred-register: '$vgpr3' }
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    ; %2 and %3 both prefer vgpr3, but %3 has a tied operand.
+    %2:vgpr_32 = V_ADD_F32_e32 %1, %0, implicit $mode, implicit $exec
+    %3:vgpr_32 = V_MAC_F32_e32 %0, %1, %2, implicit $mode, implicit $exec
+    ; The pass should NOT move %3 because it has a tied def operand.
+    %4:vgpr_32 = V_MUL_F32_e64 0, %3, 0, %0, 0, 0, implicit $mode, implicit $exec
+    S_CBRANCH_EXECNZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0, implicit %4
+
+...
+---
+# Test 3: Verify that we do NOT move when there are no free registers.
+# With only 8 VGPRs available and all in use, no moves should happen.
+# CHECK-LABEL: name: test_no_free_registers
+# CHECK: bb.1:
+# CHECK: renamable $vgpr3 = V_ADD_F32_e64
+# CHECK: renamable $vgpr3 = V_MUL_F32_e64
+name: test_no_free_registers
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr2' }
+  - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 4, class: vgpr_32, preferred-register: '$vgpr4' }
+  - { id: 5, class: vgpr_32, preferred-register: '$vgpr5' }
+  - { id: 6, class: vgpr_32, preferred-register: '$vgpr6' }
+  - { id: 7, class: vgpr_32, preferred-register: '$vgpr7' }
+  - { id: 8, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 9, class: vgpr_32, preferred-register: '$vgpr3' }
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    ; Fill up vgpr2-7 to leave no free registers.
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    %4:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
+    %5:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
+    %6:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
+    %7:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    ; With all registers occupied, the pass should not move values.
+    %8:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %9:vgpr_32 = V_MUL_F32_e64 0, %8, 0, %1, 0, 0, implicit $mode, implicit $exec
+    S_CBRANCH_EXECNZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %9
+
+...