diff --git a/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/llvm/include/llvm/CodeGen/LiveRangeEdit.h
index d0ed3ff660d9b..658231c31481c 100644
--- a/llvm/include/llvm/CodeGen/LiveRangeEdit.h
+++ b/llvm/include/llvm/CodeGen/LiveRangeEdit.h
@@ -20,6 +20,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -58,6 +59,11 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate {
     /// Called before shrinking the live range of a virtual register.
     virtual void LRE_WillShrinkVirtReg(Register) {}

+    /// Called when a virtual register's LiveInterval is about to become
+    /// empty, i.e. when removeVRegDefAt removes its last definition.
+    /// Implementations should unassign the register from LiveRegMatrix
+    /// before the interval is cleared.
+    virtual void LRE_WillClearVirtReg(Register, LiveInterval &) {}
+
     /// Called after cloning a virtual register.
     /// This is used for new registers representing connected components of Old.
     virtual void LRE_DidCloneVirtReg(Register New, Register Old) {}
@@ -75,6 +81,11 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate {
   /// FirstNew - Index of the first register added to NewRegs.
   const unsigned FirstNew;

+  /// Track which virtual registers are new (created during this edit).
+  /// Used to avoid calling Matrix->unassign on registers that were never
+  /// added to LiveRegMatrix.
+  SmallSet<Register, 16> NewVirtRegs;
+
   /// DeadRemats - The saved instructions which have already been dead after
   /// rematerialization but not deleted yet -- to be done in postOptimization.
   SmallPtrSet<MachineInstr *, 32> *DeadRemats;
@@ -142,6 +153,12 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate {
   bool empty() const { return size() == 0; }
   Register get(unsigned idx) const { return NewRegs[idx + FirstNew]; }

+  /// Check if a virtual register was created during this edit.
+  /// This is used to identify registers that were never added to
+  /// LiveRegMatrix.
+  bool isNewVirtualRegister(Register VReg) const {
+    return NewVirtRegs.contains(VReg);
+  }
+
   /// pop_back - It allows LiveRangeEdit users to drop new registers.
   /// The context is when an original def instruction of a register is
   /// dead after rematerialization, we still want to keep it for following
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index c3e0964594bd5..006a2a4bff0f3 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -86,6 +86,7 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate {
   const TargetInstrInfo &TII;
   const TargetRegisterInfo &TRI;
   const MachineBlockFrequencyInfo &MBFI;
+  LiveRegMatrix *Matrix;

   InsertPointAnalysis IPA;
@@ -129,16 +130,18 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate {
 public:
   HoistSpillHelper(const Spiller::RequiredAnalyses &Analyses,
-                   MachineFunction &mf, VirtRegMap &vrm)
+                   MachineFunction &mf, VirtRegMap &vrm,
+                   LiveRegMatrix *matrix = nullptr)
       : MF(mf), LIS(Analyses.LIS), LSS(Analyses.LSS), MDT(Analyses.MDT),
        VRM(vrm), MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()),
        TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(Analyses.MBFI),
-       IPA(LIS, mf.getNumBlockIDs()) {}
+       Matrix(matrix), IPA(LIS, mf.getNumBlockIDs()) {}

   void addToMergeableSpills(MachineInstr &Spill, int StackSlot,
                             Register Original);
   bool rmFromMergeableSpills(MachineInstr &Spill, int StackSlot);
   void hoistAllSpills();
+  void LRE_WillClearVirtReg(Register, LiveInterval &) override;
   void LRE_DidCloneVirtReg(Register, Register) override;
 };
@@ -191,7 +194,7 @@ class InlineSpiller : public Spiller {
       : MF(MF), LIS(Analyses.LIS), LSS(Analyses.LSS), VRM(VRM),
         MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()),
         TRI(*MF.getSubtarget().getRegisterInfo()), Matrix(Matrix),
-        HSpiller(Analyses, MF, VRM), VRAI(VRAI) {}
+        HSpiller(Analyses, MF, VRM, Matrix), VRAI(VRAI) {}

   void spill(LiveRangeEdit &, AllocationOrder *Order = nullptr) override;
   ArrayRef<Register> getSpilledRegs() override { return RegsToSpill; }
@@ -1750,6 +1753,17 @@ void HoistSpillHelper::hoistAllSpills() {
   }
 }

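+// Illustrative failure mode the new callback prevents (a sketch, not code
+// from this patch):
+//
+//   LIS.removeVRegDefAt(LI, Idx); // LI is now empty
+//   ...
+//   Matrix->unassign(LI);         // too late: the LiveIntervalUnion lookup
+//                                 // needs LI's already-deleted segments
+//
+// leaves stale LiveInterval pointers behind in the LiveIntervalUnion.
+// Calling LRE_WillClearVirtReg while LI still holds its last segment lets
+// unassign remove every union entry cleanly.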
+/// Called when a LiveInterval is about to be cleared by removeVRegDefAt.
+/// Unassign it from LiveRegMatrix to prevent dangling pointers (fixes LLVM
+/// bug #48911).
+void HoistSpillHelper::LRE_WillClearVirtReg(Register VirtReg,
+                                            LiveInterval &LI) {
+  // If this virtual register is assigned to a physical register, unassign it
+  // from LiveRegMatrix before the interval is cleared. Otherwise, the
+  // LiveIntervalUnion will contain dangling pointers.
+  if (Matrix && VRM.hasPhys(VirtReg))
+    Matrix->unassign(LI);
+}
+
 /// For VirtReg clone, the \p New register should have the same physreg or
 /// stackslot as the \p old register.
 void HoistSpillHelper::LRE_DidCloneVirtReg(Register New, Register Old) {
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 5b0365da4e8c6..fbf5da3c58366 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -268,6 +268,11 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
     if (MO.isDef()) {
       if (TheDelegate && LI.getVNInfoAt(Idx) != nullptr)
         TheDelegate->LRE_WillShrinkVirtReg(LI.reg());
+      // Notify the delegate BEFORE clearing if this will make the interval
+      // empty, so Matrix->unassign still sees a non-empty LiveRange. Skip
+      // new registers (clones): they were never added to LiveRegMatrix.
+      if (TheDelegate && LI.size() == 1 && !isNewVirtualRegister(Reg))
+        TheDelegate->LRE_WillClearVirtReg(Reg, LI);
       LIS.removeVRegDefAt(LI, Idx);
       if (LI.empty())
         RegsToErase.push_back(Reg);
@@ -398,6 +403,7 @@ LiveRangeEdit::MRI_NoteNewVirtualRegister(Register VReg) {
     VRM->grow();

   NewRegs.push_back(VReg);
+  NewVirtRegs.insert(VReg);
 }

 void LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 8fe324728405f..0784098f3326b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -545,6 +545,9 @@ extern char &AMDGPUExportKernelRuntimeHandlesLegacyID;
 void initializeGCNNSAReassignLegacyPass(PassRegistry &);
 extern char &GCNNSAReassignID;

+void initializeAMDGPUHotBlockRegisterRenamingLegacyPass(PassRegistry &);
+extern char &AMDGPUHotBlockRegisterRenamingID;
+
 void initializeGCNPreRALongBranchRegLegacyPass(PassRegistry &);
 extern char &GCNPreRALongBranchRegID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
new file mode 100644
index 0000000000000..c4c16c56f17c8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
@@ -0,0 +1,632 @@
+//===-- AMDGPUHotBlockRegisterRenaming.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Reduces value density in hot basic blocks by remapping local values
+/// from overused physical registers to free physical registers.
+///
+/// This gives the post-RA scheduler more flexibility to reorder instructions
+/// by reducing false dependencies created by register reuse.
+///
+/// Algorithm:
+/// 1. Sort basic blocks by frequency (hottest first)
+/// 2. For each BB:
+///    a. Calculate value density (count of local values per PhysReg)
+///    b. Identify free PhysRegs (completely unused in this BB)
+///    c. Iteratively move local values from dense to free registers
+/// 3. VirtRegRewriter applies the updated VirtRegMap
+///
+/// Constraints (conservative):
+/// - Only move 32-bit VGPRs
+/// - Only move local values (single segment, entirely within BB)
+/// - Only move to completely free registers
+/// - Skip values with allocation hints
+/// - Skip reserved registers
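+///
+/// Illustrative example (hypothetical MIR; register numbers are invented):
+///
+///   before:  $vgpr3 = V_ADD_F32 ...   ; value A
+///            ... uses of $vgpr3 ...
+///            $vgpr3 = V_MUL_F32 ...   ; value B reuses $vgpr3
+///
+///   after:   $vgpr3 = V_ADD_F32 ...   ; value A
+///            ... uses of $vgpr3 ...
+///            $vgpr7 = V_MUL_F32 ...   ; value B moved to the free $vgpr7
+///
+/// After the rename, the V_MUL no longer carries an anti-dependency on the
+/// last use of value A, so the post-RA scheduler is free to hoist it.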
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUHotBlockRegisterRenaming.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include <queue>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-hot-block-reg-renaming"
+
+STATISTIC(NumBlocksProcessed, "Number of hot blocks processed");
+STATISTIC(NumValuesRemapped, "Number of values remapped to reduce density");
+STATISTIC(NumBlocksSkipped,
+          "Number of blocks skipped (no dense regs or no free regs)");
+STATISTIC(NumNonKernelsSkipped,
+          "Number of non-kernel functions skipped for safety");
+
+namespace {
+
+class AMDGPUHotBlockRegisterRenamingImpl {
+public:
+  AMDGPUHotBlockRegisterRenamingImpl(VirtRegMap *VRM, LiveRegMatrix *LRM,
+                                     LiveIntervals *LIS,
+                                     MachineBlockFrequencyInfo *MBFI,
+                                     const GCNSubtarget *ST,
+                                     const SIMachineFunctionInfo &MFI)
+      : VRM(VRM), LRM(LRM), LIS(LIS), MBFI(MBFI), ST(ST), MFI(MFI) {}
+
+  bool run(MachineFunction &MF);
+
+private:
+  VirtRegMap *VRM;
+  LiveRegMatrix *LRM;
+  LiveIntervals *LIS;
+  MachineBlockFrequencyInfo *MBFI;
+  const GCNSubtarget *ST;
+  const SIMachineFunctionInfo &MFI;
+  const SIRegisterInfo *TRI = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  unsigned VGPRLimit = 0; // Register limit based on occupancy.
+
+  /// Cache of VirtRegs that cannot be moved (e.g. tied operands).
+  DenseSet<Register> UnmovableVRegs;
+
+  /// Process a single basic block.
+  bool processBasicBlock(MachineBasicBlock *MBB);
+
+  /// Calculate the value density map for a basic block.
+  void calculateValueDensity(MachineBasicBlock *MBB,
+                             DenseMap<MCRegister, unsigned> &ValueDensity);
+
+  /// Find free physical registers in a basic block.
+  void findFreeRegisters(MachineBasicBlock *MBB,
+                         SmallVectorImpl<MCRegister> &FreeRegs);
+
+  /// Check if a segment is local to a basic block.
+  bool isLocalSegment(const LiveInterval::Segment &Seg, SlotIndex BBStart,
+                      SlotIndex BBEnd) const;
+
+  /// Check if a register is suitable for our optimization.
+  bool isSuitableRegister(MCRegister PhysReg) const;
+
+  /// Check if a virtual register can be safely moved.
+  bool isVirtRegMovable(Register VirtReg, MCRegister CurrentPhysReg,
+                        MCRegister TargetPhysReg, SlotIndex BBStart,
+                        SlotIndex BBEnd);
+
+  /// Try to move a value from DenseReg to FreeReg.
+  bool tryMoveValue(MCRegister DenseReg, MCRegister FreeReg,
+                    MachineBasicBlock *MBB, SlotIndex BBStart, SlotIndex BBEnd,
+                    const DenseMap<MCRegister, SmallVector<SlotIndex, 4>>
+                        &PhysRegDefs);
+};
+
+class AMDGPUHotBlockRegisterRenamingLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUHotBlockRegisterRenamingLegacy() : MachineFunctionPass(ID) {
+    initializeAMDGPUHotBlockRegisterRenamingLegacyPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Hot Block Register Renaming";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervalsWrapperPass>();
+    AU.addRequired<VirtRegMapWrapperLegacy>();
+    AU.addRequired<LiveRegMatrixWrapperLegacy>();
+    AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+char AMDGPUHotBlockRegisterRenamingLegacy::ID = 0;
+
+char &llvm::AMDGPUHotBlockRegisterRenamingID =
+    AMDGPUHotBlockRegisterRenamingLegacy::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRegisterRenamingLegacy, DEBUG_TYPE,
+                      "AMDGPU Hot Block Register Renaming", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUHotBlockRegisterRenamingLegacy, DEBUG_TYPE,
+                    "AMDGPU Hot Block Register Renaming", false, false)
+
+bool AMDGPUHotBlockRegisterRenamingLegacy::runOnMachineFunction(
+    MachineFunction &MF) {
+  VirtRegMap *VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
+  LiveRegMatrix *LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
+  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+  MachineBlockFrequencyInfo *MBFI =
+      &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
+
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+  AMDGPUHotBlockRegisterRenamingImpl Impl(VRM, LRM, LIS, MBFI, ST, MFI);
+  return Impl.run(MF);
+}
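+
+// Sketch of the per-block worklist run() drives (illustrative numbers): with
+// densities {v3: 4, v5: 2} and free list [v8, v9, v10], processBasicBlock
+// pops v3, moves one value to v8 (density 4 -> 3), re-pushes v3, pops it
+// again for v9, and so on until every register has density <= 1 or the free
+// list is exhausted.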
+
+bool AMDGPUHotBlockRegisterRenamingImpl::run(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "AMDGPUHotBlockRegisterRenaming: Processing "
+                    << MF.getName() << "\n");
+
+  // Fix #0: Skip non-kernel functions to avoid RegMask corruption issues.
+  // A post-RA pass cannot update the RegMask operands on call instructions
+  // in this function's callers, so callers would be left with incorrect
+  // assumptions about which registers this function clobbers.
+  CallingConv::ID CC = MF.getFunction().getCallingConv();
+  if (CC != CallingConv::AMDGPU_KERNEL) {
+    LLVM_DEBUG(dbgs() << "  Skipping non-kernel function (CC=" << CC
+                      << "): cannot safely update callers' RegMasks\n");
+    ++NumNonKernelsSkipped;
+    return false;
+  }
+
+  TRI = ST->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  // Calculate the VGPR limit based on occupancy.
+  unsigned Occupancy = MFI.getOccupancy();
+  VGPRLimit = ST->getMaxNumVGPRs(Occupancy, MFI.getDynamicVGPRBlockSize());
+
+  LLVM_DEBUG(dbgs() << "  Occupancy: " << Occupancy
+                    << ", VGPR Limit: " << VGPRLimit << "\n");
+
+  // Sort basic blocks by frequency (hottest first).
+  SmallVector<MachineBasicBlock *, 16> SortedBBs;
+  for (MachineBasicBlock &MBB : MF) {
+    SortedBBs.push_back(&MBB);
+  }
+
+  llvm::sort(SortedBBs, [this](MachineBasicBlock *A, MachineBasicBlock *B) {
+    return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
+  });
+
+  bool Changed = false;
+  for (MachineBasicBlock *MBB : SortedBBs) {
+    Changed |= processBasicBlock(MBB);
+  }
+
+  return Changed;
+}
+
+bool AMDGPUHotBlockRegisterRenamingImpl::processBasicBlock(
+    MachineBasicBlock *MBB) {
+  LLVM_DEBUG(dbgs() << "  Processing BB#" << MBB->getNumber() << " (freq="
+                    << MBFI->getBlockFreq(MBB).getFrequency() << ")\n");
+
+  // Clear the unmovable cache for each BB (tied operands are BB-specific).
+  UnmovableVRegs.clear();
+
+  SlotIndex BBStart = LIS->getMBBStartIdx(MBB);
+  SlotIndex BBEnd = LIS->getMBBEndIdx(MBB);
+
+  // Step 1: Calculate value density.
+  DenseMap<MCRegister, unsigned> ValueDensity;
+  calculateValueDensity(MBB, ValueDensity);
+
+  if (ValueDensity.empty()) {
+    LLVM_DEBUG(dbgs() << "    No values found, skipping\n");
+    ++NumBlocksSkipped;
+    return false;
+  }
+
+  // Step 2: Find free registers.
+  SmallVector<MCRegister, 8> FreeRegs;
+  findFreeRegisters(MBB, FreeRegs);
+
+  if (FreeRegs.empty()) {
+    LLVM_DEBUG(dbgs() << "    No free registers, skipping\n");
+    ++NumBlocksSkipped;
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "    Found " << ValueDensity.size()
+                    << " registers with values, " << FreeRegs.size()
+                    << " free registers\n");
+
+  // Step 2a: Build the PhysReg definition cache (Fix #1a).
+  // Track all SlotIndexes where each physical register is defined.
+  const TargetRegisterClass *VGPR_32_RC =
+      TRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+  DenseMap<MCRegister, SmallVector<SlotIndex, 4>> PhysRegDefs;
+
+  for (MachineInstr &MI : *MBB) {
+    SlotIndex Idx = LIS->getInstructionIndex(MI);
+    for (const MachineOperand &MO : MI.operands()) {
+      if (MO.isReg() && MO.isDef() && MO.getReg().isPhysical()) {
+        MCRegister PhysReg = MO.getReg();
+        if (VGPR_32_RC->contains(PhysReg)) {
+          PhysRegDefs[PhysReg].push_back(Idx);
+          // Also track superregs for aliasing.
+          for (MCRegister Super : TRI->superregs(PhysReg)) {
+            PhysRegDefs[Super].push_back(Idx);
+          }
+        }
+      }
+    }
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "    Built PhysRegDefs cache: " << PhysRegDefs.size()
+           << " registers have definitions in this BB\n";
+  });
+
+  // Step 3: Create a max-heap of dense registers.
+  auto Comparator = [&ValueDensity](MCRegister A, MCRegister B) {
+    return ValueDensity[A] < ValueDensity[B]; // max heap
+  };
+  std::priority_queue<MCRegister, std::vector<MCRegister>,
+                      decltype(Comparator)>
+      DenseRegsHeap(Comparator);
+
+  for (auto &Entry : ValueDensity) {
+    if (Entry.second > 1) { // Only interested in registers with density > 1.
+      DenseRegsHeap.push(Entry.first);
+    }
+  }
+
+  if (DenseRegsHeap.empty()) {
+    LLVM_DEBUG(
+        dbgs() << "    No dense registers (all density <= 1), skipping\n");
+    ++NumBlocksSkipped;
+    return false;
+  }
+
+  // Step 4: Iteratively move values.
+  bool Changed = false;
+  size_t FreeRegIdx = 0;
+
+  while (!DenseRegsHeap.empty() && FreeRegIdx < FreeRegs.size()) {
+    MCRegister DenseReg = DenseRegsHeap.top();
+    DenseRegsHeap.pop();
+
+    MCRegister FreeReg = FreeRegs[FreeRegIdx++];
+
+    if (tryMoveValue(DenseReg, FreeReg, MBB, BBStart, BBEnd, PhysRegDefs)) {
+      Changed = true;
+      ++NumValuesRemapped;
+
+      // Update the density.
+      ValueDensity[DenseReg]--;
+
+      // If still dense, put it back in the heap.
+      if (ValueDensity[DenseReg] > 1) {
+        DenseRegsHeap.push(DenseReg);
+      }
+    }
+  }
+
+  if (Changed) {
+    ++NumBlocksProcessed;
+  } else {
+    ++NumBlocksSkipped;
+  }
+
+  return Changed;
+}
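+
+// Worked example (hypothetical SlotIndexes): if $vgpr3 hosts three
+// block-local values,
+//   [16r,32r) -> %10,  [48r,64r) -> %11,  [80r,96r) -> %12
+// then ValueDensity[$vgpr3] == 3. A register holding a single local value
+// has density 1 and is never pushed onto the dense-register heap.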
+
+void AMDGPUHotBlockRegisterRenamingImpl::calculateValueDensity(
+    MachineBasicBlock *MBB, DenseMap<MCRegister, unsigned> &ValueDensity) {
+  SlotIndex BBStart = LIS->getMBBStartIdx(MBB);
+  SlotIndex BBEnd = LIS->getMBBEndIdx(MBB);
+
+  // Iterate over the VGPR_32 register class.
+  const TargetRegisterClass *VGPR_32_RC =
+      TRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+
+  for (MCRegister PhysReg : *VGPR_32_RC) {
+    if (MRI->isReserved(PhysReg))
+      continue;
+
+    unsigned LocalValueCount = 0;
+
+    // Access the LiveIntervalUnion for this PhysReg.
+    for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+      LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit];
+
+      for (LiveIntervalUnion::SegmentIter SI = LIU.begin(); SI.valid(); ++SI) {
+        SlotIndex SegStart = SI.start();
+        SlotIndex SegEnd = SI.stop();
+
+        // Check if the segment is entirely within this BB.
+        if (SegStart >= BBStart && SegEnd < BBEnd) {
+          LocalValueCount++;
+        }
+      }
+    }
+
+    if (LocalValueCount > 0) {
+      ValueDensity[PhysReg] = LocalValueCount;
+    }
+  }
+}
+
+void AMDGPUHotBlockRegisterRenamingImpl::findFreeRegisters(
+    MachineBasicBlock *MBB, SmallVectorImpl<MCRegister> &FreeRegs) {
+  SlotIndex BBStart = LIS->getMBBStartIdx(MBB);
+  SlotIndex BBEnd = LIS->getMBBEndIdx(MBB);
+
+  const TargetRegisterClass *VGPR_32_RC =
+      TRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+
+  unsigned RegIdx = 0;
+  for (MCRegister PhysReg : *VGPR_32_RC) {
+    // Only consider registers up to VGPRLimit (based on occupancy).
+    if (RegIdx >= VGPRLimit)
+      break;
+    RegIdx++;
+
+    if (MRI->isReserved(PhysReg))
+      continue;
+
+    bool IsFree = true;
+
+    // Check all register units.
+    for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+      LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit];
+
+      // Check if anything is live in this BB.
+      LiveIntervalUnion::SegmentIter SI = LIU.find(BBStart);
+      if (SI.valid() && SI.start() < BBEnd) {
+        IsFree = false;
+        break;
+      }
+    }
+
+    if (IsFree) {
+      FreeRegs.push_back(PhysReg);
+    }
+  }
+}
+
+bool AMDGPUHotBlockRegisterRenamingImpl::isVirtRegMovable(
+    Register VirtReg, MCRegister CurrentPhysReg, MCRegister TargetPhysReg,
+    SlotIndex BBStart, SlotIndex BBEnd) {
+  LiveInterval &VirtRegLI = LIS->getInterval(VirtReg);
+
+  // Verify the precondition: a single value with a single segment in the BB.
+  unsigned SegmentCount = 0;
+  for (const LiveRange::Segment &S : VirtRegLI) {
+    if (S.start >= BBStart && S.end <= BBEnd)
+      SegmentCount++;
+  }
+
+  // Cannot move registers with multiple segments in the BB (e.g., PHIs).
+  if (SegmentCount != 1) {
+    LLVM_DEBUG(dbgs() << "      Cannot move " << printReg(VirtReg, TRI)
+                      << ": has " << SegmentCount << " segments in BB\n");
+    return false;
+  }
+
+  // Cannot move registers with multiple definitions (e.g., from a PHI merge).
+  if (VirtRegLI.getNumValNums() != 1) {
+    LLVM_DEBUG(dbgs() << "      Cannot move " << printReg(VirtReg, TRI)
+                      << ": has " << VirtRegLI.getNumValNums()
+                      << " value definitions\n");
+    return false;
+  }
+
+  // Check for tied operands. A tied operand means the instruction requires
+  // source and destination to be the same physical register. Moving such a
+  // value would break this constraint.
+  for (const LiveRange::Segment &S : VirtRegLI) {
+    // Only check segments within this BB.
+    if (S.start < BBStart || S.end > BBEnd)
+      continue;
+
+    // Check if this segment starts at a tied def point (meaning it is the
+    // destination of a tied-operand instruction).
+    MachineInstr *DefMI = LIS->getInstructionFromIndex(S.start);
+    if (!DefMI)
+      continue;
+
+    for (unsigned OpIdx = 0, E = DefMI->getNumOperands(); OpIdx < E; ++OpIdx) {
+      const MachineOperand &MO = DefMI->getOperand(OpIdx);
+      if (MO.isReg() && MO.getReg() == VirtReg && MO.isDef() && MO.isTied()) {
+        // Found a tied def - check the source operand it is tied to.
+        unsigned TiedIdx = DefMI->findTiedOperandIdx(OpIdx);
+        const MachineOperand &TiedMO = DefMI->getOperand(TiedIdx);
+
+        // If the tied source is a register, verify it won't conflict.
+        if (TiedMO.isReg()) {
+          Register TiedReg = TiedMO.getReg();
+          if (TiedReg.isVirtual()) {
+            MCRegister TiedPhysReg = VRM->getPhys(TiedReg);
+            // Cannot move if it would violate the tied constraint (source
+            // and dest must be in the same physical register).
+            if (TiedPhysReg != CurrentPhysReg) {
+              LLVM_DEBUG(dbgs()
+                         << "      Cannot move " << printReg(VirtReg, TRI)
+                         << ": tied to " << printReg(TiedReg, TRI)
+                         << " which is in different PhysReg "
+                         << printReg(TiedPhysReg, TRI) << " at " << S.start
+                         << " in " << *DefMI);
+              return false;
+            }
+          }
+        }
+
+        LLVM_DEBUG(dbgs() << "      Cannot move " << printReg(VirtReg, TRI)
+                          << ": has tied def at " << S.start << " in "
+                          << *DefMI);
+        return false;
+      }
+    }
+  }
+
+  // Future checks can be added here:
+  // - Register class constraints
+  // - Special register restrictions
+  // - Architecture-specific constraints
+
+  return true;
+}
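+
+// Example of the hazard the tied-operand check above guards against
+// (hypothetical MIR): V_MAC-style instructions require dst and src2 to share
+// a register,
+//   $vgpr3 = V_MAC_F32_e32 $vgpr0, $vgpr1, $vgpr3(tied)
+// so renaming only the def to a free register would silently break the
+// constraint; such values are cached in UnmovableVRegs and left in place.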
+
+bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(
+    MCRegister DenseReg, MCRegister FreeReg, MachineBasicBlock *MBB,
+    SlotIndex BBStart, SlotIndex BBEnd,
+    const DenseMap<MCRegister, SmallVector<SlotIndex, 4>> &PhysRegDefs) {
+  // Find a movable local value in DenseReg.
+  for (MCRegUnit Unit : TRI->regunits(DenseReg)) {
+    LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit];
+
+    for (LiveIntervalUnion::SegmentIter SI = LIU.begin(); SI.valid(); ++SI) {
+      Register VirtReg = SI.value()->reg();
+
+      // Skip physical registers (LiveIntervalUnion can contain both).
+      if (!VirtReg.isVirtual())
+        continue;
+
+      // Skip virtual registers that haven't been allocated yet.
+      if (!VRM->hasPhys(VirtReg))
+        continue;
+
+      // Check if this VirtReg is mapped to DenseReg.
+      // NOTE: This is NOT redundant! We iterate per register unit, and units
+      // can be shared between aliased registers (e.g., VGPR0 and
+      // VGPR0_VGPR1). This check filters out VirtRegs mapped to aliased
+      // registers.
+      if (VRM->getPhys(VirtReg) != DenseReg)
+        continue;
+
+      // Get the proper LiveInterval from LiveIntervals.
+      LiveInterval &VirtRegLI = LIS->getInterval(VirtReg);
+
+      // Check: the segment is local (entirely within the BB).
+      SlotIndex SegStart = SI.start();
+      SlotIndex SegEnd = SI.stop();
+      if (SegStart < BBStart || SegEnd >= BBEnd)
+        continue;
+
+      // Check: the LiveInterval has only one segment (conservative).
+      if (VirtRegLI.size() != 1)
+        continue;
+
+      // Check: no subranges (conservative - avoid complex cases).
+      if (VirtRegLI.hasSubRanges())
+        continue;
+
+      // Check: no allocation hints.
+      if (VRM->hasKnownPreference(VirtReg))
+        continue;
+
+      // Check: cached unmovable VirtRegs.
+      if (UnmovableVRegs.contains(VirtReg)) {
+        LLVM_DEBUG(dbgs() << "      Skipping " << printReg(VirtReg, TRI)
+                          << " (cached as unmovable)\n");
+        continue;
+      }
+
+      // Check: can this value be safely moved?
+      if (!isVirtRegMovable(VirtReg, DenseReg, FreeReg, BBStart, BBEnd)) {
+        // Cache the result to avoid checking again.
+        UnmovableVRegs.insert(VirtReg);
+        continue;
+      }
+
+      // Fix #1a: Check that FreeReg is not redefined in VirtReg's live range.
+      auto DefIt = PhysRegDefs.find(FreeReg);
+      if (DefIt != PhysRegDefs.end()) {
+        bool HasConflict = false;
+        for (SlotIndex DefIdx : DefIt->second) {
+          // A definition strictly inside the live range (not at an endpoint)
+          // conflicts.
+          if (DefIdx > SegStart && DefIdx < SegEnd) {
+            LLVM_DEBUG(dbgs()
+                       << "      Cannot move to " << printReg(FreeReg, TRI)
+                       << ": redefined at " << DefIdx
+                       << " inside live range [" << SegStart << ", " << SegEnd
+                       << ")\n");
+            HasConflict = true;
+            break;
+          }
+        }
+        if (HasConflict)
+          continue; // Try the next VirtReg.
+      }
+
+      // Fix #1b: Check that FreeReg is not clobbered by any call in the live
+      // range.
+      BitVector UsableRegs;
+      if (LIS->checkRegMaskInterference(VirtRegLI, UsableRegs)) {
+        // checkRegMaskInterference returns true if LI crosses RegMask
+        // instructions; UsableRegs then contains the registers NOT clobbered
+        // by any RegMask.
+        if (!UsableRegs.test(FreeReg)) {
+          LLVM_DEBUG(dbgs()
+                     << "      Cannot move to " << printReg(FreeReg, TRI)
+                     << ": clobbered by call RegMask in live range\n");
+          continue; // Try the next VirtReg.
+        }
+      }
+
+      // This VirtReg is movable! Perform the remap.
+      LLVM_DEBUG(dbgs() << "      Moving " << printReg(VirtReg, TRI)
+                        << " from " << printReg(DenseReg, TRI) << " to "
+                        << printReg(FreeReg, TRI) << "\n");
+
+      // Safety check: must be assigned before unassign.
+      if (!VRM->hasPhys(VirtReg)) {
+        LLVM_DEBUG(dbgs() << "      WARNING: VirtReg not assigned, skipping\n");
+        continue;
+      }
+
+      LRM->unassign(VirtRegLI);      // Remove from LiveRegMatrix.
+      LRM->assign(VirtRegLI, FreeReg); // Assign to the new physreg (this
+                                       // updates VirtRegMap too).
+
+      // Sanity check: verify VirtReg is now mapped to FreeReg.
+      assert(VRM->getPhys(VirtReg) == FreeReg &&
+             "VirtRegMap not updated correctly");
+
+      return true; // Successfully moved one value.
+    }
+  }
+
+  return false; // No movable value found.
+}
+
+PreservedAnalyses
+AMDGPUHotBlockRegisterRenamingPass::run(MachineFunction &MF,
+                                        MachineFunctionAnalysisManager &MFAM) {
+  VirtRegMap *VRM = &MFAM.getResult<VirtRegMapAnalysis>(MF);
+  LiveRegMatrix *LRM = &MFAM.getResult<LiveRegMatrixAnalysis>(MF);
+  LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
+  MachineBlockFrequencyInfo *MBFI =
+      &MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
+
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+  AMDGPUHotBlockRegisterRenamingImpl Impl(VRM, LRM, LIS, MBFI, ST, MFI);
+  if (!Impl.run(MF))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h
new file mode 100644
index 0000000000000..6dfdd1bec72ef
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h
@@ -0,0 +1,34 @@
+//===-- AMDGPUHotBlockRegisterRenaming.h -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Reduces value density in hot basic blocks by remapping local values
+/// from overused physical registers to free physical registers.
+///
+/// This gives the post-RA scheduler more flexibility to reorder instructions
+/// by reducing false dependencies created by register reuse.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUHOTBLOCKREGISTERRENAMING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUHOTBLOCKREGISTERRENAMING_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class AMDGPUHotBlockRegisterRenamingPass
+    : public PassInfoMixin<AMDGPUHotBlockRegisterRenamingPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUHOTBLOCKREGISTERRENAMING_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 5cd9bcf4a96fd..a299b239cd020 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -122,6 +122,7 @@ MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
 MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass())
 MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass())
 MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass())
+MACHINE_FUNCTION_PASS("amdgpu-hot-block-reg-renaming", AMDGPUHotBlockRegisterRenamingPass())
 MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass())
 MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass())
 MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a9a0a4a0cc3e4..d3c2e0642cf9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -21,6 +21,7 @@
 #include "AMDGPUCtorDtorLowering.h"
 #include "AMDGPUExportClustering.h"
 #include "AMDGPUExportKernelRuntimeHandles.h"
+#include "AMDGPUHotBlockRegisterRenaming.h"
 #include "AMDGPUIGroupLP.h"
 #include "AMDGPUISelDAGToDAG.h"
 #include "AMDGPULowerVGPREncoding.h"
@@ -169,13 +170,13 @@ class AMDGPUCodeGenPassBuilder
 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
 public:
   SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
-    : RegisterRegAllocBase(N, D, C) {}
+      : RegisterRegAllocBase(N, D, C) {}
 };

 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
 public:
   VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
-    : RegisterRegAllocBase(N, D, C) {}
+      : RegisterRegAllocBase(N, D, C) {}
 };

 class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
@@ -218,19 +219,21 @@ static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
 static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;

 static SGPRRegisterRegAlloc
-defaultSGPRRegAlloc("default",
-                    "pick SGPR register allocator based on -O option",
-                    useDefaultRegisterAllocator);
+    defaultSGPRRegAlloc("default",
+                        "pick SGPR register allocator based on -O option",
+                        useDefaultRegisterAllocator);

 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
                RegisterPassParser<SGPRRegisterRegAlloc>>
-SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
-             cl::desc("Register allocator to use for SGPRs"));
+    SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
+                 cl::init(&useDefaultRegisterAllocator),
+                 cl::desc("Register allocator to use for SGPRs"));

 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
                RegisterPassParser<VGPRRegisterRegAlloc>>
-VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
-             cl::desc("Register allocator to use for VGPRs"));
+    VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
+                 cl::init(&useDefaultRegisterAllocator),
+                 cl::desc("Register allocator to use for VGPRs"));

 static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
                RegisterPassParser<WWMRegisterRegAlloc>>
@@ -301,22 +304,25 @@ static FunctionPass *createFastWWMRegisterAllocator() {
   return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
 }

-static SGPRRegisterRegAlloc basicRegAllocSGPR(
-    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
-static SGPRRegisterRegAlloc greedyRegAllocSGPR(
-    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
-
-static SGPRRegisterRegAlloc fastRegAllocSGPR(
-    "fast", "fast register allocator", createFastSGPRRegisterAllocator);
+static SGPRRegisterRegAlloc basicRegAllocSGPR("basic",
+                                              "basic register allocator",
+                                              createBasicSGPRRegisterAllocator);
+static SGPRRegisterRegAlloc
+    greedyRegAllocSGPR("greedy", "greedy register allocator",
+                       createGreedySGPRRegisterAllocator);
+static SGPRRegisterRegAlloc fastRegAllocSGPR("fast", "fast register allocator",
+                                             createFastSGPRRegisterAllocator);

-static VGPRRegisterRegAlloc basicRegAllocVGPR(
-    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
-static VGPRRegisterRegAlloc greedyRegAllocVGPR(
-    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc basicRegAllocVGPR("basic",
+                                              "basic register allocator",
+                                              createBasicVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc
+    greedyRegAllocVGPR("greedy", "greedy register allocator",
+                       createGreedyVGPRRegisterAllocator);

-static VGPRRegisterRegAlloc fastRegAllocVGPR(
-    "fast", "fast register allocator", createFastVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc fastRegAllocVGPR("fast", "fast register allocator",
+                                             createFastVGPRRegisterAllocator);

 static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
                                                "basic register allocator",
@@ -334,14 +340,14 @@ static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
 } // anonymous namespace

 static cl::opt<bool>
-EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
-                        cl::desc("Run early if-conversion"),
-                        cl::init(false));
+    EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
+                            cl::desc("Run early if-conversion"),
+                            cl::init(false));

 static cl::opt<bool>
-OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
-                 cl::desc("Run pre-RA exec mask optimizations"),
-                 cl::init(true));
+    OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
+                     cl::desc("Run pre-RA exec mask optimizations"),
+                     cl::init(true));

 static cl::opt<bool>
     LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
@@ -349,32 +355,27 @@ static cl::opt<bool>
                   cl::init(true), cl::Hidden);

 // Option to disable vectorizer for tests.
-static cl::opt<bool> EnableLoadStoreVectorizer(
-    "amdgpu-load-store-vectorizer",
-    cl::desc("Enable load store vectorizer"),
-    cl::init(true),
-    cl::Hidden);
+static cl::opt<bool>
+    EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer",
+                              cl::desc("Enable load store vectorizer"),
+                              cl::init(true), cl::Hidden);

 // Option to control global loads scalarization
-static cl::opt<bool> ScalarizeGlobal(
-    "amdgpu-scalarize-global-loads",
-    cl::desc("Enable global load scalarization"),
-    cl::init(true),
-    cl::Hidden);
+static cl::opt<bool>
+    ScalarizeGlobal("amdgpu-scalarize-global-loads",
+                    cl::desc("Enable global load scalarization"),
+                    cl::init(true), cl::Hidden);

 // Option to run internalize pass.
 static cl::opt<bool> InternalizeSymbols(
-    "amdgpu-internalize-symbols",
-    cl::desc("Enable elimination of non-kernel functions and unused globals"),
-    cl::init(false),
-    cl::Hidden);
+    "amdgpu-internalize-symbols",
+    cl::desc("Enable elimination of non-kernel functions and unused globals"),
+    cl::init(false), cl::Hidden);

 // Option to inline all early.
-static cl::opt<bool> EarlyInlineAll(
-    "amdgpu-early-inline-all",
-    cl::desc("Inline all functions early"),
-    cl::init(false),
-    cl::Hidden);
+static cl::opt<bool> EarlyInlineAll("amdgpu-early-inline-all",
+                                    cl::desc("Inline all functions early"),
+                                    cl::init(false), cl::Hidden);

 static cl::opt<bool> RemoveIncompatibleFunctions(
     "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
@@ -382,39 +383,40 @@ static cl::opt<bool> RemoveIncompatibleFunctions(
         "use features not supported by the target GPU"),
     cl::init(true));

-static cl::opt<bool> EnableSDWAPeephole(
-    "amdgpu-sdwa-peephole",
-    cl::desc("Enable SDWA peepholer"),
-    cl::init(true));
+static cl::opt<bool> EnableSDWAPeephole("amdgpu-sdwa-peephole",
+                                        cl::desc("Enable SDWA peepholer"),
+                                        cl::init(true));

-static cl::opt<bool> EnableDPPCombine(
-    "amdgpu-dpp-combine",
-    cl::desc("Enable DPP combiner"),
-    cl::init(true));
+static cl::opt<bool> EnableDPPCombine("amdgpu-dpp-combine",
+                                      cl::desc("Enable DPP combiner"),
+                                      cl::init(true));

 // Enable address space based alias analysis
-static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
-    cl::desc("Enable AMDGPU Alias Analysis"),
-    cl::init(true));
+static cl::opt<bool>
+    EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
+                              cl::desc("Enable AMDGPU Alias Analysis"),
+                              cl::init(true));

 // Enable lib calls simplifications
-static cl::opt<bool> EnableLibCallSimplify(
-    "amdgpu-simplify-libcall",
-    cl::desc("Enable amdgpu library simplifications"),
-    cl::init(true),
-    cl::Hidden);
+static cl::opt<bool>
+    EnableLibCallSimplify("amdgpu-simplify-libcall",
+                          cl::desc("Enable amdgpu library simplifications"),
+                          cl::init(true), cl::Hidden);

 static cl::opt<bool> EnableLowerKernelArguments(
-    "amdgpu-ir-lower-kernel-arguments",
-    cl::desc("Lower kernel argument loads in IR pass"),
-    cl::init(true),
-    cl::Hidden);
+    "amdgpu-ir-lower-kernel-arguments",
+    cl::desc("Lower kernel argument loads in IR pass"), cl::init(true),
+    cl::Hidden);

 static cl::opt<bool> EnableRegReassign(
-    "amdgpu-reassign-regs",
-    cl::desc("Enable register reassign optimizations on gfx10+"),
-    cl::init(true),
-    cl::Hidden);
+    "amdgpu-reassign-regs",
+    cl::desc("Enable register reassign optimizations on gfx10+"),
+    cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableHotBlockRegRenaming(
+    "amdgpu-enable-hot-block-reg-renaming",
+    cl::desc("Enable hot block register renaming to reduce value density"),
+    cl::init(true), cl::Hidden);

 static cl::opt<bool> OptVGPRLiveRange(
     "amdgpu-opt-vgpr-liverange",
@@ -432,11 +434,10 @@ static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
         clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));

 // Enable Mode register optimization
-static cl::opt<bool> EnableSIModeRegisterPass(
-    "amdgpu-mode-register",
-    cl::desc("Enable mode register pass"),
-    cl::init(true),
-    cl::Hidden);
+static cl::opt<bool>
+    EnableSIModeRegisterPass("amdgpu-mode-register",
+                             cl::desc("Enable mode register pass"),
+                             cl::init(true), cl::Hidden);

 // Enable GFX11+ s_delay_alu insertion
 static cl::opt<bool>
@@ -452,19 +453,16 @@ static cl::opt<bool>

 // Option is used in lit tests to prevent deadcoding of patterns inspected.
 static cl::opt<bool>
-EnableDCEInRA("amdgpu-dce-in-ra",
-    cl::init(true), cl::Hidden,
-    cl::desc("Enable machine DCE inside regalloc"));
+    EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden,
+                  cl::desc("Enable machine DCE inside regalloc"));

 static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                            cl::desc("Adjust wave priority"),
                                            cl::init(false), cl::Hidden);

-static cl::opt<bool> EnableScalarIRPasses(
-    "amdgpu-scalar-ir-passes",
-    cl::desc("Enable scalar IR passes"),
-    cl::init(true),
-    cl::Hidden);
+static cl::opt<bool> EnableScalarIRPasses("amdgpu-scalar-ir-passes",
+                                          cl::desc("Enable scalar IR passes"),
+                                          cl::init(true), cl::Hidden);

 static cl::opt<bool>
     EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
@@ -477,10 +475,10 @@ static cl::opt<bool> EnableLowerModuleLDS(
     cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
     cl::Hidden);

-static cl::opt<bool> EnablePreRAOptimizations(
-    "amdgpu-enable-pre-ra-optimizations",
-    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
-    cl::Hidden);
+static cl::opt<bool>
+    EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations",
+                             cl::desc("Enable Pre-RA optimizations pass"),
+                             cl::init(true), cl::Hidden);

 static cl::opt<bool> EnablePromoteKernelArguments(
     "amdgpu-enable-promote-kernel-arguments",
@@ -507,10 +505,10 @@ static cl::opt<bool> EnableRewritePartialRegUses(
     cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
     cl::Hidden);

-static cl::opt<bool> EnableHipStdPar(
-    "amdgpu-enable-hipstdpar",
-    cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
-    cl::Hidden);
+static cl::opt<bool>
+    EnableHipStdPar("amdgpu-enable-hipstdpar",
+                    cl::desc("Enable HIP Standard Parallelism Offload support"),
+                    cl::init(false), cl::Hidden);

 static cl::opt<bool>
     EnableAMDGPUAttributor("amdgpu-attributor-enable",
@@ -613,6 +611,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
   initializeAMDGPUResourceUsageAnalysisWrapperPassPass(*PR);
   initializeGCNNSAReassignLegacyPass(*PR);
+  initializeAMDGPUHotBlockRegisterRenamingLegacyPass(*PR);
   initializeGCNPreRAOptimizationsLegacyPass(*PR);
   initializeGCNPreRALongBranchRegLegacyPass(*PR);
   initializeGCNRewritePartialRegUsesLegacyPass(*PR);
@@ -634,8 +633,8 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
 static ScheduleDAGInstrs *
 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
-  ScheduleDAGMILive *DAG =
-      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
+  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
+      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
@@ -698,14 +697,13 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) {
   return DAG;
 }

-static MachineSchedRegistry
-SISchedRegistry("si", "Run SI's custom scheduler",
-                createSIMachineScheduler);
+static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler",
+                                            createSIMachineScheduler);

 static MachineSchedRegistry
-GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
-                             "Run GCN scheduler to maximize occupancy",
-                             createGCNMaxOccupancyMachineScheduler);
+    GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
+                                 "Run GCN scheduler to maximize occupancy",
+                                 createGCNMaxOccupancyMachineScheduler);

 static MachineSchedRegistry
     GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
@@ -962,7 +960,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
   PB.registerFullLinkTimeOptimizationLastEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
-
         // Promote kernel arguments to global address space for LLVM IR
         // generated by flang compiler
         FunctionPassManager FPM;
@@ -1395,7 +1392,7 @@ void AMDGPUPassConfig::addIRPasses() {
                                    AAResults &AAR) {
         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUExternalAAWrapper>())
           AAR.addAAResult(WrapperPass->getResult());
-        }));
+      }));
   }

   if (TM.getTargetTriple().isAMDGCN()) {
@@ -1655,6 +1652,10 @@ void GCNPassConfig::addOptimizedRegAlloc() {
 }

 bool GCNPassConfig::addPreRewrite() {
+  // Hot block register renaming to reduce value density.
+  if (TM->getOptLevel() > CodeGenOptLevel::None && EnableHotBlockRegRenaming)
+    addPass(&AMDGPUHotBlockRegisterRenamingID);
+
   if (EnableRegReassign)
     addPass(&GCNNSAReassignID);

@@ -2013,8 +2014,8 @@
                                 AMDGPU::SGPR_32RegClass,
                                 MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
         parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
-                              AMDGPU::SGPR_32RegClass,
-                              MFI->ArgInfo.LDSKernelId, 0, 1) ||
+                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.LDSKernelId,
+                              0, 1) ||
         parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                               AMDGPU::SGPR_32RegClass,
                               MFI->ArgInfo.WorkGroupIDX, 0, 1) ||
@@ -2037,14 +2038,14 @@
                               AMDGPU::SReg_64RegClass,
                               MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
         parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
-                              AMDGPU::VGPR_32RegClass,
-                              MFI->ArgInfo.WorkItemIDX, 0, 0) ||
+                              AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDX,
+                              0, 0) ||
         parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
-                              AMDGPU::VGPR_32RegClass,
-                              MFI->ArgInfo.WorkItemIDY, 0, 0) ||
+                              AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDY,
+                              0, 0) ||
         parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
-                              AMDGPU::VGPR_32RegClass,
-                              MFI->ArgInfo.WorkItemIDZ, 0, 0)))
+                              AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDZ,
+                              0, 0)))
     return true;

   if (ST.hasIEEEMode())
@@ -2251,6 +2252,11 @@ Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
 }

 void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const {
+  // Hot block register renaming to reduce value density.
+  if (TM.getOptLevel() > CodeGenOptLevel::None && EnableHotBlockRegRenaming) {
+    addPass(AMDGPUHotBlockRegisterRenamingPass());
+  }
+
   if (EnableRegReassign) {
     addPass(GCNNSAReassignPass());
   }
@@ -2353,7 +2359,6 @@ Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
   // For allocating per-thread VGPRs.
   addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}));
-
   addPreRewrite(addPass);
   addPass(VirtRegRewriterPass(true));
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 8c0f9d01a7d30..b2d45438021ce 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -63,6 +63,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPUGlobalISelUtils.cpp
+  AMDGPUHotBlockRegisterRenaming.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir b/llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir
new file mode 100644
index 0000000000000..28c9c16f248d1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir
@@ -0,0 +1,146 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-enable-hot-block-reg-renaming -verify-machineinstrs -run-pass=greedy,amdgpu-hot-block-reg-renaming,virtregrewriter -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @test_basic_move() { ret void }
+  define amdgpu_kernel void @test_tied_operand() { ret void }
+  define amdgpu_kernel void @test_no_free_registers() #0 { ret void }
+
+  attributes #0 = { "amdgpu-num-vgpr"="8" }
+...
+
+---
+# Test 1: Verify that we correctly move a value when it is safe to do so.
+# Multiple values are allocated to vgpr3; one should be moved to a free
+# register.
+# CHECK-LABEL: name: test_basic_move
+# CHECK: bb.1:
+# CHECK-NOT: renamable $vgpr3 = V_ADD_F32_e64 0, renamable $vgpr0, 0, renamable $vgpr1
+# CHECK-NOT: renamable $vgpr3 = V_MUL_F32_e64 0, killed renamable $vgpr3, 0, renamable $vgpr2
+# CHECK: renamable $vgpr{{[4-9]|[1-5][0-9]}} = V_MUL_F32_e64
+name: test_basic_move
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr2' }
+  - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 4, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 5, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 6, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 7, class: vgpr_32, preferred-register: '$vgpr3' }
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr2
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    ; High value density: %3, %4, %5, %6, %7 all prefer vgpr3.
+    ; Some should be moved to free registers.
+    %3:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %5:vgpr_32 = V_MUL_F32_e64 0, %3, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %6:vgpr_32 = V_MUL_F32_e64 0, %3, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %7:vgpr_32 = V_FMA_F32_e64 0, %3, 0, %2, 0, %4, 0, 0, implicit $mode, implicit $exec
+    S_CBRANCH_EXECNZ %bb.1, implicit $exec

+  bb.2:
+    S_ENDPGM 0, implicit %7
+
+...
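+# Note: every value checked above is block-local to bb.1 (defined and last
+# used inside the loop body), which is the only kind of value the pass will
+# move; values live across the backedge, like %0-%2, are never candidates.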
+---
+# Test 2: Verify that we do NOT move values with tied operands.
+# V_MAC_F32 has a tied def-use, so it should not be moved.
+# CHECK-LABEL: name: test_tied_operand
+# CHECK: bb.1:
+# CHECK: renamable $vgpr3 = V_ADD_F32_e32
+# CHECK-NEXT: renamable $vgpr3 = V_MAC_F32_e32
+# CHECK-NEXT: renamable $vgpr3 = V_MUL_F32_e64
+name: test_tied_operand
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 4, class: vgpr_32, preferred-register: '$vgpr3' }
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    ; %2 and %3 both prefer vgpr3, but %3 has a tied operand.
+    %2:vgpr_32 = V_ADD_F32_e32 %1, %0, implicit $mode, implicit $exec
+    %3:vgpr_32 = V_MAC_F32_e32 %0, %1, %2, implicit $mode, implicit $exec
+    ; The pass should NOT move %3 because it has a tied def operand.
+    %4:vgpr_32 = V_MUL_F32_e64 0, %3, 0, %0, 0, 0, implicit $mode, implicit $exec
+    S_CBRANCH_EXECNZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0, implicit %4
+
+...
+---
+# Test 3: Verify that we do NOT move when there are no free registers.
+# With only 8 VGPRs available and all in use, no moves should happen.
+# CHECK-LABEL: name: test_no_free_registers
+# CHECK: bb.1:
+# CHECK: renamable $vgpr3 = V_ADD_F32_e64
+# CHECK: renamable $vgpr3 = V_MUL_F32_e64
+name: test_no_free_registers
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr2' }
+  - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 4, class: vgpr_32, preferred-register: '$vgpr4' }
+  - { id: 5, class: vgpr_32, preferred-register: '$vgpr5' }
+  - { id: 6, class: vgpr_32, preferred-register: '$vgpr6' }
+  - { id: 7, class: vgpr_32, preferred-register: '$vgpr7' }
+  - { id: 8, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 9, class: vgpr_32, preferred-register: '$vgpr3' }
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    ; Fill up vgpr2-7 to leave no free registers.
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    %4:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
+    %5:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
+    %6:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
+    %7:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    ; With all registers occupied, the pass should not move values.
+    %8:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %9:vgpr_32 = V_MUL_F32_e64 0, %8, 0, %1, 0, 0, implicit $mode, implicit $exec
+    S_CBRANCH_EXECNZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %9
+
+...