Revert "[AMDGPU][Scheduler] Refactor ArchVGPR rematerialization during scheduling (#125885)" (#139341)

And related "[AMDGPU] Regenerate mfma-loop.ll test"

Introduces a memory error detected by Asan (#125885).

This reverts commit 382a085a95.
This reverts commit 067caaafb5.
Vitaly Buka
2025-05-09 17:51:46 -07:00
committed by GitHub
parent 436504c3b9
commit b35f6e26a5
13 changed files with 679 additions and 4826 deletions

View File

@@ -23,7 +23,6 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/RegisterBank.h"
@@ -586,9 +585,6 @@ public:
/// multiple uses.
bool hasOneNonDBGUser(Register RegNo) const;
/// If the register has a single non-Debug instruction using the specified
/// register, returns it; otherwise returns nullptr.
MachineInstr *getOneNonDBGUser(Register RegNo) const;
/// hasAtMostUses - Return true if the given register has at most \p MaxUsers
/// non-debug user instructions.

View File

@@ -432,11 +432,6 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
return hasSingleElement(use_nodbg_instructions(RegNo));
}
MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
auto RegNoDbgUsers = use_nodbg_instructions(RegNo);
return hasSingleElement(RegNoDbgUsers) ? &*RegNoDbgUsers.begin() : nullptr;
}
bool MachineRegisterInfo::hasAtMostUserInstrs(Register Reg,
unsigned MaxUsers) const {
return hasNItemsOrLess(use_instr_nodbg_begin(Reg), use_instr_nodbg_end(),

View File

@@ -53,20 +53,11 @@ struct GCNRegPressure {
/// UnifiedVGPRFile
unsigned getVGPRNum(bool UnifiedVGPRFile) const {
if (UnifiedVGPRFile) {
return Value[AGPR32] ? getUnifiedVGPRNum(Value[VGPR32], Value[AGPR32])
return Value[AGPR32] ? alignTo(Value[VGPR32], 4) + Value[AGPR32]
: Value[VGPR32] + Value[AGPR32];
}
return std::max(Value[VGPR32], Value[AGPR32]);
}
/// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs
/// and \p NumAGPRs AGPRS, for a target with a unified VGPR file.
inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs,
unsigned NumAGPRs) {
return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
NumAGPRs;
}
/// \returns the ArchVGPR32 pressure
unsigned getArchVGPRNum() const { return Value[VGPR32]; }
/// \returns the AccVGPR32 pressure
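As an aside on the arithmetic above: on subtargets with a unified VGPR file, ArchVGPRs are allocated in whole granules of 4 registers (see getArchVGPRAllocGranule() further down in this diff), so AGPR usage only starts on a granule boundary. The following standalone sketch is not part of this patch and uses illustrative names; it only reproduces that accounting.

// Standalone sketch (not part of this patch; illustrative names) of the
// unified-VGPR accounting above: ArchVGPRs round up to a whole allocation
// granule before AGPRs are counted on top of them.
#include <cstdio>

static unsigned alignToGranule(unsigned N, unsigned Granule) {
  return (N + Granule - 1) / Granule * Granule;
}

static unsigned unifiedVGPRNum(unsigned NumArchVGPRs, unsigned NumAGPRs,
                               unsigned Granule = 4) {
  // No AGPR usage means no alignment penalty.
  return NumAGPRs ? alignToGranule(NumArchVGPRs, Granule) + NumAGPRs
                  : NumArchVGPRs;
}

int main() {
  std::printf("%u\n", unifiedVGPRNum(10, 6)); // 10 -> 12 aligned, +6 = 18
  std::printf("%u\n", unifiedVGPRNum(10, 0)); // 10
  return 0;
}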

View File

@@ -25,13 +25,8 @@
#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "GCNRegPressure.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/ErrorHandling.h"
#define DEBUG_TYPE "machine-scheduler"
@@ -306,11 +301,11 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
HasHighPressure = true;
if (SGPRDelta > VGPRDelta) {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
} else {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
}
}
@@ -323,7 +318,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand,
bool IsBottomUp) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
@@ -419,7 +414,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
/*IsBottomUp=*/false);
assert(TCand.SU == TopCand.SU &&
"Last pick result should correspond to re-picking right now");
"Last pick result should correspond to re-picking right now");
}
#endif
}
@@ -895,13 +890,13 @@ GCNScheduleDAGMILive::getRegionLiveInMap() const {
std::vector<MachineInstr *> RegionFirstMIs;
RegionFirstMIs.reserve(Regions.size());
auto I = Regions.rbegin(), E = Regions.rend();
auto *BB = I->first->getParent();
do {
const MachineBasicBlock *MBB = I->first->getParent();
auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
RegionFirstMIs.push_back(MI);
do {
++I;
} while (I != E && I->first->getParent() == MBB);
} while (I != E && I->first->getParent() == BB);
} while (I != E);
return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
}
@@ -1086,46 +1081,31 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
return true;
}
/// Allows easy filtering of this stage's debug output.
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
// regions in between the defs and the region we sank the def to. Will need to
// be fixed if there is another pass after this pass.
if (!GCNSchedStage::initGCNSchedStage())
return false;
if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
return false;
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
// Rematerialization will not help if occupancy is not limited by reg usage.
if (ST.getOccupancyWithWorkGroupSizes(MF).second == DAG.MinOccupancy)
return false;
// FIXME: This pass will invalidate cached MBBLiveIns for regions
// in between the defs and the region we sank the def to. Cached pressure
// for regions where a def is sunk from will also be invalidated. Will
// need to be fixed if there is another pass after this pass.
assert(!S.hasNextStage());
if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() ||
DAG.Regions.size() == 1)
collectRematerializableInstructions();
if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
return false;
// Before performing any IR modification record the parent region of each MI
// and the parent MBB of each region.
const unsigned NumRegions = DAG.Regions.size();
RegionBB.reserve(NumRegions);
for (unsigned I = 0; I < NumRegions; ++I) {
RegionBoundaries Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI)
MIRegion.insert({&*MI, I});
RegionBB.push_back(Region.first->getParent());
}
if (!canIncreaseOccupancyOrReduceSpill())
return false;
// Rematerialize identified instructions and update scheduler's state.
rematerialize();
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
REMAT_DEBUG(
dbgs() << "Retrying function scheduling with new min. occupancy of "
<< AchievedOcc << " from rematerializing (original was "
<< DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
if (AchievedOcc > DAG.MinOccupancy) {
DAG.MinOccupancy = AchievedOcc;
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
MFI.increaseOccupancy(MF, DAG.MinOccupancy);
}
LLVM_DEBUG(
dbgs() << "Retrying function scheduling with improved occupancy of "
<< DAG.MinOccupancy << " from rematerializing\n");
return true;
}
@@ -1513,7 +1493,8 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
dbgs()
<< "\n\t *** In shouldRevertScheduling ***\n"
<< " *********** BEFORE UnclusteredHighRPStage ***********\n");
ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
ScheduleMetrics MBefore =
getScheduleMetrics(DAG.SUnits);
LLVM_DEBUG(
dbgs()
<< "\n *********** AFTER UnclusteredHighRPStage ***********\n");
@@ -1546,9 +1527,13 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
}
bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
mayCauseSpilling(WavesAfter) ||
(IncreaseOccupancy && WavesAfter < TargetOcc);
if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
return true;
if (mayCauseSpilling(WavesAfter))
return true;
return false;
}
bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
@@ -1698,407 +1683,160 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
return true;
}
namespace {
/// Models excess register pressure in a region and tracks our progress as we
/// identify rematerialization opportunities.
struct ExcessRP {
/// Number of excess ArchVGPRs.
unsigned ArchVGPRs = 0;
/// Number of excess AGPRs.
unsigned AGPRs = 0;
/// For unified register files, number of excess VGPRs.
unsigned VGPRs = 0;
/// For unified register files with AGPR usage, number of excess ArchVGPRs to
/// save before we are able to save a whole allocation granule.
unsigned ArchVGPRsToAlignment = 0;
/// Whether the region uses AGPRs.
bool HasAGPRs = false;
/// Whether the subtarget has a unified RF.
bool UnifiedRF;
/// Constructs the excess RP model; determines the excess pressure w.r.t. a
/// maximum number of allowed VGPRs.
ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxVGPRs);
/// Accounts for \p NumRegs saved ArchVGPRs in the model. If \p
/// UseArchVGPRForAGPRSpill is true, saved ArchVGPRs are used to save excess
/// AGPRs once excess ArchVGPR pressure has been eliminated. Returns whether
/// saving these ArchVGPRs helped reduce excess pressure.
bool saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill);
/// Accounts for \p NumRegs saved AGPRs in the model. Returns whether saving
/// these AGPRs helped reduce excess pressure.
bool saveAGPRs(unsigned NumRegs);
/// Returns whether there is any excess register pressure.
operator bool() const { return ArchVGPRs != 0 || AGPRs != 0 || VGPRs != 0; }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
friend raw_ostream &operator<<(raw_ostream &OS, const ExcessRP &Excess) {
OS << Excess.ArchVGPRs << " ArchVGPRs, " << Excess.AGPRs << " AGPRs, and "
<< Excess.VGPRs << " VGPRs (next ArchVGPR alignment in "
<< Excess.ArchVGPRsToAlignment << " registers)\n";
return OS;
}
#endif
private:
static inline bool saveRegs(unsigned &LeftToSave, unsigned &NumRegs) {
unsigned NumSaved = std::min(LeftToSave, NumRegs);
NumRegs -= NumSaved;
LeftToSave -= NumSaved;
return NumSaved;
}
};
} // namespace
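To make the intended use of the ExcessRP helper easier to follow, here is a minimal, simplified stand-in (non-unified register file only; made-up numbers and hypothetical names, not the class above): the stage builds the model from a region's pressure, subtracts the registers each candidate rematerialization would free, and stops for that region once no excess remains.

// Simplified stand-in for ExcessRP (non-unified register file only; made-up
// values) showing how the stage is meant to consume it.
#include <algorithm>
#include <cstdio>

struct SimpleExcessRP {
  unsigned ArchVGPRs = 0;
  unsigned AGPRs = 0;
  SimpleExcessRP(unsigned NumArchVGPRs, unsigned NumAGPRs, unsigned MaxVGPRs) {
    ArchVGPRs = NumArchVGPRs > MaxVGPRs ? NumArchVGPRs - MaxVGPRs : 0;
    AGPRs = NumAGPRs > MaxVGPRs ? NumAGPRs - MaxVGPRs : 0;
  }
  // True while there is still excess pressure to eliminate.
  explicit operator bool() const { return ArchVGPRs || AGPRs; }
  void saveArchVGPRs(unsigned NumRegs) {
    ArchVGPRs -= std::min(ArchVGPRs, NumRegs);
  }
};

int main() {
  // A region using 70 ArchVGPRs and 10 AGPRs against a 64-register budget has
  // an excess of 6 ArchVGPRs.
  SimpleExcessRP Excess(70, 10, 64);
  const unsigned FreedByCandidate[] = {2, 4, 1};
  for (unsigned Freed : FreedByCandidate) {
    Excess.saveArchVGPRs(Freed);
    if (!Excess)
      break; // enough rematerialization opportunities for this region
  }
  std::printf("remaining excess: %u ArchVGPRs\n", Excess.ArchVGPRs); // 0
  return 0;
}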
ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
unsigned MaxVGPRs)
: UnifiedRF(ST.hasGFX90AInsts()) {
unsigned NumArchVGPRs = RP.getArchVGPRNum();
unsigned NumAGPRs = RP.getAGPRNum();
HasAGPRs = NumAGPRs;
if (!UnifiedRF) {
// Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs
// independently.
if (NumArchVGPRs > MaxVGPRs)
ArchVGPRs = NumArchVGPRs - MaxVGPRs;
if (NumAGPRs > MaxVGPRs)
AGPRs = NumAGPRs - MaxVGPRs;
return;
}
// Independently of whether overall VGPR pressure is under the limit, we still
// have to check whether ArchVGPR pressure or AGPR pressure alone exceeds the
// number of addressable registers in each category.
const unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
if (NumArchVGPRs > MaxArchVGPRs) {
ArchVGPRs = NumArchVGPRs - MaxArchVGPRs;
NumArchVGPRs = MaxArchVGPRs;
}
if (NumAGPRs > MaxArchVGPRs) {
AGPRs = NumAGPRs - MaxArchVGPRs;
NumAGPRs = MaxArchVGPRs;
}
// Check overall VGPR usage against the limit; any excess above addressable
// register limits has already been accounted for.
const unsigned Granule = AMDGPU::IsaInfo::getArchVGPRAllocGranule();
unsigned NumVGPRs = GCNRegPressure::getUnifiedVGPRNum(NumArchVGPRs, NumAGPRs);
if (NumVGPRs > MaxVGPRs) {
VGPRs = NumVGPRs - MaxVGPRs;
ArchVGPRsToAlignment = NumArchVGPRs - alignDown(NumArchVGPRs, Granule);
if (!ArchVGPRsToAlignment)
ArchVGPRsToAlignment = Granule;
}
}
bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
bool Progress = saveRegs(ArchVGPRs, NumRegs);
if (!NumRegs)
return Progress;
if (!UnifiedRF) {
if (UseArchVGPRForAGPRSpill)
Progress |= saveRegs(AGPRs, NumRegs);
} else if (HasAGPRs && (VGPRs || (UseArchVGPRForAGPRSpill && AGPRs))) {
// There is progress as long as there are VGPRs left to save, even if the
// save induced by this particular call does not cross an ArchVGPR alignment
// barrier.
Progress = true;
// ArchVGPRs can only be allocated as a multiple of a granule in unified RF.
unsigned NumSavedRegs = 0;
// Count the number of whole ArchVGPR allocation granules we can save.
const unsigned Granule = AMDGPU::IsaInfo::getArchVGPRAllocGranule();
if (unsigned NumGranules = NumRegs / Granule; NumGranules) {
NumSavedRegs = NumGranules * Granule;
NumRegs -= NumSavedRegs;
}
// We may be able to save one more whole ArchVGPR allocation granule.
if (NumRegs >= ArchVGPRsToAlignment) {
NumSavedRegs += Granule;
ArchVGPRsToAlignment = Granule - (NumRegs - ArchVGPRsToAlignment);
} else {
ArchVGPRsToAlignment -= NumRegs;
}
// Prioritize saving generic VGPRs, then AGPRs if we allow AGPR-to-ArchVGPR
// spilling and have some free ArchVGPR slots.
saveRegs(VGPRs, NumSavedRegs);
if (UseArchVGPRForAGPRSpill)
saveRegs(AGPRs, NumSavedRegs);
} else {
// No AGPR usage in the region i.e., no allocation granule to worry about.
Progress |= saveRegs(VGPRs, NumRegs);
}
return Progress;
}
bool ExcessRP::saveAGPRs(unsigned NumRegs) {
return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
}
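The subtle part of saveArchVGPRs is the whole-granule bookkeeping for unified register files. The sketch below is illustrative only; it fixes the granule at 4 and uses a hypothetical helper to reproduce just that arithmetic.

// Illustrative reproduction (hypothetical helper, granule fixed at 4) of the
// whole-granule accounting in ExcessRP::saveArchVGPRs for unified register
// files: given NumRegs freed ArchVGPRs and the number of ArchVGPRs still
// needed to reach the next granule boundary (ToAlignment), compute how many
// registers' worth of whole granules are released and the new distance.
#include <cstdio>

struct GranuleSave {
  unsigned SavedRegs;
  unsigned NewToAlignment;
};

static GranuleSave saveWholeGranules(unsigned NumRegs, unsigned ToAlignment,
                                     unsigned Granule = 4) {
  unsigned Saved = (NumRegs / Granule) * Granule; // whole granules in NumRegs
  NumRegs -= Saved;
  if (NumRegs >= ToAlignment) {
    // The leftover crosses the next alignment boundary: one more granule.
    Saved += Granule;
    ToAlignment = Granule - (NumRegs - ToAlignment);
  } else {
    ToAlignment -= NumRegs;
  }
  return {Saved, ToAlignment};
}

int main() {
  // Freeing 5 ArchVGPRs when 3 more are needed to reach a boundary releases
  // one whole granule (4 regs); the leftover 1 shrinks the distance to 2.
  GranuleSave R = saveWholeGranules(5, 3);
  std::printf("saved=%u, toAlignment=%u\n", R.SavedRegs, R.NewToAlignment);
  return 0;
}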
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
void PreRARematStage::collectRematerializableInstructions() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
Register Reg = Register::index2VirtReg(I);
if (!DAG.LIS->hasInterval(Reg))
continue;
REMAT_DEBUG({
dbgs() << "Collecting rematerializable instructions in ";
MF.getFunction().printAsOperand(dbgs(), false);
dbgs() << '\n';
});
// TODO: Handle AGPR and SGPR rematerialization
if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
!DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
continue;
// Maps optimizable regions (i.e., regions at minimum and VGPR-limited
// occupancy, or regions with VGPR spilling) to a model of their excess RP.
DenseMap<unsigned, ExcessRP> OptRegions;
const Function &F = MF.getFunction();
MachineOperand *Op = DAG.MRI.getOneDef(Reg);
MachineInstr *Def = Op->getParent();
if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
continue;
std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
const unsigned MaxSGPRsIncOcc =
ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
if (Def->getParent() == UseI->getParent())
continue;
auto ClearOptRegionsIf = [&](bool Cond) -> bool {
if (Cond) {
// We won't try to increase occupancy.
IncreaseOccupancy = false;
OptRegions.clear();
}
return Cond;
};
// Collect optimizable regions. If there is spilling in any region we will
// just try to reduce ArchVGPR spilling. Otherwise we will try to increase
// occupancy by one in the whole function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
GCNRegPressure &RP = DAG.Pressure[I];
// Check whether SGPR pressures prevents us from eliminating spilling.
unsigned NumSGPRs = RP.getSGPRNum();
if (NumSGPRs > MaxSGPRsNoSpill)
ClearOptRegionsIf(IncreaseOccupancy);
ExcessRP Excess(ST, RP, MaxVGPRsNoSpill);
if (Excess) {
ClearOptRegionsIf(IncreaseOccupancy);
} else if (IncreaseOccupancy) {
// Check whether SGPR pressure prevents us from increasing occupancy.
if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
if (DAG.MinOccupancy >= WavesPerEU.first)
return false;
continue;
bool HasRematDependency = false;
// Check if this instruction uses any registers that are planned to be
// rematerialized
for (auto &RematEntry : RematerializableInsts) {
if (find_if(RematEntry.second,
[&Def](std::pair<MachineInstr *, MachineInstr *> &Remat) {
for (MachineOperand &MO : Def->operands()) {
if (!MO.isReg())
continue;
if (MO.getReg() == Remat.first->getOperand(0).getReg())
return true;
}
return false;
}) != RematEntry.second.end()) {
HasRematDependency = true;
break;
}
if ((Excess = ExcessRP(ST, RP, MaxVGPRsIncOcc))) {
// We can only rematerialize ArchVGPRs at this point.
unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs;
bool NotEnoughArchVGPRs = NumArchVGPRsToRemat > RP.getArchVGPRNum();
if (ClearOptRegionsIf(Excess.AGPRs || NotEnoughArchVGPRs)) {
if (DAG.MinOccupancy >= WavesPerEU.first)
return false;
continue;
}
// Do not rematerialize an instruction if it uses an instruction that we
// have designated for rematerialization.
// FIXME: Allow for rematerialization chains: this requires 1. updating
// remat points to account for uses that are rematerialized, and 2. either
// rematerializing the candidates in careful ordering, or deferring the MBB
// RP walk until the entire chain has been rematerialized.
if (HasRematDependency)
continue;
// Similarly, check if the UseI is planned to be remat.
for (auto &RematEntry : RematerializableInsts) {
if (find_if(RematEntry.second,
[&UseI](std::pair<MachineInstr *, MachineInstr *> &Remat) {
return Remat.first == UseI;
}) != RematEntry.second.end()) {
HasRematDependency = true;
break;
}
}
if (HasRematDependency)
break;
// We are only collecting defs that are defined in another block and are
// live-through or used inside regions at MinOccupancy. This means that the
// register must be in the live-in set for the region.
bool AddedToRematList = false;
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
auto It = DAG.LiveIns[I].find(Reg);
if (It != DAG.LiveIns[I].end() && !It->second.none()) {
if (DAG.RegionsWithMinOcc[I]) {
SlotIndex DefIdx = DAG.LIS->getInstructionIndex(*Def);
SlotIndex UseIdx =
DAG.LIS->getInstructionIndex(*UseI).getRegSlot(true);
if (allUsesAvailableAt(Def, DefIdx, UseIdx)) {
RematerializableInsts[I][Def] = UseI;
AddedToRematList = true;
}
}
// Collect regions with rematerializable reg as live-in to avoid
// searching later when updating RP.
RematDefToLiveInRegions[Def].push_back(I);
}
}
if (Excess)
OptRegions.insert({I, Excess});
if (!AddedToRematList)
RematDefToLiveInRegions.erase(Def);
}
if (OptRegions.empty())
return false;
#ifndef NDEBUG
if (IncreaseOccupancy)
REMAT_DEBUG(dbgs() << "Occupancy minimal in regions:\n");
else
REMAT_DEBUG(dbgs() << "Spilling in regions:\n");
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
REMAT_DEBUG(dbgs() << " " << I << ": " << OptIt->getSecond() << '\n');
}
#endif
// When we are reducing spilling, the target is the minimum target number of
// waves/EU determined by the subtarget.
TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : WavesPerEU.first;
// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
// achieve our goal, and sets Progress to true when this particular reduction
// in pressure was helpful toward that goal.
auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask,
bool &Progress) -> bool {
ExcessRP &Excess = OptIt->getSecond();
// We allow saved ArchVGPRs to be considered as free spill slots for AGPRs
// only when we are just trying to eliminate spilling to memory. At this
// point we err on the conservative side and do not increase
// register-to-register spilling for the sake of increasing occupancy.
Progress |=
Excess.saveArchVGPRs(SIRegisterInfo::getNumCoveredRegs(Mask),
/*UseArchVGPRForAGPRSpill=*/!IncreaseOccupancy);
if (!Excess)
OptRegions.erase(OptIt->getFirst());
return OptRegions.empty();
};
// We need up-to-date live-out info. to query live-out register masks in
// regions containing rematerializable instructions.
DAG.RegionLiveOuts.buildLiveRegMap();
// Cache set of registers that are going to be rematerialized.
DenseSet<unsigned> RematRegs;
// Identify rematerializable instructions in the function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
auto Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI) {
// The instruction must be trivially rematerializable.
MachineInstr &DefMI = *MI;
if (!isTriviallyReMaterializable(DefMI))
continue;
// We only support rematerializing virtual VGPRs with one definition.
Register Reg = DefMI.getOperand(0).getReg();
if (!Reg.isVirtual() || !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
!DAG.MRI.hasOneDef(Reg))
continue;
// We only care to rematerialize the instruction if it has a single
// non-debug user in a different region. The using MI may not belong to a
// region if it is a lone region terminator.
MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
if (!UseMI)
continue;
auto UseRegion = MIRegion.find(UseMI);
if (UseRegion != MIRegion.end() && UseRegion->second == I)
continue;
// Do not rematerialize an instruction if it uses or is used by an
// instruction that we have designated for rematerialization.
// FIXME: Allow for rematerialization chains: this requires 1. updating
// remat points to account for uses that are rematerialized, and 2. either
// rematerializing the candidates in careful ordering, or deferring the
// MBB RP walk until the entire chain has been rematerialized.
if (Rematerializations.contains(UseMI) ||
llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
return MO.isReg() && RematRegs.contains(MO.getReg());
}))
continue;
// Do not rematerialize an instruction if it uses registers that aren't
// available at its use. This ensures that we are not extending any live
// range while rematerializing.
SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI);
SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
continue;
REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
RematInstruction &Remat =
Rematerializations.try_emplace(&DefMI, UseMI).first->second;
bool RematUseful = false;
if (auto It = OptRegions.find(I); It != OptRegions.end()) {
// Optimistically consider that moving the instruction out of its
// defining region will reduce RP in the latter; this assumes that
// maximum RP in the region is reached somewhere between the defining
// instruction and the end of the region.
REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
if (ReduceRPInRegion(It, Mask, RematUseful))
return true;
}
for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
// We are only collecting regions in which the register is a live-in
// (and may be live-through).
auto It = DAG.LiveIns[LIRegion].find(Reg);
if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
continue;
Remat.LiveInRegions.insert(LIRegion);
// Account for the reduction in RP due to the rematerialization in an
// optimizable region in which the defined register is a live-in. This
// is exact for live-through regions but optimistic in the using region,
// where RP is actually reduced only if maximum RP is reached somewhere
// between the beginning of the region and the rematerializable
// instruction's use.
if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RematUseful))
return true;
}
}
// If the instruction is not a live-in or live-out in any optimizable
// region then there is no point in rematerializing it.
if (!RematUseful) {
Rematerializations.pop_back();
REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
} else {
RematRegs.insert(Reg);
}
}
}
if (IncreaseOccupancy) {
// We were trying to increase occupancy but failed, abort the stage.
REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
Rematerializations.clear();
return false;
}
REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
return !Rematerializations.empty();
}
void PreRARematStage::rematerialize() {
const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII) {
// Temporary copies of cached variables we will be modifying and replacing if
// sinking succeeds.
SmallVector<
std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
NewRegions;
DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
DenseMap<unsigned, GCNRegPressure> NewPressure;
BitVector NewRescheduleRegions;
LiveIntervals *LIS = DAG.LIS;
// Collect regions whose RP changes in an unpredictable way; we will have to
// fully recompute their RP after all rematerializations.
DenseSet<unsigned> RecomputeRP;
NewRegions.resize(DAG.Regions.size());
NewRescheduleRegions.resize(DAG.Regions.size());
// Rematerialize all instructions.
for (auto &[DefMI, Remat] : Rematerializations) {
MachineBasicBlock::iterator InsertPos(Remat.UseMI);
Register Reg = DefMI->getOperand(0).getReg();
unsigned SubReg = DefMI->getOperand(0).getSubReg();
unsigned DefRegion = MIRegion.at(DefMI);
// Collect only regions that have a rematerializable def as a live-in.
SmallSet<unsigned, 16> ImpactedRegions;
for (const auto &It : RematDefToLiveInRegions)
ImpactedRegions.insert_range(It.second);
// Rematerialize DefMI to its use block.
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
*DAG.TRI);
Remat.RematMI = &*std::prev(InsertPos);
Remat.RematMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
// Make copies of register pressure and live-ins cache that will be updated
// as we rematerialize.
for (auto Idx : ImpactedRegions) {
NewPressure[Idx] = DAG.Pressure[Idx];
NewLiveIns[Idx] = DAG.LiveIns[Idx];
}
NewRegions = DAG.Regions;
NewRescheduleRegions.reset();
// Update region boundaries in regions we sank from (remove defining MI)
// and to (insert MI rematerialized in use block). Only then can we erase
// the original MI.
DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
auto UseRegion = MIRegion.find(Remat.UseMI);
if (UseRegion != MIRegion.end()) {
DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
Remat.RematMI);
DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
bool Improved = false;
for (auto I : ImpactedRegions) {
if (!DAG.RegionsWithMinOcc[I])
continue;
Improved = false;
int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
int SGPRUsage = NewPressure[I].getSGPRNum();
// TODO: Handle occupancy drop due to AGPR and SGPR.
// Check if cause of occupancy drop is due to VGPR usage and not SGPR.
if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
break;
// The occupancy of this region could have been improved by a previous
// iteration's sinking of defs.
if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
NewRescheduleRegions[I] = true;
Improved = true;
continue;
}
DefMI->eraseFromParent();
DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
// Collect all regions impacted by the rematerialization and update their
// live-in/RP information.
for (unsigned I : Remat.LiveInRegions) {
ImpactedRegions.insert({I, DAG.Pressure[I]});
GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
// First check if we have enough trivially rematerializable instructions to
// improve occupancy. Optimistically assume all instructions we are able to
// sink decrease RP.
int TotalSinkableRegs = 0;
for (const auto &It : RematerializableInsts[I]) {
MachineInstr *Def = It.first;
Register DefReg = Def->getOperand(0).getReg();
TotalSinkableRegs +=
SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
#ifdef EXPENSIVE_CHECKS
// All uses are known to be available / live at the remat point. Thus, the
// uses should already be live in to the region.
for (MachineOperand &MO : DefMI->operands()) {
for (MachineOperand &MO : Def->operands()) {
if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
continue;
@@ -2106,12 +1844,13 @@ void PreRARematStage::rematerialize() {
if (!UseReg.isVirtual())
continue;
LiveInterval &LI = DAG.LIS->getInterval(UseReg);
LiveInterval &LI = LIS->getInterval(UseReg);
LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
if (LI.hasSubRanges() && MO.getSubReg())
LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
assert(NewLiveIns[I].contains(UseReg));
LaneBitmask LiveInMask = NewLiveIns[I][UseReg];
LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
// If this register has lanes not covered by the LiveIns, be sure they
// do not map to any subrange. ref:
@@ -2123,64 +1862,126 @@ void PreRARematStage::rematerialize() {
}
}
#endif
// The register is no longer a live-in in all regions but the one that
// contains the single use. In live-through regions, maximum register
// pressure decreases predictably so we can directly update it. In the
// using region, maximum RP may or may not decrease, so we will mark it
// for re-computation after all materializations have taken place.
LaneBitmask PrevMask = RegionLiveIns[Reg];
RegionLiveIns.erase(Reg);
RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
else
RecomputeRP.insert(I);
}
// RP in the region from which the instruction was rematerialized may or may
// not decrease.
ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
RecomputeRP.insert(DefRegion);
int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
// If in the most optimistic scenario, we cannot improve occupancy, then do
// not attempt to sink any instructions.
if (OptimisticOccupancy <= DAG.MinOccupancy)
break;
// Recompute live interval to reflect the register's rematerialization.
Register RematReg = Remat.RematMI->getOperand(0).getReg();
DAG.LIS->removeInterval(RematReg);
DAG.LIS->createAndComputeVirtRegInterval(RematReg);
}
unsigned ImproveOccupancy = 0;
SmallVector<MachineInstr *, 4> SinkedDefs;
for (auto &It : RematerializableInsts[I]) {
MachineInstr *Def = It.first;
MachineBasicBlock::iterator InsertPos =
MachineBasicBlock::iterator(It.second);
Register Reg = Def->getOperand(0).getReg();
// Rematerialize MI to its use block.
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
MachineInstr *NewMI = &*std::prev(InsertPos);
LIS->InsertMachineInstrInMaps(*NewMI);
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
InsertedMIToOldDef[NewMI] = Def;
// All regions impacted by at least one rematerialization must be rescheduled.
// Maximum pressure must also be recomputed for all regions where it changed
// non-predictably and checked against the target occupancy.
AchievedOcc = TargetOcc;
for (auto &[I, OriginalRP] : ImpactedRegions) {
bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
DAG.RescheduleRegions[I] = !IsEmptyRegion;
if (!RecomputeRP.contains(I))
continue;
// Update region boundaries in the scheduling region we sank from since we
// may sink an instruction that was at the beginning or end of its region
DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
/*Removing =*/true);
GCNRegPressure RP;
if (IsEmptyRegion) {
RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
} else {
GCNDownwardRPTracker RPT(*DAG.LIS);
auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
DAG.Regions[I].second);
if (NonDbgMI == DAG.Regions[I].second) {
// Region is non-empty but contains only debug instructions.
RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
} else {
RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
RPT.advance(DAG.Regions[I].second);
RP = RPT.moveMaxPressure();
// Update region boundaries in region we sinked to.
DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
LaneBitmask PrevMask = NewLiveIns[I][Reg];
// FIXME: Also update cached pressure for where the def was sinked from.
// Update RP for all regions that have this reg as a live-in and remove
// the reg from all regions as a live-in.
for (auto Idx : RematDefToLiveInRegions[Def]) {
NewLiveIns[Idx].erase(Reg);
if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
// Def is live-through and not used in this block.
NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
} else {
// Def is used and rematerialized into this block.
GCNDownwardRPTracker RPT(*LIS);
auto *NonDbgMI = &*skipDebugInstructionsForward(
NewRegions[Idx].first, NewRegions[Idx].second);
RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
RPT.advance(NewRegions[Idx].second);
NewPressure[Idx] = RPT.moveMaxPressure();
}
}
SinkedDefs.push_back(Def);
ImproveOccupancy = NewPressure[I].getOccupancy(ST);
if (ImproveOccupancy > DAG.MinOccupancy)
break;
}
DAG.Pressure[I] = RP;
AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(ST));
// Remove defs we just sank from all regions' list of sinkable defs
for (auto &Def : SinkedDefs)
for (auto TrackedIdx : RematDefToLiveInRegions[Def])
RematerializableInsts[TrackedIdx].erase(Def);
if (ImproveOccupancy <= DAG.MinOccupancy)
break;
NewRescheduleRegions[I] = true;
Improved = true;
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
if (!Improved) {
// Occupancy was not improved for all regions that were at MinOccupancy.
// Undo sinking and remove newly rematerialized instructions.
for (auto &Entry : InsertedMIToOldDef) {
MachineInstr *MI = Entry.first;
MachineInstr *OldMI = Entry.second;
Register Reg = MI->getOperand(0).getReg();
LIS->RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
OldMI->clearRegisterDeads(Reg);
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
}
return false;
}
// Occupancy was improved for all regions.
for (auto &Entry : InsertedMIToOldDef) {
MachineInstr *MI = Entry.first;
MachineInstr *OldMI = Entry.second;
// Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
DAG.BBLiveInMap.erase(OldMI);
// Remove OldMI and update LIS
Register Reg = MI->getOperand(0).getReg();
LIS->RemoveMachineInstrFromMaps(*OldMI);
OldMI->eraseFromParent();
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
}
// Update live-ins, register pressure, and regions caches.
for (auto Idx : ImpactedRegions) {
DAG.LiveIns[Idx] = NewLiveIns[Idx];
DAG.Pressure[Idx] = NewPressure[Idx];
DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
}
DAG.Regions = NewRegions;
DAG.RescheduleRegions = NewRescheduleRegions;
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
return true;
}
// Copied from MachineLICM
bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
if (!DAG.TII->isTriviallyReMaterializable(MI))
return false;
@@ -2198,83 +1999,46 @@ bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
return true;
}
void PreRARematStage::finalizeGCNSchedStage() {
// We consider that reducing spilling is always beneficial, so we never
// roll back rematerializations in such cases. It's also possible that
// rescheduling lowers occupancy below the one achieved just through remats, in
// which case we do not want to roll back either (the rescheduling was already
// reverted in PreRARematStage::shouldRevertScheduling in such cases).
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
return;
REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
// Rollback the rematerializations.
for (const auto &[DefMI, Remat] : Rematerializations) {
MachineInstr &RematMI = *Remat.RematMI;
unsigned DefRegion = MIRegion.at(DefMI);
MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
MachineBasicBlock *MBB = RegionBB[DefRegion];
Register Reg = RematMI.getOperand(0).getReg();
unsigned SubReg = RematMI.getOperand(0).getSubReg();
// Re-rematerialize MI at the end of its original region. Note that it may
// not be rematerialized exactly in the same position as originally within
// the region, but it should not matter much.
TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
MachineInstr *NewMI = &*std::prev(InsertPos);
NewMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*NewMI);
auto UseRegion = MIRegion.find(Remat.UseMI);
if (UseRegion != MIRegion.end()) {
DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
nullptr);
}
DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
// Erase rematerialized MI.
RematMI.eraseFromParent();
DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
// Recompute live interval for the re-rematerialized register
DAG.LIS->removeInterval(Reg);
DAG.LIS->createAndComputeVirtRegInterval(Reg);
// Re-add the register as a live-in in all regions it used to be one in.
for (unsigned LIRegion : Remat.LiveInRegions)
DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
}
// Reset RP in all impacted regions.
for (auto &[I, OriginalRP] : ImpactedRegions)
DAG.Pressure[I] = OriginalRP;
GCNSchedStage::finalizeGCNSchedStage();
}
// When removing, we will have to check both beginning and ending of the region.
// When inserting, we will only have to check if we are inserting NewMI in front
// of a scheduling region and do not need to check the ending since we will only
// ever be inserting before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
MachineInstr *NewMI) {
assert(!NewMI ||
NewMI != RegionBounds.second && "cannot remove at region end");
SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>> &RegionBoundaries,
MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
unsigned I = 0, E = RegionBoundaries.size();
// Search for first region of the block where MI is located
while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
++I;
if (RegionBounds.first == RegionBounds.second) {
assert(NewMI && "cannot remove from an empty region");
RegionBounds.first = NewMI;
return;
for (; I != E; ++I) {
if (MI->getParent() != RegionBoundaries[I].first->getParent())
return;
if (Removing && MI == RegionBoundaries[I].first &&
MI == RegionBoundaries[I].second) {
// MI is in a region with size 1; after removing it, the region will have
// size 0, so set RegionBegin and RegionEnd to the end-of-block iterator.
RegionBoundaries[I] =
std::pair(MI->getParent()->end(), MI->getParent()->end());
return;
}
if (MI == RegionBoundaries[I].first) {
if (Removing)
RegionBoundaries[I] =
std::pair(std::next(MI), RegionBoundaries[I].second);
else
// Inserted NewMI in front of region, set new RegionBegin to NewMI
RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
RegionBoundaries[I].second);
return;
}
if (Removing && MI == RegionBoundaries[I].second) {
RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
return;
}
}
// We only care for modifications at the beginning of a non-empty region since
// the upper region boundary is exclusive.
if (MI != RegionBounds.first)
return;
if (!NewMI)
RegionBounds.first = std::next(MI); // Removal
else
RegionBounds.first = NewMI; // Insertion
}
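Both the restored and the removed versions of updateRegionBoundaries maintain the same basic invariants in the common case: a region is a half-open [RegionBegin, RegionEnd) iterator pair, removing the first instruction advances the lower bound, and inserting a new instruction in front of a region makes it the new lower bound. The following simplified model uses a plain std::list instead of a MachineBasicBlock and is purely illustrative of those rules.

// Simplified model (plain std::list instead of MachineBasicBlock; purely
// illustrative) of the region-boundary rules described above.
#include <cassert>
#include <iterator>
#include <list>
#include <utility>

using InstrList = std::list<int>; // stand-in for a block of instructions
using Region = std::pair<InstrList::iterator, InstrList::iterator>;

// Must be called before MI is actually erased from the list.
static void updateOnRemoval(Region &R, InstrList::iterator MI) {
  if (MI == R.first)
    R.first = std::next(MI);
}

// NewMI was just inserted immediately before MI; only the lower bound can
// change since the upper bound is exclusive.
static void updateOnInsertion(Region &R, InstrList::iterator MI,
                              InstrList::iterator NewMI) {
  if (MI == R.first)
    R.first = NewMI;
}

int main() {
  InstrList Block = {10, 20, 30};
  Region R(Block.begin(), Block.end());
  // "Rematerialize" the first instruction elsewhere: fix bounds, then erase.
  updateOnRemoval(R, Block.begin());
  Block.erase(Block.begin());
  assert(*R.first == 20);
  // Insert a new instruction in front of the region.
  auto NewMI = Block.insert(R.first, 5);
  updateOnInsertion(R, std::next(NewMI), NewMI);
  assert(*R.first == 5);
  return 0;
}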
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {

View File

@@ -14,9 +14,7 @@
#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
#include "GCNRegPressure.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
@@ -216,11 +214,6 @@ public:
}
};
/// A region's boundaries i.e. a pair of instruction bundle iterators. The lower
/// boundary is inclusive, the upper boundary is exclusive.
using RegionBoundaries =
std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>;
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class GCNSchedStage;
friend class OccInitialScheduleStage;
@@ -241,7 +234,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
unsigned MinOccupancy;
// Vector of regions recorded for later rescheduling
SmallVector<RegionBoundaries, 32> Regions;
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;
// Records if a region is not yet scheduled, or schedule has been reverted,
// or we generally desire to reschedule it.
@@ -292,13 +286,12 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// Compute and cache live-ins and pressure for all regions in block.
void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);
/// If necessary, updates a region's boundaries following insertion ( \p NewMI
/// != nullptr) or removal ( \p NewMI == nullptr) of a \p MI in the region.
/// For an MI removal, this must be called before the MI is actually erased
/// from its parent MBB.
void updateRegionBoundaries(RegionBoundaries &RegionBounds,
MachineBasicBlock::iterator MI,
MachineInstr *NewMI);
// Update region boundaries when removing MI or inserting NewMI before MI.
void updateRegionBoundaries(
SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>> &RegionBoundaries,
MachineBasicBlock::iterator MI, MachineInstr *NewMI,
bool Removing = false);
void runSchedStages();
@@ -438,73 +431,30 @@ public:
: GCNSchedStage(StageID, DAG) {}
};
/// Attempts to reduce function spilling or, if there is no spilling, to
/// increase function occupancy by one with respect to ArchVGPR usage by sinking
/// trivially rematerializable instructions to their use. When the stage
/// estimates reducing spilling or increasing occupancy is possible, as few
/// instructions as possible are rematerialized to reduce potential negative
/// effects on function latency.
///
/// TODO: We should extend this to work on SGPRs and AGPRs as well.
class PreRARematStage : public GCNSchedStage {
private:
/// Useful information about a rematerializable instruction.
struct RematInstruction {
/// Single use of the rematerializable instruction's defined register,
/// located in a different block.
MachineInstr *UseMI;
/// Rematerialized version of \p DefMI, set in
/// PreRARematStage::rematerialize. Used for reverting rematerializations.
MachineInstr *RematMI;
/// Set of regions in which the rematerializable instruction's defined
/// register is a live-in.
SmallDenseSet<unsigned, 4> LiveInRegions;
// Each region at MinOccupancy will have its own list of trivially
// rematerializable instructions we can remat to reduce RP. The list maps an
// instruction to the position we should remat before, usually the MI using
// the rematerializable instruction.
MapVector<unsigned, MapVector<MachineInstr *, MachineInstr *>>
RematerializableInsts;
RematInstruction(MachineInstr *UseMI) : UseMI(UseMI) {}
};
// Map a trivially rematerializable def to a list of regions at MinOccupancy
// that has the defined reg as a live-in.
MapVector<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions;
/// Maps all MIs to their parent region. MI terminators are considered to be
/// outside the region they delimit, and as such are not stored in the map.
DenseMap<MachineInstr *, unsigned> MIRegion;
/// Parent MBB to each region, in region order.
SmallVector<MachineBasicBlock *> RegionBB;
/// Collects instructions to rematerialize.
MapVector<MachineInstr *, RematInstruction> Rematerializations;
/// Collects regions whose live-ins or register pressure will change due to
/// rematerializations.
DenseMap<unsigned, GCNRegPressure> ImpactedRegions;
/// In case we need to rollback rematerializations, save lane masks for all
/// rematerialized registers in all regions in which they are live-ins.
DenseMap<std::pair<unsigned, Register>, LaneBitmask> RegMasks;
/// Target occupancy the stage estimates is reachable through
/// rematerialization. Greater than or equal to the pre-stage min occupancy.
unsigned TargetOcc;
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
/// Smaller than or equal to the target occupancy.
unsigned AchievedOcc;
/// Whether the stage is attempting to increase occupancy in the absence of
/// spilling.
bool IncreaseOccupancy;
// Collect all trivially rematerializable VGPR instructions with a single def
// and single use outside the defining block into RematerializableInsts.
void collectRematerializableInstructions();
/// Returns whether remat can reduce spilling or increase function occupancy
/// by 1 through rematerialization. If it can do either, collects instructions in
/// PreRARematStage::Rematerializations and sets the target occupancy in
/// PreRARematStage::TargetOccupancy.
bool canIncreaseOccupancyOrReduceSpill();
/// Whether the MI is trivially rematerializable and does not have any virtual
/// register use.
bool isTriviallyReMaterializable(const MachineInstr &MI);
/// Rematerializes all instructions in PreRARematStage::Rematerializations
/// and stores the achieved occupancy after remat in
/// PreRARematStage::AchievedOcc.
void rematerialize();
/// If remat alone did not increase occupancy to the target, rolls back all
/// rematerializations and resets live-ins/RP in all regions impacted by the
/// stage to their pre-stage values.
void finalizeGCNSchedStage() override;
// TODO: Should also attempt to reduce RP of SGPRs and AGPRs
// Attempt to reduce RP of VGPR by sinking trivially rematerializable
// instructions. Returns true if we were able to sink instruction(s).
bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII);
/// \returns true if all the uses in \p InstToRemat defined at \p
/// OriginalIdx are live at \p RematIdx. This only checks liveness of virtual

View File

@@ -466,7 +466,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
getReservedNumSGPRs(MF));
}
unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
static unsigned getMaxNumPreloadedSGPRs() {
using USI = GCNUserSGPRUsageInfo;
// Max number of user SGPRs
const unsigned MaxUserSGPRs =
@@ -497,28 +497,42 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
}
unsigned GCNSubtarget::getBaseMaxNumVGPRs(
const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
const auto &[Min, Max] = NumVGPRBounds;
const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
// Compute maximum number of VGPRs function can use using default/requested
// minimum number of waves per execution unit.
unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
// Check if maximum number of VGPRs was explicitly requested using
// "amdgpu-num-vgpr" attribute.
unsigned Requested =
F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
if (Requested != MaxNumVGPRs) {
if (hasGFX90AInsts())
Requested *= 2;
unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
if (Requested != Max && hasGFX90AInsts())
Requested *= 2;
// Make sure requested value is compatible with values implied by
// default/requested minimum/maximum number of waves per execution unit.
if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
Requested = 0;
if (WavesPerEU.second && Requested &&
Requested < getMinNumVGPRs(WavesPerEU.second))
Requested = 0;
// Make sure requested value is inside the range of possible VGPR usage.
return std::clamp(Requested, Min, Max);
if (Requested)
MaxNumVGPRs = Requested;
}
return MaxNumVGPRs;
}
unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
return getBaseMaxNumVGPRs(
F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)});
return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
return getMaxNumVGPRs(MF.getFunction());
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}
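For reference, the VGPR-limit selection restored in getBaseMaxNumVGPRs above boils down to: start from the maximum implied by the minimum waves/EU, and only honor an explicit "amdgpu-num-vgpr" request if it stays within the bounds implied by the waves/EU range. The sketch below is a simplified, illustrative reconstruction with made-up bounds rather than real subtarget values; "MaxForMinWaves" and "MinForMaxWaves" stand in for getMaxNumVGPRs(WavesPerEU.first) and getMinNumVGPRs(WavesPerEU.second).

// Simplified, illustrative reconstruction of the restored VGPR-limit logic.
#include <cstdio>

static unsigned pickMaxVGPRs(unsigned Requested, bool HasGFX90AInsts,
                             unsigned MaxForMinWaves, unsigned MinForMaxWaves) {
  unsigned MaxNumVGPRs = MaxForMinWaves;
  if (Requested == 0 || Requested == MaxNumVGPRs)
    return MaxNumVGPRs; // no distinct explicit request, keep the default
  if (HasGFX90AInsts)
    Requested *= 2; // the request is doubled on gfx90a-style subtargets
  // Ignore requests incompatible with the waves/EU-implied bounds.
  if (Requested > MaxForMinWaves || Requested < MinForMaxWaves)
    return MaxNumVGPRs;
  return Requested;
}

int main() {
  // Assumed bounds: at most 128 VGPRs at the minimum waves/EU, at least 64
  // required by the maximum waves/EU.
  std::printf("%u\n", pickMaxVGPRs(96, false, 128, 64));  // honored: 96
  std::printf("%u\n", pickMaxVGPRs(256, false, 128, 64)); // ignored: 128
  return 0;
}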
void GCNSubtarget::adjustSchedDependency(

View File

@@ -1505,9 +1505,6 @@ public:
/// \returns Reserved number of SGPRs for given function \p F.
unsigned getReservedNumSGPRs(const Function &F) const;
/// \returns Maximum number of preloaded SGPRs for the subtarget.
unsigned getMaxNumPreloadedSGPRs() const;
/// \returns max num SGPRs. This is the common utility
/// function called by MachineFunction and Function
/// variants of getMaxNumSGPRs.
@@ -1576,10 +1573,8 @@ public:
/// \returns max num VGPRs. This is the common utility function
/// called by MachineFunction and Function variants of getMaxNumVGPRs.
unsigned
getBaseMaxNumVGPRs(const Function &F,
std::pair<unsigned, unsigned> NumVGPRBounds) const;
unsigned getBaseMaxNumVGPRs(const Function &F,
std::pair<unsigned, unsigned> WavesPerEU) const;
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p F, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p F.

View File

@@ -1190,8 +1190,6 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
return IsWave32 ? 8 : 4;
}
unsigned getArchVGPRAllocGranule() { return 4; }
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 512;

View File

@@ -309,10 +309,6 @@ unsigned getVGPREncodingGranule(
const MCSubtargetInfo *STI,
std::optional<bool> EnableWavefrontSize32 = std::nullopt);
/// For subtargets with a unified VGPR file and mixed ArchVGPR/AGPR usage,
/// returns the allocation granule for ArchVGPRs.
unsigned getArchVGPRAllocGranule();
/// \returns Total number of VGPRs for given subtarget \p STI.
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);

View File

@@ -17,7 +17,7 @@ machineFunctionInfo:
isEntryFunction: true
body: |
; DEBUG: Machine code for function sink_and_inc_idx_when_skipping_small_region_1: IsSSA, NoPHIs, TracksLiveness
; DEBUG: [PreRARemat] Retrying function scheduling with new min. occupancy of 10 from rematerializing (original was 9, target was 10)
; DEBUG: Retrying function scheduling with improved occupancy of 10 from rematerializing
; DEBUG-NEXT: ********** MI Scheduling **********
; DEBUG-NEXT: sink_and_inc_idx_when_skipping_small_region_1:%bb.2
; DEBUG-NEXT: From: %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -89,7 +89,7 @@ machineFunctionInfo:
isEntryFunction: true
body: |
; DEBUG: Machine code for function sink_and_inc_idx_when_skipping_small_regions_2: IsSSA, NoPHIs, TracksLiveness
; DEBUG: [PreRARemat] Retrying function scheduling with new min. occupancy of 10 from rematerializing (original was 9, target was 10)
; DEBUG: Retrying function scheduling with improved occupancy of 10 from rematerializing
; DEBUG-NEXT: ********** MI Scheduling **********
; DEBUG-NEXT: sink_and_inc_idx_when_skipping_small_regions_2:%bb.2
; DEBUG-NEXT: From: %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0

View File

@@ -506,8 +506,8 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a3, 0
; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
; GFX908-NEXT: s_mov_b32 s0, 16
; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
; GFX908-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -566,6 +566,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX90A-LABEL: test_mfma_loop_non_splat:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
@@ -599,7 +600,6 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -626,6 +626,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX942-LABEL: test_mfma_loop_non_splat:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
@@ -659,7 +660,6 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1