Revert "[AMDGPU][Scheduler] Refactor ArchVGPR rematerialization during scheduling (#125885)" (#139341)

And related "[AMDGPU] Regenerate mfma-loop.ll test"

Introduces a memory error detected by Asan (#125885).

This reverts commit 382a085a95.
This reverts commit 067caaafb5.
Vitaly Buka
2025-05-09 17:51:46 -07:00
committed by GitHub
parent 436504c3b9
commit b35f6e26a5
13 changed files with 679 additions and 4826 deletions

View File

@@ -23,7 +23,6 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/RegisterBank.h"
@@ -586,9 +585,6 @@ public:
/// multiple uses.
bool hasOneNonDBGUser(Register RegNo) const;
/// If the register has a single non-Debug instruction using the specified
/// register, returns it; otherwise returns nullptr.
MachineInstr *getOneNonDBGUser(Register RegNo) const;
/// hasAtMostUses - Return true if the given register has at most \p MaxUsers
/// non-debug user instructions.

View File

@@ -432,11 +432,6 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
return hasSingleElement(use_nodbg_instructions(RegNo));
}
MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
auto RegNoDbgUsers = use_nodbg_instructions(RegNo);
return hasSingleElement(RegNoDbgUsers) ? &*RegNoDbgUsers.begin() : nullptr;
}
bool MachineRegisterInfo::hasAtMostUserInstrs(Register Reg,
unsigned MaxUsers) const {
return hasNItemsOrLess(use_instr_nodbg_begin(Reg), use_instr_nodbg_end(),

View File

@@ -53,20 +53,11 @@ struct GCNRegPressure {
/// UnifiedVGPRFile
unsigned getVGPRNum(bool UnifiedVGPRFile) const {
if (UnifiedVGPRFile) {
return Value[AGPR32] ? getUnifiedVGPRNum(Value[VGPR32], Value[AGPR32])
return Value[AGPR32] ? alignTo(Value[VGPR32], 4) + Value[AGPR32]
: Value[VGPR32] + Value[AGPR32];
}
return std::max(Value[VGPR32], Value[AGPR32]);
}
/// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs
/// and \p NumAGPRs AGPRS, for a target with a unified VGPR file.
inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs,
unsigned NumAGPRs) {
return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
NumAGPRs;
}
/// \returns the ArchVGPR32 pressure
unsigned getArchVGPRNum() const { return Value[VGPR32]; }
/// \returns the AccVGPR32 pressure
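As an aside on the arithmetic above: on subtargets with a unified VGPR file, ArchVGPRs are allocated in whole granules of 4 registers (see getArchVGPRAllocGranule() further down in this diff), so AGPR usage only starts on a granule boundary. The following standalone sketch is not part of this patch and uses illustrative names; it only reproduces that accounting.

// Standalone sketch (not part of this patch; illustrative names) of the
// unified-VGPR accounting above: ArchVGPRs round up to a whole allocation
// granule before AGPRs are counted on top of them.
#include <cstdio>

static unsigned alignToGranule(unsigned N, unsigned Granule) {
  return (N + Granule - 1) / Granule * Granule;
}

static unsigned unifiedVGPRNum(unsigned NumArchVGPRs, unsigned NumAGPRs,
                               unsigned Granule = 4) {
  // No AGPR usage means no alignment penalty.
  return NumAGPRs ? alignToGranule(NumArchVGPRs, Granule) + NumAGPRs
                  : NumArchVGPRs;
}

int main() {
  std::printf("%u\n", unifiedVGPRNum(10, 6)); // 10 -> 12 aligned, +6 = 18
  std::printf("%u\n", unifiedVGPRNum(10, 0)); // 10
  return 0;
}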

View File

@@ -25,13 +25,8 @@
#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "GCNRegPressure.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/ErrorHandling.h"
#define DEBUG_TYPE "machine-scheduler"
@@ -306,11 +301,11 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
HasHighPressure = true;
if (SGPRDelta > VGPRDelta) {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
} else {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
}
}
@@ -323,7 +318,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand,
bool IsBottomUp) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
@@ -419,7 +414,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
/*IsBottomUp=*/false);
assert(TCand.SU == TopCand.SU &&
"Last pick result should correspond to re-picking right now");
"Last pick result should correspond to re-picking right now");
}
#endif
}
@@ -895,13 +890,13 @@ GCNScheduleDAGMILive::getRegionLiveInMap() const {
std::vector<MachineInstr *> RegionFirstMIs;
RegionFirstMIs.reserve(Regions.size());
auto I = Regions.rbegin(), E = Regions.rend();
auto *BB = I->first->getParent();
do {
const MachineBasicBlock *MBB = I->first->getParent();
auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
RegionFirstMIs.push_back(MI);
do {
++I;
} while (I != E && I->first->getParent() == MBB);
} while (I != E && I->first->getParent() == BB);
} while (I != E);
return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
}
@@ -1086,46 +1081,31 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
return true;
}
/// Allows easy filtering of this stage's debug output.
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
// regions in between the defs and the region we sank the def to. Will need to
// be fixed if there is another pass after this pass.
if (!GCNSchedStage::initGCNSchedStage())
return false;
if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
return false;
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
// Rematerialization will not help if occupancy is not limited by reg usage.
if (ST.getOccupancyWithWorkGroupSizes(MF).second == DAG.MinOccupancy)
return false;
// FIXME: This pass will invalidate cached MBBLiveIns for regions
// in between the defs and the region we sank the def to. Cached pressure
// for regions where a def is sunk from will also be invalidated. Will
// need to be fixed if there is another pass after this pass.
assert(!S.hasNextStage());
if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() ||
DAG.Regions.size() == 1)
collectRematerializableInstructions();
if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
return false;
// Before performing any IR modification record the parent region of each MI
// and the parent MBB of each region.
const unsigned NumRegions = DAG.Regions.size();
RegionBB.reserve(NumRegions);
for (unsigned I = 0; I < NumRegions; ++I) {
RegionBoundaries Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI)
MIRegion.insert({&*MI, I});
RegionBB.push_back(Region.first->getParent());
}
if (!canIncreaseOccupancyOrReduceSpill())
return false;
// Rematerialize identified instructions and update scheduler's state.
rematerialize();
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
REMAT_DEBUG(
dbgs() << "Retrying function scheduling with new min. occupancy of "
<< AchievedOcc << " from rematerializing (original was "
<< DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
if (AchievedOcc > DAG.MinOccupancy) {
DAG.MinOccupancy = AchievedOcc;
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
MFI.increaseOccupancy(MF, DAG.MinOccupancy);
}
LLVM_DEBUG(
dbgs() << "Retrying function scheduling with improved occupancy of "
<< DAG.MinOccupancy << " from rematerializing\n");
return true;
}
@@ -1513,7 +1493,8 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
dbgs()
<< "\n\t *** In shouldRevertScheduling ***\n"
<< " *********** BEFORE UnclusteredHighRPStage ***********\n");
ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
ScheduleMetrics MBefore =
getScheduleMetrics(DAG.SUnits);
LLVM_DEBUG(
dbgs()
<< "\n *********** AFTER UnclusteredHighRPStage ***********\n");
@@ -1546,9 +1527,13 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
}
bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
mayCauseSpilling(WavesAfter) ||
(IncreaseOccupancy && WavesAfter < TargetOcc);
if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
return true;
if (mayCauseSpilling(WavesAfter))
return true;
return false;
}
bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
@@ -1698,407 +1683,160 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
return true;
}
namespace {
/// Models excess register pressure in a region and tracks our progress as we
/// identify rematerialization opportunities.
struct ExcessRP {
/// Number of excess ArchVGPRs.
unsigned ArchVGPRs = 0;
/// Number of excess AGPRs.
unsigned AGPRs = 0;
/// For unified register files, number of excess VGPRs.
unsigned VGPRs = 0;
/// For unified register files with AGPR usage, number of excess ArchVGPRs to
/// save before we are able to save a whole allocation granule.
unsigned ArchVGPRsToAlignment = 0;
/// Whether the region uses AGPRs.
bool HasAGPRs = false;
/// Whether the subtarget has a unified RF.
bool UnifiedRF;
/// Constructs the excess RP model; determines the excess pressure w.r.t. a
/// maximum number of allowed VGPRs.
ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxVGPRs);
/// Accounts for \p NumRegs saved ArchVGPRs in the model. If \p
/// UseArchVGPRForAGPRSpill is true, saved ArchVGPRs are used to save excess
/// AGPRs once excess ArchVGPR pressure has been eliminated. Returns whether
/// saving these ArchVGPRs helped reduce excess pressure.
bool saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill);
/// Accounts for \p NumRegs saved AGPRs in the model. Returns whether saving
/// these AGPRs helped reduce excess pressure.
bool saveAGPRs(unsigned NumRegs);
/// Returns whether there is any excess register pressure.
operator bool() const { return ArchVGPRs != 0 || AGPRs != 0 || VGPRs != 0; }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
friend raw_ostream &operator<<(raw_ostream &OS, const ExcessRP &Excess) {
OS << Excess.ArchVGPRs << " ArchVGPRs, " << Excess.AGPRs << " AGPRs, and "
<< Excess.VGPRs << " VGPRs (next ArchVGPR alignment in "
<< Excess.ArchVGPRsToAlignment << " registers)\n";
return OS;
}
#endif
private:
static inline bool saveRegs(unsigned &LeftToSave, unsigned &NumRegs) {
unsigned NumSaved = std::min(LeftToSave, NumRegs);
NumRegs -= NumSaved;
LeftToSave -= NumSaved;
return NumSaved;
}
};
} // namespace
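To make the intended use of the ExcessRP helper easier to follow, here is a minimal, simplified stand-in (non-unified register file only; made-up numbers and hypothetical names, not the class above): the stage builds the model from a region's pressure, subtracts the registers each candidate rematerialization would free, and stops for that region once no excess remains.

// Simplified stand-in for ExcessRP (non-unified register file only; made-up
// values) showing how the stage is meant to consume it.
#include <algorithm>
#include <cstdio>

struct SimpleExcessRP {
  unsigned ArchVGPRs = 0;
  unsigned AGPRs = 0;
  SimpleExcessRP(unsigned NumArchVGPRs, unsigned NumAGPRs, unsigned MaxVGPRs) {
    ArchVGPRs = NumArchVGPRs > MaxVGPRs ? NumArchVGPRs - MaxVGPRs : 0;
    AGPRs = NumAGPRs > MaxVGPRs ? NumAGPRs - MaxVGPRs : 0;
  }
  // True while there is still excess pressure to eliminate.
  explicit operator bool() const { return ArchVGPRs || AGPRs; }
  void saveArchVGPRs(unsigned NumRegs) {
    ArchVGPRs -= std::min(ArchVGPRs, NumRegs);
  }
};

int main() {
  // A region using 70 ArchVGPRs and 10 AGPRs against a 64-register budget has
  // an excess of 6 ArchVGPRs.
  SimpleExcessRP Excess(70, 10, 64);
  const unsigned FreedByCandidate[] = {2, 4, 1};
  for (unsigned Freed : FreedByCandidate) {
    Excess.saveArchVGPRs(Freed);
    if (!Excess)
      break; // enough rematerialization opportunities for this region
  }
  std::printf("remaining excess: %u ArchVGPRs\n", Excess.ArchVGPRs); // 0
  return 0;
}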
ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
unsigned MaxVGPRs)
: UnifiedRF(ST.hasGFX90AInsts()) {
unsigned NumArchVGPRs = RP.getArchVGPRNum();
unsigned NumAGPRs = RP.getAGPRNum();
HasAGPRs = NumAGPRs;
if (!UnifiedRF) {
// Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs
// independently.
if (NumArchVGPRs > MaxVGPRs)
ArchVGPRs = NumArchVGPRs - MaxVGPRs;
if (NumAGPRs > MaxVGPRs)
AGPRs = NumAGPRs - MaxVGPRs;
return;
}
// Independently of whether overall VGPR pressure is under the limit, we still
// have to check whether ArchVGPR pressure or AGPR pressure alone exceeds the
// number of addressable registers in each category.
const unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
if (NumArchVGPRs > MaxArchVGPRs) {
ArchVGPRs = NumArchVGPRs - MaxArchVGPRs;
NumArchVGPRs = MaxArchVGPRs;
}
if (NumAGPRs > MaxArchVGPRs) {
AGPRs = NumAGPRs - MaxArchVGPRs;
NumAGPRs = MaxArchVGPRs;
}
// Check overall VGPR usage against the limit; any excess above addressable
// register limits has already been accounted for.
const unsigned Granule = AMDGPU::IsaInfo::getArchVGPRAllocGranule();
unsigned NumVGPRs = GCNRegPressure::getUnifiedVGPRNum(NumArchVGPRs, NumAGPRs);
if (NumVGPRs > MaxVGPRs) {
VGPRs = NumVGPRs - MaxVGPRs;
ArchVGPRsToAlignment = NumArchVGPRs - alignDown(NumArchVGPRs, Granule);
if (!ArchVGPRsToAlignment)
ArchVGPRsToAlignment = Granule;
}
}
bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
bool Progress = saveRegs(ArchVGPRs, NumRegs);
if (!NumRegs)
return Progress;
if (!UnifiedRF) {
if (UseArchVGPRForAGPRSpill)
Progress |= saveRegs(AGPRs, NumRegs);
} else if (HasAGPRs && (VGPRs || (UseArchVGPRForAGPRSpill && AGPRs))) {
// There is progress as long as there are VGPRs left to save, even if the
// save induced by this particular call does not cross an ArchVGPR alignment
// barrier.
Progress = true;
// ArchVGPRs can only be allocated as a multiple of a granule in unified RF.
unsigned NumSavedRegs = 0;
// Count the number of whole ArchVGPR allocation granules we can save.
const unsigned Granule = AMDGPU::IsaInfo::getArchVGPRAllocGranule();
if (unsigned NumGranules = NumRegs / Granule; NumGranules) {
NumSavedRegs = NumGranules * Granule;
NumRegs -= NumSavedRegs;
}
// We may be able to save one more whole ArchVGPR allocation granule.
if (NumRegs >= ArchVGPRsToAlignment) {
NumSavedRegs += Granule;
ArchVGPRsToAlignment = Granule - (NumRegs - ArchVGPRsToAlignment);
} else {
ArchVGPRsToAlignment -= NumRegs;
}
// Prioritize saving generic VGPRs, then AGPRs if we allow AGPR-to-ArchVGPR
// spilling and have some free ArchVGPR slots.
saveRegs(VGPRs, NumSavedRegs);
if (UseArchVGPRForAGPRSpill)
saveRegs(AGPRs, NumSavedRegs);
} else {
// No AGPR usage in the region i.e., no allocation granule to worry about.
Progress |= saveRegs(VGPRs, NumRegs);
}
return Progress;
}
bool ExcessRP::saveAGPRs(unsigned NumRegs) {
return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
}
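The subtle part of saveArchVGPRs is the whole-granule bookkeeping for unified register files. The sketch below is illustrative only; it fixes the granule at 4 and uses a hypothetical helper to reproduce just that arithmetic.

// Illustrative reproduction (hypothetical helper, granule fixed at 4) of the
// whole-granule accounting in ExcessRP::saveArchVGPRs for unified register
// files: given NumRegs freed ArchVGPRs and the number of ArchVGPRs still
// needed to reach the next granule boundary (ToAlignment), compute how many
// registers' worth of whole granules are released and the new distance.
#include <cstdio>

struct GranuleSave {
  unsigned SavedRegs;
  unsigned NewToAlignment;
};

static GranuleSave saveWholeGranules(unsigned NumRegs, unsigned ToAlignment,
                                     unsigned Granule = 4) {
  unsigned Saved = (NumRegs / Granule) * Granule; // whole granules in NumRegs
  NumRegs -= Saved;
  if (NumRegs >= ToAlignment) {
    // The leftover crosses the next alignment boundary: one more granule.
    Saved += Granule;
    ToAlignment = Granule - (NumRegs - ToAlignment);
  } else {
    ToAlignment -= NumRegs;
  }
  return {Saved, ToAlignment};
}

int main() {
  // Freeing 5 ArchVGPRs when 3 more are needed to reach a boundary releases
  // one whole granule (4 regs); the leftover 1 shrinks the distance to 2.
  GranuleSave R = saveWholeGranules(5, 3);
  std::printf("saved=%u, toAlignment=%u\n", R.SavedRegs, R.NewToAlignment);
  return 0;
}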
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
void PreRARematStage::collectRematerializableInstructions() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
Register Reg = Register::index2VirtReg(I);
if (!DAG.LIS->hasInterval(Reg))
continue;
REMAT_DEBUG({
dbgs() << "Collecting rematerializable instructions in ";
MF.getFunction().printAsOperand(dbgs(), false);
dbgs() << '\n';
});
// TODO: Handle AGPR and SGPR rematerialization
if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
!DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
continue;
// Maps optimizable regions (i.e., regions at minimum and VGPR-limited
// occupancy, or regions with VGPR spilling) to a model of their excess RP.
DenseMap<unsigned, ExcessRP> OptRegions;
const Function &F = MF.getFunction();
MachineOperand *Op = DAG.MRI.getOneDef(Reg);
MachineInstr *Def = Op->getParent();
if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
continue;
std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
const unsigned MaxSGPRsIncOcc =
ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
if (Def->getParent() == UseI->getParent())
continue;
auto ClearOptRegionsIf = [&](bool Cond) -> bool {
if (Cond) {
// We won't try to increase occupancy.
IncreaseOccupancy = false;
OptRegions.clear();
}
return Cond;
};
// Collect optimizable regions. If there is spilling in any region we will
// just try to reduce ArchVGPR spilling. Otherwise we will try to increase
// occupancy by one in the whole function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
GCNRegPressure &RP = DAG.Pressure[I];
// Check whether SGPR pressures prevents us from eliminating spilling.
unsigned NumSGPRs = RP.getSGPRNum();
if (NumSGPRs > MaxSGPRsNoSpill)
ClearOptRegionsIf(IncreaseOccupancy);
ExcessRP Excess(ST, RP, MaxVGPRsNoSpill);
if (Excess) {
ClearOptRegionsIf(IncreaseOccupancy);
} else if (IncreaseOccupancy) {
// Check whether SGPR pressure prevents us from increasing occupancy.
if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
if (DAG.MinOccupancy >= WavesPerEU.first)
return false;
continue;
bool HasRematDependency = false;
// Check if this instruction uses any registers that are planned to be
// rematerialized
for (auto &RematEntry : RematerializableInsts) {
if (find_if(RematEntry.second,
[&Def](std::pair<MachineInstr *, MachineInstr *> &Remat) {
for (MachineOperand &MO : Def->operands()) {
if (!MO.isReg())
continue;
if (MO.getReg() == Remat.first->getOperand(0).getReg())
return true;
}
return false;
}) != RematEntry.second.end()) {
HasRematDependency = true;
break;
}
if ((Excess = ExcessRP(ST, RP, MaxVGPRsIncOcc))) {
// We can only rematerialize ArchVGPRs at this point.
unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs;
bool NotEnoughArchVGPRs = NumArchVGPRsToRemat > RP.getArchVGPRNum();
if (ClearOptRegionsIf(Excess.AGPRs || NotEnoughArchVGPRs)) {
if (DAG.MinOccupancy >= WavesPerEU.first)
return false;
continue;
}
// Do not rematerialize an instruction if it uses an instruction that we
// have designated for rematerialization.
// FIXME: Allow for rematerialization chains: this requires 1. updating
// remat points to account for uses that are rematerialized, and 2. either
// rematerializing the candidates in careful ordering, or deferring the MBB
// RP walk until the entire chain has been rematerialized.
if (HasRematDependency)
continue;
// Similarly, check if the UseI is planned to be remat.
for (auto &RematEntry : RematerializableInsts) {
if (find_if(RematEntry.second,
[&UseI](std::pair<MachineInstr *, MachineInstr *> &Remat) {
return Remat.first == UseI;
}) != RematEntry.second.end()) {
HasRematDependency = true;
break;
}
}
if (HasRematDependency)
break;
// We are only collecting defs that are defined in another block and are
// live-through or used inside regions at MinOccupancy. This means that the
// register must be in the live-in set for the region.
bool AddedToRematList = false;
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
auto It = DAG.LiveIns[I].find(Reg);
if (It != DAG.LiveIns[I].end() && !It->second.none()) {
if (DAG.RegionsWithMinOcc[I]) {
SlotIndex DefIdx = DAG.LIS->getInstructionIndex(*Def);
SlotIndex UseIdx =
DAG.LIS->getInstructionIndex(*UseI).getRegSlot(true);
if (allUsesAvailableAt(Def, DefIdx, UseIdx)) {
RematerializableInsts[I][Def] = UseI;
AddedToRematList = true;
}
}
// Collect regions with rematerializable reg as live-in to avoid
// searching later when updating RP.
RematDefToLiveInRegions[Def].push_back(I);
}
}
if (Excess)
OptRegions.insert({I, Excess});
if (!AddedToRematList)
RematDefToLiveInRegions.erase(Def);
}
if (OptRegions.empty())
return false;
#ifndef NDEBUG
if (IncreaseOccupancy)
REMAT_DEBUG(dbgs() << "Occupancy minimal in regions:\n");
else
REMAT_DEBUG(dbgs() << "Spilling in regions:\n");
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
REMAT_DEBUG(dbgs() << " " << I << ": " << OptIt->getSecond() << '\n');
}
#endif
// When we are reducing spilling, the target is the minimum target number of
// waves/EU determined by the subtarget.
TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : WavesPerEU.first;
// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
// achieve our goal, and sets Progress to true when this particular reduction
// in pressure was helpful toward that goal.
auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask,
bool &Progress) -> bool {
ExcessRP &Excess = OptIt->getSecond();
// We allow saved ArchVGPRs to be considered as free spill slots for AGPRs
// only when we are just trying to eliminate spilling to memory. At this
// point we err on the conservative side and do not increase
// register-to-register spilling for the sake of increasing occupancy.
Progress |=
Excess.saveArchVGPRs(SIRegisterInfo::getNumCoveredRegs(Mask),
/*UseArchVGPRForAGPRSpill=*/!IncreaseOccupancy);
if (!Excess)
OptRegions.erase(OptIt->getFirst());
return OptRegions.empty();
};
// We need up-to-date live-out info. to query live-out register masks in
// regions containing rematerializable instructions.
DAG.RegionLiveOuts.buildLiveRegMap();
// Cache set of registers that are going to be rematerialized.
DenseSet<unsigned> RematRegs;
// Identify rematerializable instructions in the function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
auto Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI) {
// The instruction must be trivially rematerializable.
MachineInstr &DefMI = *MI;
if (!isTriviallyReMaterializable(DefMI))
continue;
// We only support rematerializing virtual VGPRs with one definition.
Register Reg = DefMI.getOperand(0).getReg();
if (!Reg.isVirtual() || !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
!DAG.MRI.hasOneDef(Reg))
continue;
// We only care to rematerialize the instruction if it has a single
// non-debug user in a different region. The using MI may not belong to a
// region if it is a lone region terminator.
MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
if (!UseMI)
continue;
auto UseRegion = MIRegion.find(UseMI);
if (UseRegion != MIRegion.end() && UseRegion->second == I)
continue;
// Do not rematerialize an instruction if it uses or is used by an
// instruction that we have designated for rematerialization.
// FIXME: Allow for rematerialization chains: this requires 1. updating
// remat points to account for uses that are rematerialized, and 2. either
// rematerializing the candidates in careful ordering, or deferring the
// MBB RP walk until the entire chain has been rematerialized.
if (Rematerializations.contains(UseMI) ||
llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
return MO.isReg() && RematRegs.contains(MO.getReg());
}))
continue;
// Do not rematerialize an instruction if it uses registers that aren't
// available at its use. This ensures that we are not extending any live
// range while rematerializing.
SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI);
SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
continue;
REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
RematInstruction &Remat =
Rematerializations.try_emplace(&DefMI, UseMI).first->second;
bool RematUseful = false;
if (auto It = OptRegions.find(I); It != OptRegions.end()) {
// Optimistically consider that moving the instruction out of its
// defining region will reduce RP in the latter; this assumes that
// maximum RP in the region is reached somewhere between the defining
// instruction and the end of the region.
REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
if (ReduceRPInRegion(It, Mask, RematUseful))
return true;
}
for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
// We are only collecting regions in which the register is a live-in
// (and may be live-through).
auto It = DAG.LiveIns[LIRegion].find(Reg);
if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
continue;
Remat.LiveInRegions.insert(LIRegion);
// Account for the reduction in RP due to the rematerialization in an
// optimizable region in which the defined register is a live-in. This
// is exact for live-through regions but optimistic in the using region,
// where RP is actually reduced only if maximum RP is reached somewhere
// between the beginning of the region and the rematerializable
// instruction's use.
if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RematUseful))
return true;
}
}
// If the instruction is not a live-in or live-out in any optimizable
// region then there is no point in rematerializing it.
if (!RematUseful) {
Rematerializations.pop_back();
REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
} else {
RematRegs.insert(Reg);
}
}
}
if (IncreaseOccupancy) {
// We were trying to increase occupancy but failed, abort the stage.
REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
Rematerializations.clear();
return false;
}
REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
return !Rematerializations.empty();
}
void PreRARematStage::rematerialize() {
const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII) {
// Temporary copies of cached variables we will be modifying and replacing if
// sinking succeeds.
SmallVector<
std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
NewRegions;
DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
DenseMap<unsigned, GCNRegPressure> NewPressure;
BitVector NewRescheduleRegions;
LiveIntervals *LIS = DAG.LIS;
// Collect regions whose RP changes in an unpredictable way; we will have to
// fully recompute their RP after all rematerializations.
DenseSet<unsigned> RecomputeRP;
NewRegions.resize(DAG.Regions.size());
NewRescheduleRegions.resize(DAG.Regions.size());
// Rematerialize all instructions.
for (auto &[DefMI, Remat] : Rematerializations) {
MachineBasicBlock::iterator InsertPos(Remat.UseMI);
Register Reg = DefMI->getOperand(0).getReg();
unsigned SubReg = DefMI->getOperand(0).getSubReg();
unsigned DefRegion = MIRegion.at(DefMI);
// Collect only regions that have a rematerializable def as a live-in.
SmallSet<unsigned, 16> ImpactedRegions;
for (const auto &It : RematDefToLiveInRegions)
ImpactedRegions.insert_range(It.second);
// Rematerialize DefMI to its use block.
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
*DAG.TRI);
Remat.RematMI = &*std::prev(InsertPos);
Remat.RematMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
// Make copies of register pressure and live-ins cache that will be updated
// as we rematerialize.
for (auto Idx : ImpactedRegions) {
NewPressure[Idx] = DAG.Pressure[Idx];
NewLiveIns[Idx] = DAG.LiveIns[Idx];
}
NewRegions = DAG.Regions;
NewRescheduleRegions.reset();
// Update region boundaries in regions we sank from (remove defining MI)
// and to (insert MI rematerialized in use block). Only then can we erase
// the original MI.
DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
auto UseRegion = MIRegion.find(Remat.UseMI);
if (UseRegion != MIRegion.end()) {
DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
Remat.RematMI);
DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
bool Improved = false;
for (auto I : ImpactedRegions) {
if (!DAG.RegionsWithMinOcc[I])
continue;
Improved = false;
int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
int SGPRUsage = NewPressure[I].getSGPRNum();
// TODO: Handle occupancy drop due to AGPR and SGPR.
// Check if cause of occupancy drop is due to VGPR usage and not SGPR.
if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
break;
// The occupancy of this region could have been improved by a previous
// iteration's sinking of defs.
if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
NewRescheduleRegions[I] = true;
Improved = true;
continue;
}
DefMI->eraseFromParent();
DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
// Collect all regions impacted by the rematerialization and update their
// live-in/RP information.
for (unsigned I : Remat.LiveInRegions) {
ImpactedRegions.insert({I, DAG.Pressure[I]});
GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
// First check if we have enough trivially rematerializable instructions to
// improve occupancy. Optimistically assume all instructions we are able to
// sink decrease RP.
int TotalSinkableRegs = 0;
for (const auto &It : RematerializableInsts[I]) {
MachineInstr *Def = It.first;
Register DefReg = Def->getOperand(0).getReg();
TotalSinkableRegs +=
SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
#ifdef EXPENSIVE_CHECKS
// All uses are known to be available / live at the remat point. Thus, the
// uses should already be live in to the region.
for (MachineOperand &MO : DefMI->operands()) {
for (MachineOperand &MO : Def->operands()) {
if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
continue;
@@ -2106,12 +1844,13 @@ void PreRARematStage::rematerialize() {
if (!UseReg.isVirtual())
continue;
LiveInterval &LI = DAG.LIS->getInterval(UseReg);
LiveInterval &LI = LIS->getInterval(UseReg);
LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
if (LI.hasSubRanges() && MO.getSubReg())
LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
assert(NewLiveIns[I].contains(UseReg));
LaneBitmask LiveInMask = NewLiveIns[I][UseReg];
LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
// If this register has lanes not covered by the LiveIns, be sure they
// do not map to any subrange. ref:
@@ -2123,64 +1862,126 @@ void PreRARematStage::rematerialize() {
}
}
#endif
// The register is no longer a live-in in all regions but the one that
// contains the single use. In live-through regions, maximum register
// pressure decreases predictably so we can directly update it. In the
// using region, maximum RP may or may not decrease, so we will mark it
// for re-computation after all materializations have taken place.
LaneBitmask PrevMask = RegionLiveIns[Reg];
RegionLiveIns.erase(Reg);
RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
else
RecomputeRP.insert(I);
}
// RP in the region from which the instruction was rematerialized may or may
// not decrease.
ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
RecomputeRP.insert(DefRegion);
int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
// If in the most optimistic scenario, we cannot improve occupancy, then do
// not attempt to sink any instructions.
if (OptimisticOccupancy <= DAG.MinOccupancy)
break;
// Recompute live interval to reflect the register's rematerialization.
Register RematReg = Remat.RematMI->getOperand(0).getReg();
DAG.LIS->removeInterval(RematReg);
DAG.LIS->createAndComputeVirtRegInterval(RematReg);
}
unsigned ImproveOccupancy = 0;
SmallVector<MachineInstr *, 4> SinkedDefs;
for (auto &It : RematerializableInsts[I]) {
MachineInstr *Def = It.first;
MachineBasicBlock::iterator InsertPos =
MachineBasicBlock::iterator(It.second);
Register Reg = Def->getOperand(0).getReg();
// Rematerialize MI to its use block.
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
MachineInstr *NewMI = &*std::prev(InsertPos);
LIS->InsertMachineInstrInMaps(*NewMI);
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
InsertedMIToOldDef[NewMI] = Def;
// All regions impacted by at least one rematerialization must be rescheduled.
// Maximum pressure must also be recomputed for all regions where it changed
// non-predictably and checked against the target occupancy.
AchievedOcc = TargetOcc;
for (auto &[I, OriginalRP] : ImpactedRegions) {
bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
DAG.RescheduleRegions[I] = !IsEmptyRegion;
if (!RecomputeRP.contains(I))
continue;
// Update region boundaries in the scheduling region we sank from since we
// may sink an instruction that was at the beginning or end of its region
DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
/*Removing =*/true);
GCNRegPressure RP;
if (IsEmptyRegion) {
RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
} else {
GCNDownwardRPTracker RPT(*DAG.LIS);
auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
DAG.Regions[I].second);
if (NonDbgMI == DAG.Regions[I].second) {
// Region is non-empty but contains only debug instructions.
RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
} else {
RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
RPT.advance(DAG.Regions[I].second);
RP = RPT.moveMaxPressure();
// Update region boundaries in region we sinked to.
DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
LaneBitmask PrevMask = NewLiveIns[I][Reg];
// FIXME: Also update cached pressure for where the def was sinked from.
// Update RP for all regions that have this reg as a live-in and remove
// the reg from all regions as a live-in.
for (auto Idx : RematDefToLiveInRegions[Def]) {
NewLiveIns[Idx].erase(Reg);
if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
// Def is live-through and not used in this block.
NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
} else {
// Def is used and rematerialized into this block.
GCNDownwardRPTracker RPT(*LIS);
auto *NonDbgMI = &*skipDebugInstructionsForward(
NewRegions[Idx].first, NewRegions[Idx].second);
RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
RPT.advance(NewRegions[Idx].second);
NewPressure[Idx] = RPT.moveMaxPressure();
}
}
SinkedDefs.push_back(Def);
ImproveOccupancy = NewPressure[I].getOccupancy(ST);
if (ImproveOccupancy > DAG.MinOccupancy)
break;
}
DAG.Pressure[I] = RP;
AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(ST));
// Remove defs we just sank from all regions' list of sinkable defs
for (auto &Def : SinkedDefs)
for (auto TrackedIdx : RematDefToLiveInRegions[Def])
RematerializableInsts[TrackedIdx].erase(Def);
if (ImproveOccupancy <= DAG.MinOccupancy)
break;
NewRescheduleRegions[I] = true;
Improved = true;
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
if (!Improved) {
// Occupancy was not improved for all regions that were at MinOccupancy.
// Undo sinking and remove newly rematerialized instructions.
for (auto &Entry : InsertedMIToOldDef) {
MachineInstr *MI = Entry.first;
MachineInstr *OldMI = Entry.second;
Register Reg = MI->getOperand(0).getReg();
LIS->RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
OldMI->clearRegisterDeads(Reg);
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
}
return false;
}
// Occupancy was improved for all regions.
for (auto &Entry : InsertedMIToOldDef) {
MachineInstr *MI = Entry.first;
MachineInstr *OldMI = Entry.second;
// Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
DAG.BBLiveInMap.erase(OldMI);
// Remove OldMI and update LIS
Register Reg = MI->getOperand(0).getReg();
LIS->RemoveMachineInstrFromMaps(*OldMI);
OldMI->eraseFromParent();
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
}
// Update live-ins, register pressure, and regions caches.
for (auto Idx : ImpactedRegions) {
DAG.LiveIns[Idx] = NewLiveIns[Idx];
DAG.Pressure[Idx] = NewPressure[Idx];
DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
}
DAG.Regions = NewRegions;
DAG.RescheduleRegions = NewRescheduleRegions;
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
return true;
}
// Copied from MachineLICM
bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
if (!DAG.TII->isTriviallyReMaterializable(MI))
return false;
@@ -2198,83 +1999,46 @@ bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
return true;
}
void PreRARematStage::finalizeGCNSchedStage() {
// We consider that reducing spilling is always beneficial, so we never
// roll back rematerializations in such cases. It's also possible that
// rescheduling lowers occupancy below the one achieved just through remats, in
// which case we do not want to roll back either (the rescheduling was already
// reverted in PreRARematStage::shouldRevertScheduling in such cases).
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
return;
REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
// Rollback the rematerializations.
for (const auto &[DefMI, Remat] : Rematerializations) {
MachineInstr &RematMI = *Remat.RematMI;
unsigned DefRegion = MIRegion.at(DefMI);
MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
MachineBasicBlock *MBB = RegionBB[DefRegion];
Register Reg = RematMI.getOperand(0).getReg();
unsigned SubReg = RematMI.getOperand(0).getSubReg();
// Re-rematerialize MI at the end of its original region. Note that it may
// not be rematerialized exactly in the same position as originally within
// the region, but it should not matter much.
TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
MachineInstr *NewMI = &*std::prev(InsertPos);
NewMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*NewMI);
auto UseRegion = MIRegion.find(Remat.UseMI);
if (UseRegion != MIRegion.end()) {
DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
nullptr);
}
DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
// Erase rematerialized MI.
RematMI.eraseFromParent();
DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
// Recompute live interval for the re-rematerialized register
DAG.LIS->removeInterval(Reg);
DAG.LIS->createAndComputeVirtRegInterval(Reg);
// Re-add the register as a live-in in all regions it used to be one in.
for (unsigned LIRegion : Remat.LiveInRegions)
DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
}
// Reset RP in all impacted regions.
for (auto &[I, OriginalRP] : ImpactedRegions)
DAG.Pressure[I] = OriginalRP;
GCNSchedStage::finalizeGCNSchedStage();
}
// When removing, we will have to check both beginning and ending of the region.
// When inserting, we will only have to check if we are inserting NewMI in front
// of a scheduling region and do not need to check the ending since we will only
// ever be inserting before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
MachineInstr *NewMI) {
assert(!NewMI ||
NewMI != RegionBounds.second && "cannot remove at region end");
SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>> &RegionBoundaries,
MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
unsigned I = 0, E = RegionBoundaries.size();
// Search for first region of the block where MI is located
while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
++I;
if (RegionBounds.first == RegionBounds.second) {
assert(NewMI && "cannot remove from an empty region");
RegionBounds.first = NewMI;
return;
for (; I != E; ++I) {
if (MI->getParent() != RegionBoundaries[I].first->getParent())
return;
if (Removing && MI == RegionBoundaries[I].first &&
MI == RegionBoundaries[I].second) {
// MI is in a region with size 1; after removing it, the region will have
// size 0, so set RegionBegin and RegionEnd to the end-of-block iterator.
RegionBoundaries[I] =
std::pair(MI->getParent()->end(), MI->getParent()->end());
return;
}
if (MI == RegionBoundaries[I].first) {
if (Removing)
RegionBoundaries[I] =
std::pair(std::next(MI), RegionBoundaries[I].second);
else
// Inserted NewMI in front of region, set new RegionBegin to NewMI
RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
RegionBoundaries[I].second);
return;
}
if (Removing && MI == RegionBoundaries[I].second) {
RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
return;
}
}
// We only care for modifications at the beginning of a non-empty region since
// the upper region boundary is exclusive.
if (MI != RegionBounds.first)
return;
if (!NewMI)
RegionBounds.first = std::next(MI); // Removal
else
RegionBounds.first = NewMI; // Insertion
}
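Both the restored and the removed versions of updateRegionBoundaries maintain the same basic invariants in the common case: a region is a half-open [RegionBegin, RegionEnd) iterator pair, removing the first instruction advances the lower bound, and inserting a new instruction in front of a region makes it the new lower bound. The following simplified model uses a plain std::list instead of a MachineBasicBlock and is purely illustrative of those rules.

// Simplified model (plain std::list instead of MachineBasicBlock; purely
// illustrative) of the region-boundary rules described above.
#include <cassert>
#include <iterator>
#include <list>
#include <utility>

using InstrList = std::list<int>; // stand-in for a block of instructions
using Region = std::pair<InstrList::iterator, InstrList::iterator>;

// Must be called before MI is actually erased from the list.
static void updateOnRemoval(Region &R, InstrList::iterator MI) {
  if (MI == R.first)
    R.first = std::next(MI);
}

// NewMI was just inserted immediately before MI; only the lower bound can
// change since the upper bound is exclusive.
static void updateOnInsertion(Region &R, InstrList::iterator MI,
                              InstrList::iterator NewMI) {
  if (MI == R.first)
    R.first = NewMI;
}

int main() {
  InstrList Block = {10, 20, 30};
  Region R(Block.begin(), Block.end());
  // "Rematerialize" the first instruction elsewhere: fix bounds, then erase.
  updateOnRemoval(R, Block.begin());
  Block.erase(Block.begin());
  assert(*R.first == 20);
  // Insert a new instruction in front of the region.
  auto NewMI = Block.insert(R.first, 5);
  updateOnInsertion(R, std::next(NewMI), NewMI);
  assert(*R.first == 5);
  return 0;
}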
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {

View File

@@ -14,9 +14,7 @@
#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
#include "GCNRegPressure.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
@@ -216,11 +214,6 @@ public:
}
};
/// A region's boundaries i.e. a pair of instruction bundle iterators. The lower
/// boundary is inclusive, the upper boundary is exclusive.
using RegionBoundaries =
std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>;
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class GCNSchedStage;
friend class OccInitialScheduleStage;
@@ -241,7 +234,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
unsigned MinOccupancy;
// Vector of regions recorded for later rescheduling
SmallVector<RegionBoundaries, 32> Regions;
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;
// Records if a region is not yet scheduled, or schedule has been reverted,
// or we generally desire to reschedule it.
@@ -292,13 +286,12 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// Compute and cache live-ins and pressure for all regions in block.
void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);
/// If necessary, updates a region's boundaries following insertion ( \p NewMI
/// != nullptr) or removal ( \p NewMI == nullptr) of a \p MI in the region.
/// For an MI removal, this must be called before the MI is actually erased
/// from its parent MBB.
void updateRegionBoundaries(RegionBoundaries &RegionBounds,
MachineBasicBlock::iterator MI,
MachineInstr *NewMI);
// Update region boundaries when removing MI or inserting NewMI before MI.
void updateRegionBoundaries(
SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>> &RegionBoundaries,
MachineBasicBlock::iterator MI, MachineInstr *NewMI,
bool Removing = false);
void runSchedStages();
@@ -438,73 +431,30 @@ public:
: GCNSchedStage(StageID, DAG) {}
};
/// Attempts to reduce function spilling or, if there is no spilling, to
/// increase function occupancy by one with respect to ArchVGPR usage by sinking
/// trivially rematerializable instructions to their use. When the stage
/// estimates reducing spilling or increasing occupancy is possible, as few
/// instructions as possible are rematerialized to reduce potential negative
/// effects on function latency.
///
/// TODO: We should extend this to work on SGPRs and AGPRs as well.
class PreRARematStage : public GCNSchedStage {
private:
/// Useful information about a rematerializable instruction.
struct RematInstruction {
/// Single use of the rematerializable instruction's defined register,
/// located in a different block.
MachineInstr *UseMI;
/// Rematerialized version of \p DefMI, set in
/// PreRARematStage::rematerialize. Used for reverting rematerializations.
MachineInstr *RematMI;
/// Set of regions in which the rematerializable instruction's defined
/// register is a live-in.
SmallDenseSet<unsigned, 4> LiveInRegions;
// Each region at MinOccupancy will have its own list of trivially
// rematerializable instructions we can remat to reduce RP. The list maps an
// instruction to the position we should remat before, usually the MI using
// the rematerializable instruction.
MapVector<unsigned, MapVector<MachineInstr *, MachineInstr *>>
RematerializableInsts;
RematInstruction(MachineInstr *UseMI) : UseMI(UseMI) {}
};
// Map a trivially rematerializable def to a list of regions at MinOccupancy
// that has the defined reg as a live-in.
MapVector<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions;
/// Maps all MIs to their parent region. MI terminators are considered to be
/// outside the region they delimit, and as such are not stored in the map.
DenseMap<MachineInstr *, unsigned> MIRegion;
/// Parent MBB to each region, in region order.
SmallVector<MachineBasicBlock *> RegionBB;
/// Collects instructions to rematerialize.
MapVector<MachineInstr *, RematInstruction> Rematerializations;
/// Collects regions whose live-ins or register pressure will change due to
/// rematerializations.
DenseMap<unsigned, GCNRegPressure> ImpactedRegions;
/// In case we need to rollback rematerializations, save lane masks for all
/// rematerialized registers in all regions in which they are live-ins.
DenseMap<std::pair<unsigned, Register>, LaneBitmask> RegMasks;
/// Target occupancy the stage estimates is reachable through
/// rematerialization. Greater than or equal to the pre-stage min occupancy.
unsigned TargetOcc;
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
/// Smaller than or equal to the target occupancy.
unsigned AchievedOcc;
/// Whether the stage is attempting to increase occupancy in the absence of
/// spilling.
bool IncreaseOccupancy;
// Collect all trivially rematerializable VGPR instructions with a single def
// and single use outside the defining block into RematerializableInsts.
void collectRematerializableInstructions();
/// Returns whether remat can reduce spilling or increase function occupancy
/// by 1 through rematerialization. If it can do either, collects instructions in
/// PreRARematStage::Rematerializations and sets the target occupancy in
/// PreRARematStage::TargetOccupancy.
bool canIncreaseOccupancyOrReduceSpill();
/// Whether the MI is trivially rematerializable and does not have any virtual
/// register use.
bool isTriviallyReMaterializable(const MachineInstr &MI);
/// Rematerializes all instructions in PreRARematStage::Rematerializations
/// and stores the achieved occupancy after remat in
/// PreRARematStage::AchievedOcc.
void rematerialize();
/// If remat alone did not increase occupancy to the target, rolls back all
/// rematerializations and resets live-ins/RP in all regions impacted by the
/// stage to their pre-stage values.
void finalizeGCNSchedStage() override;
// TODO: Should also attempt to reduce RP of SGPRs and AGPRs
// Attempt to reduce RP of VGPR by sinking trivially rematerializable
// instructions. Returns true if we were able to sink instruction(s).
bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII);
/// \returns true if all the uses in \p InstToRemat defined at \p
/// OriginalIdx are live at \p RematIdx. This only checks liveness of virtual

View File

@@ -466,7 +466,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
getReservedNumSGPRs(MF));
}
unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
static unsigned getMaxNumPreloadedSGPRs() {
using USI = GCNUserSGPRUsageInfo;
// Max number of user SGPRs
const unsigned MaxUserSGPRs =
@@ -497,28 +497,42 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
}
unsigned GCNSubtarget::getBaseMaxNumVGPRs(
const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
const auto &[Min, Max] = NumVGPRBounds;
const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
// Compute maximum number of VGPRs function can use using default/requested
// minimum number of waves per execution unit.
unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
// Check if maximum number of VGPRs was explicitly requested using
// "amdgpu-num-vgpr" attribute.
unsigned Requested =
F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
if (Requested != MaxNumVGPRs) {
if (hasGFX90AInsts())
Requested *= 2;
unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
if (Requested != Max && hasGFX90AInsts())
Requested *= 2;
// Make sure requested value is compatible with values implied by
// default/requested minimum/maximum number of waves per execution unit.
if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
Requested = 0;
if (WavesPerEU.second && Requested &&
Requested < getMinNumVGPRs(WavesPerEU.second))
Requested = 0;
// Make sure requested value is inside the range of possible VGPR usage.
return std::clamp(Requested, Min, Max);
if (Requested)
MaxNumVGPRs = Requested;
}
return MaxNumVGPRs;
}
unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
return getBaseMaxNumVGPRs(
F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)});
return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
return getMaxNumVGPRs(MF.getFunction());
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}
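For reference, the VGPR-limit selection restored in getBaseMaxNumVGPRs above boils down to: start from the maximum implied by the minimum waves/EU, and only honor an explicit "amdgpu-num-vgpr" request if it stays within the bounds implied by the waves/EU range. The sketch below is a simplified, illustrative reconstruction with made-up bounds rather than real subtarget values; "MaxForMinWaves" and "MinForMaxWaves" stand in for getMaxNumVGPRs(WavesPerEU.first) and getMinNumVGPRs(WavesPerEU.second).

// Simplified, illustrative reconstruction of the restored VGPR-limit logic.
#include <cstdio>

static unsigned pickMaxVGPRs(unsigned Requested, bool HasGFX90AInsts,
                             unsigned MaxForMinWaves, unsigned MinForMaxWaves) {
  unsigned MaxNumVGPRs = MaxForMinWaves;
  if (Requested == 0 || Requested == MaxNumVGPRs)
    return MaxNumVGPRs; // no distinct explicit request, keep the default
  if (HasGFX90AInsts)
    Requested *= 2; // the request is doubled on gfx90a-style subtargets
  // Ignore requests incompatible with the waves/EU-implied bounds.
  if (Requested > MaxForMinWaves || Requested < MinForMaxWaves)
    return MaxNumVGPRs;
  return Requested;
}

int main() {
  // Assumed bounds: at most 128 VGPRs at the minimum waves/EU, at least 64
  // required by the maximum waves/EU.
  std::printf("%u\n", pickMaxVGPRs(96, false, 128, 64));  // honored: 96
  std::printf("%u\n", pickMaxVGPRs(256, false, 128, 64)); // ignored: 128
  return 0;
}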
void GCNSubtarget::adjustSchedDependency(

View File

@@ -1505,9 +1505,6 @@ public:
/// \returns Reserved number of SGPRs for given function \p F.
unsigned getReservedNumSGPRs(const Function &F) const;
/// \returns Maximum number of preloaded SGPRs for the subtarget.
unsigned getMaxNumPreloadedSGPRs() const;
/// \returns max num SGPRs. This is the common utility
/// function called by MachineFunction and Function
/// variants of getMaxNumSGPRs.
@@ -1576,10 +1573,8 @@ public:
/// \returns max num VGPRs. This is the common utility function
/// called by MachineFunction and Function variants of getMaxNumVGPRs.
unsigned
getBaseMaxNumVGPRs(const Function &F,
std::pair<unsigned, unsigned> NumVGPRBounds) const;
unsigned getBaseMaxNumVGPRs(const Function &F,
std::pair<unsigned, unsigned> WavesPerEU) const;
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p F, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p F.

View File

@@ -1190,8 +1190,6 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
return IsWave32 ? 8 : 4;
}
unsigned getArchVGPRAllocGranule() { return 4; }
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 512;

View File

@@ -309,10 +309,6 @@ unsigned getVGPREncodingGranule(
const MCSubtargetInfo *STI,
std::optional<bool> EnableWavefrontSize32 = std::nullopt);
/// For subtargets with a unified VGPR file and mixed ArchVGPR/AGPR usage,
/// returns the allocation granule for ArchVGPRs.
unsigned getArchVGPRAllocGranule();
/// \returns Total number of VGPRs for given subtarget \p STI.
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);

View File

@@ -17,7 +17,7 @@ machineFunctionInfo:
isEntryFunction: true
body: |
; DEBUG: Machine code for function sink_and_inc_idx_when_skipping_small_region_1: IsSSA, NoPHIs, TracksLiveness
; DEBUG: [PreRARemat] Retrying function scheduling with new min. occupancy of 10 from rematerializing (original was 9, target was 10)
; DEBUG: Retrying function scheduling with improved occupancy of 10 from rematerializing
; DEBUG-NEXT: ********** MI Scheduling **********
; DEBUG-NEXT: sink_and_inc_idx_when_skipping_small_region_1:%bb.2
; DEBUG-NEXT: From: %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -89,7 +89,7 @@ machineFunctionInfo:
isEntryFunction: true
body: |
; DEBUG: Machine code for function sink_and_inc_idx_when_skipping_small_regions_2: IsSSA, NoPHIs, TracksLiveness
; DEBUG: [PreRARemat] Retrying function scheduling with new min. occupancy of 10 from rematerializing (original was 9, target was 10)
; DEBUG: Retrying function scheduling with improved occupancy of 10 from rematerializing
; DEBUG-NEXT: ********** MI Scheduling **********
; DEBUG-NEXT: sink_and_inc_idx_when_skipping_small_regions_2:%bb.2
; DEBUG-NEXT: From: %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0

View File

@@ -506,8 +506,8 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a3, 0
; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
; GFX908-NEXT: s_mov_b32 s0, 16
; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
; GFX908-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -566,6 +566,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX90A-LABEL: test_mfma_loop_non_splat:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
@@ -599,7 +600,6 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -626,6 +626,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX942-LABEL: test_mfma_loop_non_splat:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
@@ -659,7 +660,6 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1