Files
clang-p2996/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
Petar Avramovic c07e1e390c AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (#124298)
Record all uses outside cycle with divergent exit during
propagateTemporalDivergence in Uniformity analysis.
With this list of candidates for temporal divergence lowering,
excluding known lane masks from control flow intrinsics,
find sources from inside the cycle that are not i1 and uniform.
Temporal divergence lowering (non i1):
create copy(v_mov) to vgpr, with implicit exec (to stop other
passes from moving this copy outside of the cycle) and use this
vgpr outside of the cycle instead of original uniform source.
2025-03-12 12:09:37 +01:00

296 lines
10 KiB
C++

//===-- AMDGPURegBankSelect.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Assign register banks to all register operands of G_ instructions using
/// machine uniformity analysis.
/// Sgpr - uniform values and some lane masks
/// Vgpr - divergent, non-S1 values
/// Vcc - divergent S1 values (lane masks)
/// However, in some cases G_ instructions with this register bank assignment
/// can't be inst-selected. This is solved in AMDGPURegBankLegalize.
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/InitializePasses.h"
#define DEBUG_TYPE "amdgpu-regbankselect"
using namespace llvm;
using namespace AMDGPU;
namespace {
class AMDGPURegBankSelect : public MachineFunctionPass {
public:
static char ID;
AMDGPURegBankSelect() : MachineFunctionPass(ID) {
initializeAMDGPURegBankSelectPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
return "AMDGPU Register Bank Select";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
AU.addRequired<GISelCSEAnalysisWrapperPass>();
AU.addRequired<MachineUniformityAnalysisPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
// This pass assigns register banks to all virtual registers, and we maintain
// this property in subsequent passes
MachineFunctionProperties getSetProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::RegBankSelected);
}
};
} // End anonymous namespace.
// Pass registration. The pass is registered under DEBUG_TYPE
// ("amdgpu-regbankselect") and the dependencies listed between BEGIN and END
// are the same three analyses requested in getAnalysisUsage.
// NOTE(review): the two trailing `false` arguments are presumably the
// CFG-only and is-analysis flags - confirm against the macro definition.
INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, DEBUG_TYPE,
"AMDGPU Register Bank Select", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPURegBankSelect, DEBUG_TYPE,
"AMDGPU Register Bank Select", false, false)
// Definition of the static pass ID that the constructor passes to
// MachineFunctionPass.
char AMDGPURegBankSelect::ID = 0;
// Reference to the pass ID declared in namespace llvm, so the ID can be named
// even though the pass class itself lives in an anonymous namespace.
char &llvm::AMDGPURegBankSelectID = AMDGPURegBankSelect::ID;
/// Factory for the pass: allocates a new AMDGPURegBankSelect with `new` and
/// returns it as a FunctionPass pointer.
/// NOTE(review): the caller presumably takes ownership of the returned
/// object - confirm against the pass manager that consumes it.
FunctionPass *llvm::createAMDGPURegBankSelectPass() {
return new AMDGPURegBankSelect();
}
namespace {

/// Helper for AMDGPURegBankSelect::runOnMachineFunction.
///
/// Picks the register bank (sgpr, vgpr or vcc) for a virtual register and
/// inserts the COPYs required around virtual registers that already carry a
/// register class.
///
/// The class is only used inside this file, so it lives in an anonymous
/// namespace: internal linkage rules out a clash with a same-named class in
/// another translation unit.
class RegBankSelectHelper {
  MachineIRBuilder &B;
  MachineRegisterInfo &MRI;
  AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
  const MachineUniformityInfo &MUI;
  const SIRegisterInfo &TRI;
  const RegisterBank *SgprRB;
  const RegisterBank *VgprRB;
  const RegisterBank *VccRB;

public:
  /// \param B    builder used to insert COPYs; its MachineRegisterInfo is the
  ///             one all register queries and updates go through.
  /// \param ILMA lane mask analyzer queried for S32/S64 lane masks.
  /// \param MUI  machine uniformity info queried for uniform registers.
  /// \param TRI  register info, used to obtain the exec register.
  /// \param RBI  register bank info the three bank pointers are taken from.
  RegBankSelectHelper(MachineIRBuilder &B,
                      AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
                      const MachineUniformityInfo &MUI,
                      const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
      : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI),
        SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
        VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
        VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

  /// Returns true if \p Reg is defined by a temporal divergence copy, i.e. a
  /// COPY whose only implicit operand is the exec register.
  ///
  /// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside
  /// of the cycle.
  /// Note: uniformity analysis does not consider that registers with vgpr def
  /// are divergent (you can have uniform value in vgpr).
  /// - TODO: implicit use of $exec could be implemented as indicator that
  ///   instruction is divergent
  ///
  /// \p Reg must have a defining instruction.
  bool isTemporalDivergenceCopy(Register Reg) const {
    MachineInstr *MI = MRI.getVRegDef(Reg);
    // getVRegDef yields null for a register without a definition; make the
    // precondition explicit instead of dereferencing null silently.
    assert(MI && "expected virtual register to have a defining instruction");
    if (!MI->isCopy() || MI->getNumImplicitOperands() != 1)
      return false;

    return MI->implicit_operands().begin()->getReg() == TRI.getExec();
  }

  /// Selects the bank for \p Reg:
  /// - sgpr when \p Reg is not a temporal divergence copy and is either
  ///   uniform or an S32/S64 lane mask,
  /// - otherwise vcc when its type is a 1-bit scalar,
  /// - otherwise vgpr.
  const RegisterBank *getRegBankToAssign(Register Reg) const {
    if (!isTemporalDivergenceCopy(Reg) &&
        (MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg)))
      return SgprRB;
    if (MRI.getType(Reg) == LLT::scalar(1))
      return VccRB;
    return VgprRB;
  }

  /// Gives the def operand \p DefOP of \p MI a fresh virtual register in bank
  /// \p RB and copies that register back into the original one right after
  /// \p MI (past any PHIs and labels):
  ///
  ///   %rc:RegClass(s32) = G_ ...
  ///   ...
  ///   %a = G_ ..., %rc
  ///   ->
  ///   %rb:RegBank(s32) = G_ ...
  ///   %rc:RegClass(s32) = COPY %rb
  ///   ...
  ///   %a = G_ ..., %rb
  void reAssignRegBankOnDef(MachineInstr &MI, MachineOperand &DefOP,
                            const RegisterBank *RB) {
    // Register that already has Register class got it during pre-inst
    // selection of another instruction. Maybe cross bank copy was required so
    // we insert a copy that can be removed later. This simplifies post
    // regbanklegalize combiner and avoids need to special case some patterns.
    Register Reg = DefOP.getReg();
    LLT Ty = MRI.getType(Reg);
    Register NewReg = MRI.createVirtualRegister({RB, Ty});
    DefOP.setReg(NewReg);

    auto &MBB = *MI.getParent();
    B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator())));
    B.buildCopy(Reg, NewReg);

    // The problem was discovered for uniform S1 that was used as both
    // lane mask(vcc) and regular sgpr S1.
    // - lane-mask(vcc) use was by si_if, this use is divergent and requires
    //   non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets
    //   sreg_64_xexec(S1) on def of uniform S1 making it lane-mask.
    // - the regular sgpr S1(uniform) instruction is now broken since
    //   it uses sreg_64_xexec(S1) which is divergent.

    // Replace virtual registers with register class on generic instructions
    // uses with virtual registers with register bank. Early-inc iteration is
    // needed because setReg changes the use list being walked.
    for (auto &UseMI : make_early_inc_range(MRI.use_instructions(Reg))) {
      if (UseMI.isPreISelOpcode()) {
        for (MachineOperand &Op : UseMI.operands()) {
          if (Op.isReg() && Op.getReg() == Reg)
            Op.setReg(NewReg);
        }
      }
    }
  }

  /// Makes the use operand \p UseOP of \p MI read a fresh virtual register in
  /// bank \p RB that is a COPY of the original register:
  ///
  ///   %a = G_ ..., %rc
  ///   ->
  ///   %rb:RegBank(s32) = COPY %rc
  ///   %a = G_ ..., %rb
  ///
  /// The COPY is placed directly before \p MI, except when \p MI is a PHI: in
  /// that case it goes after the instruction defining the original register
  /// (past any PHIs and labels), in that instruction's block.
  void constrainRegBankUse(MachineInstr &MI, MachineOperand &UseOP,
                           const RegisterBank *RB) {
    Register Reg = UseOP.getReg();

    LLT Ty = MRI.getType(Reg);
    Register NewReg = MRI.createVirtualRegister({RB, Ty});
    UseOP.setReg(NewReg);

    if (MI.isPHI()) {
      auto DefMI = MRI.getVRegDef(Reg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();
      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
    } else {
      B.setInstr(MI);
    }

    B.buildCopy(NewReg, Reg);
  }
};

} // End anonymous namespace.
/// Returns the register held by \p Op when \p Op is a register operand with a
/// virtual register; otherwise returns a default-constructed Register.
static Register getVReg(MachineOperand &Op) {
  // Operands of COPY and G_SI_CALL can be physical registers.
  const bool HoldsVirtualReg = Op.isReg() && Op.getReg().isVirtual();
  return HoldsVirtualReg ? Op.getReg() : Register();
}
/// Assigns a register bank to every virtual register appearing in a def or
/// use operand of a generic (G_) instruction, and to COPY destinations that
/// have neither a register class nor a bank yet.
///
/// \returns false without touching \p MF when the function is already marked
/// FailedISel; true otherwise.
bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
  const bool ISelAlreadyFailed = MF.getProperties().hasProperty(
      MachineFunctionProperties::Property::FailedISel);
  if (ISelAlreadyFailed)
    return false;

  // Setup the instruction builder with CSE.
  const TargetPassConfig &PassConfig = getAnalysis<TargetPassConfig>();
  GISelCSEAnalysisWrapper &CSEWrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  GISelCSEInfo &CSE = CSEWrapper.get(PassConfig.getCSEConfig());
  GISelObserverWrapper ObserverWrapper;
  ObserverWrapper.addObserver(&CSE);

  CSEMIRBuilder Builder(MF);
  Builder.setCSEInfo(&CSE);
  Builder.setChangeObserver(ObserverWrapper);

  RAIIDelegateInstaller DelegateInstaller(MF, &ObserverWrapper);
  RAIIMFObserverInstaller MFObserverInstaller(MF, ObserverWrapper);

  IntrinsicLaneMaskAnalyzer LaneMasks(MF);
  MachineUniformityInfo &Uniformity =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
  MachineRegisterInfo &RegInfo = *Builder.getMRI();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  RegBankSelectHelper Helper(Builder, LaneMasks, Uniformity,
                             *Subtarget.getRegisterInfo(),
                             *Subtarget.getRegBankInfo());

  // At this point no virtual register has a register bank. Virtual registers
  // in def and use operands of already inst-selected instructions have a
  // register class instead.
  for (MachineBasicBlock &Block : MF) {
    for (MachineInstr &Inst : Block) {
      // A COPY may have vregs with either a register class or a bank. Only a
      // virtual destination that has neither gets a bank here.
      if (Inst.isCopy()) {
        Register CopyDst = getVReg(Inst.getOperand(0));
        if (CopyDst.isValid() && !RegInfo.getRegClassOrNull(CopyDst)) {
          assert(!RegInfo.getRegBankOrNull(CopyDst));
          RegInfo.setRegBank(CopyDst, *Helper.getRegBankToAssign(CopyDst));
        }
        continue;
      }

      // Everything else that is not a generic instruction is left alone.
      if (!Inst.isPreISelOpcode())
        continue;

      // Every vreg in a def or use operand of a G_ instruction needs a bank.
      // Coming into this point the possible situations are:
      // - (1) vreg without register class or bank in def or use operand
      // - (2) vreg with register class in def operand
      // - (3) vreg, defined by G_ instruction, in use operand
      // - (4) vreg, defined by pre-inst-selected instruction, in use operand
      // The walk over def operands below covers the first three: case (1) is
      // a plain setRegBank, cases (2) and (3) go through reAssignRegBankOnDef.
      for (MachineOperand &DefOP : Inst.defs()) {
        Register Dst = getVReg(DefOP);
        if (!Dst.isValid())
          continue;

        const RegisterBank *Bank = Helper.getRegBankToAssign(Dst);
        if (!RegInfo.getRegClassOrNull(Dst)) {
          assert(!RegInfo.getRegBankOrNull(Dst));
          RegInfo.setRegBank(Dst, *Bank);
          continue;
        }

        Helper.reAssignRegBankOnDef(Inst, DefOP, Bank);
      }

      // Pre-inst-selected instructions are never modified by this pass, so
      // case (4) is resolved by inserting a copy via constrainRegBankUse.
      for (MachineOperand &UseOP : Inst.uses()) {
        Register Src = getVReg(UseOP);
        if (!Src.isValid())
          continue;

        // Only a use that has a register class and whose def is not a G_
        // instruction needs the copy; anything else is case (3) and skipped.
        const bool NeedsCopy = RegInfo.getRegClassOrNull(Src) &&
                               !RegInfo.getVRegDef(Src)->isPreISelOpcode();
        if (!NeedsCopy)
          continue;

        const RegisterBank *Bank = Helper.getRegBankToAssign(Src);
        Helper.constrainRegBankUse(Inst, UseOP, Bank);
      }
    }
  }

  return true;
}