lowerKillI1 mishandled the case where it inserted a new S_BRANCH even though the kill was not the only terminator, and then tried to split the block. `SI_KILL_I1_TERMINATOR -1,0` has no effect, so instead of lowering it to an unconditional branch, remove the instruction, and insert an unconditional branch only when the kill is the last terminator. No block split is needed in that case: if the last terminator has been reached, the whole block has already been processed. Also stop generating an unconditional branch in splitBlock: that branch was redundant, since TermMI is promoted to a terminator that already falls through to the next block. Solves SWDEV-508819.
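
A condensed sketch of the revised handling of a no-op kill (this mirrors the updated lowerKillI1 in the file below, using the same names; shown here only to illustrate the change):

    // Static kill that does nothing: just drop it. Only when it is the last
    // terminator must a fall-through branch be materialized in its place.
    bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
    if (!IsLastTerminator) {
      LIS->RemoveMachineInstrFromMaps(MI);
    } else {
      MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                                  .addMBB(*MBB.succ_begin());
      LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
    }
    MBB.remove(&MI);
    return nullptr; // No split point: lowering is complete.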
//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode (strict or non-strict)
/// for pixel shaders, and strict whole wavefront mode for all programs.
///
/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
/// always be enabled irrespective of control flow decisions. Conversely in
/// non-strict WQM inactive lanes may affect control flow decisions.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires strict whole
/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
/// we use a similar save and restore mechanism and force whole quad mode for
/// those instructions:
///
///   S_MOV_B64 Tmp, EXEC
///   S_WQM_B64 EXEC, EXEC
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "SIWholeQuadMode.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

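// Execution states are tracked as a bitmask so that a single instruction can
// be compatible with more than one state at once (e.g. StateExact | StateWQM
// for instructions that do not care which mode they run in).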
enum {
  StateWQM = 0x1,
  StateStrictWWM = 0x2,
  StateStrictWQM = 0x4,
  StateExact = 0x8,
  StateStrict = StateStrictWWM | StateStrictWQM,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {

  static const std::pair<char, const char *> Mapping[] = {
      std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
      std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
  char State = PS.State;
  for (auto M : Mapping) {
    if (State & M.first) {
      OS << M.second;
      State &= ~M.first;

      if (State)
        OS << '|';
    }
  }
  assert(State == 0);
  return OS;
}
#endif

struct InstrInfo {
  char Needs = 0;
  char Disabled = 0;
  char OutNeeds = 0;
  char MarkedStates = 0;
};

struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
  char InitialState = 0;
  bool NeedsLowering = false;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode {
public:
  SIWholeQuadMode(MachineFunction &MF, LiveIntervals *LIS,
                  MachineDominatorTree *MDT, MachinePostDominatorTree *PDT)
      : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
        TRI(&TII->getRegisterInfo()), MRI(&MF.getRegInfo()), LIS(LIS), MDT(MDT),
        PDT(PDT) {}
  bool run(MachineFunction &MF);

private:
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;

  unsigned AndOpc;
  unsigned AndTermOpc;
  unsigned AndN2Opc;
  unsigned XorOpc;
  unsigned AndSaveExecOpc;
  unsigned AndSaveExecTermOpc;
  unsigned WQMOpc;
  Register Exec;
  Register LiveMaskReg;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;

  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
  DenseMap<const MachineInstr *, char> StateTransition;

  SmallVector<MachineInstr *, 2> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs;
  SmallVector<MachineInstr *, 4> KillInstrs;
  SmallVector<MachineInstr *, 4> InitExecInstrs;
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                   std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               Register SaveWQM);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             Register SavedWQM);
  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  void splitBlock(MachineInstr *TermMI);
  MachineInstr *lowerKillI1(MachineInstr &MI, bool IsWQM);
  MachineInstr *lowerKillF32(MachineInstr &MI);

  void lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI);
  void processBlock(MachineBasicBlock &MBB, BlockInfo &BI, bool IsEntry);

  bool lowerLiveMaskQueries();
  bool lowerCopyInstrs();
  bool lowerKillInstrs(bool IsWQM);
  void lowerInitExec(MachineInstr &MI);
  MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
                                                  bool &Changed);
};

class SIWholeQuadModeLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIWholeQuadModeLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervalsWrapperPass>();
    AU.addPreserved<SlotIndexesWrapperPass>();
    AU.addPreserved<LiveIntervalsWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getClearedProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};
} // end anonymous namespace

char SIWholeQuadModeLegacy::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode",
                    false, false)

char &llvm::SIWholeQuadModeID = SIWholeQuadModeLegacy::ID;

FunctionPass *llvm::createSIWholeQuadModeLegacyPass() {
  return new SIWholeQuadModeLegacy;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III != Instructions.end()) {
        dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
               << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
      }
    }
  }
}
#endif

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Capture all states requested in marking including disabled ones.
  II.MarkedStates |= Flag;

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
  II.Needs |= Flag;
  Worklist.emplace_back(&MI);
}

/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               Register Reg, unsigned SubReg, char Flag,
                               std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  const VNInfo *Value = UseLRQ.valueIn();
  if (!Value)
    return;

  // Note: this code assumes that lane masks on AMDGPU completely
  // cover registers.
  const LaneBitmask UseLanes =
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
                                : LaneBitmask::getNone());

  // Perform a depth-first iteration of the LiveRange graph marking defs.
  // Stop processing of a given branch when all use lanes have been defined.
  // The first definition stops processing for a physical register.
  struct PhiEntry {
    const VNInfo *Phi;
    unsigned PredIdx;
    LaneBitmask DefinedLanes;

    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
  };
  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
  SmallVector<PhiEntry, 2> PhiStack;
  SmallSet<VisitKey, 4> Visited;
  LaneBitmask DefinedLanes;
  unsigned NextPredIdx = 0; // Only used for processing phi nodes
  do {
    const VNInfo *NextValue = nullptr;
    const VisitKey Key(Value, DefinedLanes);

    if (Visited.insert(Key).second) {
      // On the first visit to a phi, start processing at the first predecessor
      NextPredIdx = 0;
    }

    if (Value->isPHIDef()) {
      // Each predecessor node in the phi must be processed as a subgraph
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");

      // Find next predecessor to process
      unsigned Idx = NextPredIdx;
      const auto *PI = MBB->pred_begin() + Idx;
      const auto *PE = MBB->pred_end();
      for (; PI != PE && !NextValue; ++PI, ++Idx) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
            NextValue = VN;
        }
      }

      // If there are more predecessors to process, add the phi to the stack
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");

      if (Reg.isVirtual()) {
        // Iterate over all operands to find relevant definitions
        bool HasDef = false;
        for (const MachineOperand &Op : MI->all_defs()) {
          if (Op.getReg() != Reg)
            continue;

          // Compute lanes defined and overlap with use
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
          LaneBitmask Overlap = (UseLanes & OpLanes);

          // Record if this instruction defined any of use
          HasDef |= Overlap.any();

          // Mark any lanes defined
          DefinedLanes |= OpLanes;
        }

        // Check if all lanes of use have been defined
        if ((DefinedLanes & UseLanes) != UseLanes) {
          // Definition not complete; need to process input value
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
              NextValue = VN;
          }
        }

        // Only mark the instruction if it defines some part of the use
        if (HasDef)
          markInstruction(*MI, Flag, Worklist);
      } else {
        // For physical registers simply mark the defining instruction
        markInstruction(*MI, Flag, Worklist);
      }
    }

    if (!NextValue && !PhiStack.empty()) {
      // Reached the end of a chain; revert to processing the last phi
      PhiEntry &Entry = PhiStack.back();
      NextValue = Entry.Phi;
      NextPredIdx = Entry.PredIdx;
      DefinedLanes = Entry.DefinedLanes;
      PhiStack.pop_back();
    }

    Value = NextValue;
  } while (Value);
}

void SIWholeQuadMode::markOperand(const MachineInstr &MI,
                                  const MachineOperand &Op, char Flag,
                                  std::vector<WorkItem> &Worklist) {
  assert(Op.isReg());
  Register Reg = Op.getReg();

  // Ignore some hardware registers
  switch (Reg) {
  case AMDGPU::EXEC:
  case AMDGPU::EXEC_LO:
    return;
  default:
    break;
  }

  LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
                    << " for " << MI);
  if (Reg.isVirtual()) {
    LiveRange &LR = LIS->getInterval(Reg);
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
  } else {
    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
      LiveRange &LR = LIS->getRegUnit(Unit);
      const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
      if (Value)
        markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
    }
  }
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.all_uses())
    markOperand(MI, Use, Flag, Worklist);
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
  bool HasImplicitDerivatives =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT) {
    BlockInfo &BBI = Blocks[MBB];

    for (MachineInstr &MI : *MBB) {
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // If LOD is not supported WQM is not needed.
        // Only generate implicit WQM if implicit derivatives are required.
        // This avoids inserting unintended WQM if a shader type without
        // implicit derivatives uses an image sampling instruction.
        if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
          // Sampling instructions don't need to produce results for all pixels
          // in a quad, they just require all inputs of a quad to have been
          // computed for derivatives.
          markInstructionUses(MI, StateWQM, Worklist);
          GlobalFlags |= StateWQM;
        }
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.insert(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.insert(&MI);
        SoftWQMInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
        // it needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WQM ||
                 TII->isDualSourceBlendEXP(MI)) {
        // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
        // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in
        // quads that have at least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;

        if (Opcode == AMDGPU::STRICT_WQM) {
          LowerToMovInstrs.push_back(&MI);
        } else {
          // Dual source blend export acts as implicit strict-wqm, its sources
          // need to be shuffled in strict wqm, but the export itself needs to
          // run in exact mode.
          BBI.Needs |= StateExact;
          if (!(BBI.InNeeds & StateExact)) {
            BBI.InNeeds |= StateExact;
            Worklist.emplace_back(MBB);
          }
          GlobalFlags |= StateExact;
          III.Disabled = StateWQM | StateStrict;
        }
      } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
                 Opcode == AMDGPU::DS_PARAM_LOAD ||
                 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
                 Opcode == AMDGPU::DS_DIRECT_LOAD) {
        // Mark these STRICTWQM, but only for the instruction, not its operands.
        // This avoids unnecessarily marking M0 as requiring WQM.
        III.Needs |= StateStrictWQM;
        GlobalFlags |= StateStrictWQM;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
        // Disable strict states; StrictWQM will be added as required later.
        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(4);
        if (Inactive.isReg()) {
          if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
            LowerToCopyInstrs.insert(&MI);
          else
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
        }
        SetInactiveInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.emplace_back(MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
      } else if (Opcode == AMDGPU::SI_PS_LIVE ||
                 Opcode == AMDGPU::SI_LIVE_MASK) {
        LiveMaskQueries.push_back(&MI);
      } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                 Opcode == AMDGPU::SI_DEMOTE_I1) {
        KillInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
      } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
                 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
        InitExecInstrs.push_back(&MI);
      } else if (WQMOutputs) {
        // The function is in machine SSA form, which means that physical
        // VGPRs correspond to shader inputs and outputs. Inputs are
        // only used, outputs are only defined.
        // FIXME: is this still valid?
        for (const MachineOperand &MO : MI.defs()) {
          Register Reg = MO.getReg();
          if (Reg.isPhysical() &&
              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
            Flags = StateWQM;
            break;
          }
        }
      }

      if (Flags) {
        markInstruction(MI, Flags, Worklist);
        GlobalFlags |= Flags;
      }
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

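// Propagate the state required by an instruction to the rest of its block,
// to the defining instructions of its uses, and (via InNeeds) towards
// predecessor blocks.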
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.emplace_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.emplace_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.emplace_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.emplace_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.emplace_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

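// Split the block after \p TermMI and rewrite TermMI into its *_term
// equivalent. The _term variants fall through to the next block, so no
// unconditional branch to the split-off block is needed.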
void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
  MachineBasicBlock *BB = TermMI->getParent();
  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
                    << *TermMI << "\n");

  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);

  // Convert last instruction in block to a terminator.
  // Note: this only covers the expected patterns
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  case AMDGPU::S_ANDN2_B32:
    NewOpcode = AMDGPU::S_ANDN2_B32_term;
    break;
  case AMDGPU::S_ANDN2_B64:
    NewOpcode = AMDGPU::S_ANDN2_B64_term;
    break;
  default:
    llvm_unreachable("Unexpected instruction");
  }

  // These terminators fall through to the next block, so there is no need to
  // add an unconditional branch to the next block (SplitBB).
  TermMI->setDesc(TII->get(NewOpcode));

  if (SplitBB != BB) {
    // Update dominator trees
    using DomTreeT = DomTreeBase<MachineBasicBlock>;
    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    for (MachineBasicBlock *Succ : SplitBB->successors()) {
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->applyUpdates(DTUpdates);
    if (PDT)
      PDT->applyUpdates(DTUpdates);
  }
}

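// Lower SI_KILL_F32_COND_IMM_TERMINATOR. Returns the final EXEC update, which
// the caller uses as the block split point.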
MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
  assert(LiveMaskReg.isVirtual());

  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opcode = 0;

  assert(MI.getOperand(0).isReg());

  // Comparison is for live lanes; however here we compute the inverse
  // (killed lanes). This is because VCMP will always generate 0 bits
  // for inactive lanes so a mask of live lanes would not be correct
  // inside control flow.
  // Invert the comparison by swapping the operands and adjusting
  // the comparison codes.

  switch (MI.getOperand(2).getImm()) {
  case ISD::SETUEQ:
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    break;
  case ISD::SETUGT:
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    break;
  case ISD::SETUGE:
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    break;
  case ISD::SETULT:
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    break;
  case ISD::SETULE:
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    break;
  case ISD::SETUNE:
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    break;
  case ISD::SETO:
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    break;
  case ISD::SETUO:
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    break;
  case ISD::SETOEQ:
  case ISD::SETEQ:
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    break;
  case ISD::SETOGT:
  case ISD::SETGT:
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    break;
  case ISD::SETOGE:
  case ISD::SETGE:
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    break;
  case ISD::SETOLT:
  case ISD::SETLT:
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    break;
  case ISD::SETOLE:
  case ISD::SETLE:
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    break;
  case ISD::SETONE:
  case ISD::SETNE:
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    break;
  default:
    llvm_unreachable("invalid ISD:SET cond code");
  }

  MachineBasicBlock &MBB = *MI.getParent();

  // Pick opcode based on comparison type.
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);

  // VCC represents lanes killed.
  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // omod
  }

  MachineInstr *MaskUpdateMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(VCC);

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  MachineInstr *ExecMaskMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);

  assert(MBB.succ_size() == 1);

  // Update live intervals
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);

  return ExecMaskMI;
}

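// Lower SI_KILL_I1_TERMINATOR and SI_DEMOTE_I1. A kill whose immediate
// condition makes it a no-op is simply removed; a branch replaces it only
// when it is the last terminator of the block, in which case the successor
// still needs an explicit jump. Returns the new terminator (the split point)
// or nullptr if no block split is required.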
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
  assert(LiveMaskReg.isVirtual());

  MachineBasicBlock &MBB = *MI.getParent();

  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
      if (!IsLastTerminator) {
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        assert(MBB.succ_size() == 1 && MI.getOpcode() != AMDGPU::SI_DEMOTE_I1);
        MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                                    .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return nullptr;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(AndN2Opc), TmpReg).addReg(Exec).add(Op);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // If we got this far, some lanes are still live;
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demote - deactivate quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kill - deactivate lanes no longer in live mask
    if (Op.isImm()) {
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}

// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) {
  if (!BI.NeedsLowering)
    return;

  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");

  SmallVector<MachineInstr *, 4> SplitPoints;
  Register ActiveLanesReg = 0;
  char State = BI.InitialState;

  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
    auto MIState = StateTransition.find(&MI);
    if (MIState != StateTransition.end())
      State = MIState->second;

    MachineInstr *SplitPoint = nullptr;
    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MI);
      break;
    case AMDGPU::ENTER_STRICT_WWM:
      ActiveLanesReg = MI.getOperand(0).getReg();
      break;
    case AMDGPU::EXIT_STRICT_WWM:
      ActiveLanesReg = 0;
      break;
    case AMDGPU::V_SET_INACTIVE_B32:
      if (ActiveLanesReg) {
        LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
        MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
        MI.getOperand(5).setReg(ActiveLanesReg);
        LIS->shrinkToUses(&LI);
      } else {
        assert(State == StateExact || State == StateWQM);
      }
      break;
    default:
      break;
    }
    if (SplitPoint)
      SplitPoints.push_back(SplitPoint);
  }

  // Perform splitting after instruction scan to simplify iteration.
  for (MachineInstr *MI : SplitPoints)
    splitBlock(MI);
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR =
      LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->all_defs()) {
      IsExecDef |=
          MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
    }
    if (!IsExecDef)
      break;
    MBBI++;
    S = nullptr;
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SaveWQM) {
  assert(LiveMaskReg.isVirtual());

  bool IsTerminator = Before == MBB.end();
  if (!IsTerminator) {
    auto FirstTerm = MBB.getFirstTerminator();
    if (FirstTerm != MBB.end()) {
      SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
      SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
      IsTerminator = BeforeIdx > FirstTermIdx;
    }
  }

  MachineInstr *MI;

  if (SaveWQM) {
    unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateExact;
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            Register SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateWQM;
}

void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator Before,
                                   Register SaveOrig, char StrictStateNeeded) {
  MachineInstr *MI;
  assert(SaveOrig);
  assert(StrictStateNeeded == StateStrictWWM ||
         StrictStateNeeded == StateStrictWQM);

  if (StrictStateNeeded == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
                 SaveOrig)
             .addImm(-1);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
                 SaveOrig)
             .addImm(-1);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StrictStateNeeded;
}

void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Before,
                                     Register SavedOrig, char NonStrictState,
                                     char CurrentStrictState) {
  MachineInstr *MI;

  assert(SavedOrig);
  assert(CurrentStrictState == StateStrictWWM ||
         CurrentStrictState == StateStrictWQM);

  if (CurrentStrictState == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
                 Exec)
             .addReg(SavedOrig);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
                 Exec)
             .addReg(SavedOrig);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = NonStrictState;
}

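// Walk the block forward, lazily inserting the EXEC manipulation needed to
// switch between Exact, WQM and Strict modes at the points where the
// required state changes.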
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI,
                                   bool IsEntry) {
  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;
  Register SavedNonStrictReg;
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY &&
        II->getOperand(1).getReg() == TRI->getExec())
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from Strict
  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
  // be safe to switch to/from WQM as well.
  MachineBasicBlock::iterator FirstStrict = IE;

  // Record the initial state in the block information.
  BI.InitialState = State;

  for (unsigned Idx = 0;; ++Idx) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // Adjust needs if this is the first instruction of a WQM-requiring shader.
    if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
      Needs = StateWQM;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      // Exact mode exit can occur in terminators, but must be before branches.
      if (MI.isBranch() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
        break;
      }
      char StartState = State & StateStrict ? NonStrictState : State;
      bool WQMToExact =
          StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
      bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
                        !(Needs & StateExact);
      bool PreferLast = Needs == StateWQM;
      // Exact regions in divergent control flow may run at EXEC=0, so try to
      // exclude instructions with unexpected effects from them.
      // FIXME: ideally we would branch over these when EXEC=0,
      // but this requires updating implicit values, live intervals and CFG.
      if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
        for (MachineBasicBlock::iterator I = First; I != II; ++I) {
          if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
            PreferLast = WQMToExact;
            break;
          }
        }
      }
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, PreferLast, SaveSCC);

      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;
      } else {
        if (WQMToExact) {
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (ExactToWQM) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, but we
          // shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}

bool SIWholeQuadMode::lowerLiveMaskQueries() {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();

    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
  return !LiveMaskQueries.empty();
}

bool SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec (like all VALU movs
      // should do).
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    LLVM_DEBUG(dbgs() << "simplify: " << *MI);

    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
      assert(MI->getNumExplicitOperands() == 6);

      LiveInterval *RecomputeLI = nullptr;
      if (MI->getOperand(4).isReg())
        RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());

      MI->removeOperand(5);
      MI->removeOperand(4);
      MI->removeOperand(3);
      MI->removeOperand(1);

      if (RecomputeLI)
        LIS->shrinkToUses(RecomputeLI);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    unsigned CopyOp = MI->getOperand(1).isReg()
                          ? (unsigned)AMDGPU::COPY
                          : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                *MRI, MI->getOperand(0)));
    MI->setDesc(TII->get(CopyOp));
    LLVM_DEBUG(dbgs() << "  -> " << *MI);
  }
  return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
}

bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  for (MachineInstr *MI : KillInstrs) {
    MachineInstr *SplitPoint = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MI);
      break;
    }
    if (SplitPoint)
      splitBlock(SplitPoint);
  }
  return !KillInstrs.empty();
}

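// Lower SI_INIT_EXEC, SI_INIT_EXEC_FROM_INPUT and SI_INIT_WHOLE_WAVE by
// materializing the requested initial EXEC mask at the start of the block,
// ahead of any vector instructions.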
void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();
  bool IsWave32 = ST->isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
    assert(MBB == &MBB->getParent()->front() &&
           "init whole wave not in entry block");
    Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *SaveExec =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
                                  : AMDGPU::S_OR_SAVEEXEC_B64),
                EntryExec)
            .addImm(-1);

    // Replace all uses of MI's destination reg with EntryExec.
    MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);

    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
    }

    MI.eraseFromParent();

    if (LIS) {
      LIS->InsertMachineInstrInMaps(*SaveExec);
      LIS->createAndComputeVirtRegInterval(EntryExec);
    }
    return;
  }

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    MachineInstr *InitMI =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
                Exec)
            .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);
    }
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If the `InputReg` is defined in the current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If the first instruction is the definition, then move the pointer
        // after it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST->getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
/// for instructions that depend on EXEC.
MachineBasicBlock::iterator
SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
  MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();

  for (MachineInstr *MI : InitExecInstrs) {
    // Try to handle undefined cases gracefully:
    // - multiple INIT_EXEC instructions
    // - INIT_EXEC instructions not in the entry block
    if (MI->getParent() == &Entry)
      InsertPt = std::next(MI->getIterator());

    lowerInitExec(*MI);
    Changed = true;
  }

  return InsertPt;
}

bool SIWholeQuadMode::run(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  InitExecInstrs.clear();
  SetInactiveInstrs.clear();
  StateTransition.clear();

  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndTermOpc = AMDGPU::S_AND_B32_term;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndTermOpc = AMDGPU::S_AND_B64_term;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  const char GlobalFlags = analyzeFunction(MF);
  bool Changed = false;

  LiveMaskReg = Exec;

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);

  // Store a copy of the original live mask when required
  const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
  const bool HasWaveModes = GlobalFlags & ~StateExact;
  const bool HasKills = !KillInstrs.empty();
  const bool UsesWQM = GlobalFlags & StateWQM;
  if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    Changed = true;
  }

  // Check if V_SET_INACTIVE was touched by a strict state mode.
  // If so, promote to WWM; otherwise lower to COPY.
  for (MachineInstr *MI : SetInactiveInstrs) {
    if (LowerToCopyInstrs.contains(MI))
      continue;
    auto &Info = Instructions[MI];
    if (Info.MarkedStates & StateStrict) {
      Info.Needs |= StateStrictWWM;
      Info.Disabled &= ~StateStrictWWM;
      Blocks[MI->getParent()].Needs |= StateStrictWWM;
    } else {
      LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
      LowerToCopyInstrs.insert(MI);
    }
  }

  LLVM_DEBUG(printInfo());

  Changed |= lowerLiveMaskQueries();
  Changed |= lowerCopyInstrs();

  if (!HasWaveModes) {
    // No wave mode execution
    Changed |= lowerKillInstrs(false);
  } else if (GlobalFlags == StateWQM) {
    // Shader only needs WQM
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
    Changed = true;
  } else {
    // Mark entry for WQM if required.
    if (GlobalFlags & StateWQM)
      Blocks[&Entry].InNeeds |= StateWQM;
    // Wave mode switching requires full lowering pass.
    for (auto &BII : Blocks)
      processBlock(*BII.first, BII.second, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto &BII : Blocks)
      lowerBlock(*BII.first, BII.second);
    Changed = true;
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);

  // If we performed any kills then recompute EXEC
  if (!KillInstrs.empty() || !InitExecInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);

  return Changed;
}

bool SIWholeQuadModeLegacy::runOnMachineFunction(MachineFunction &MF) {
  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
  MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
  auto *PDTWrapper =
      getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
  MachinePostDominatorTree *PDT =
      PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
  SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
  return Impl.run(MF);
}

PreservedAnalyses
SIWholeQuadModePass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  MFPropsModifier _(*this, MF);

  LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
  MachineDominatorTree *MDT =
      MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
  MachinePostDominatorTree *PDT =
      MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
  SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
  bool Changed = Impl.run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserve<SlotIndexesAnalysis>();
  PA.preserve<LiveIntervalsAnalysis>();
  PA.preserve<MachineDominatorTreeAnalysis>();
  PA.preserve<MachinePostDominatorTreeAnalysis>();
  return PA;
}