//=== AMDGPUPostLegalizerCombiner.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
MachineIRBuilder &B;
MachineFunction &MF;
MachineRegisterInfo &MRI;
  AMDGPUCombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
                                    AMDGPUCombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
Register LHS;
Register RHS;
Register True;
Register False;
CmpInst::Predicate Pred;
};
// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
const FMinFMaxLegacyInfo &Info);
bool matchUCharToFloat(MachineInstr &MI);
void applyUCharToFloat(MachineInstr &MI);
bool matchRcpSqrtToRsq(MachineInstr &MI,
std::function<void(MachineIRBuilder &)> &MatchInfo);
// FIXME: Should be able to have 2 separate matchdatas rather than custom
// struct boilerplate.
struct CvtF32UByteMatchInfo {
Register CvtVal;
unsigned ShiftOffset;
};
bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
void applyCvtF32UByteN(MachineInstr &MI,
const CvtF32UByteMatchInfo &MatchInfo);
bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
// FIXME: Type predicate on pattern
if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
return false;
Register Cond = MI.getOperand(1).getReg();
if (!MRI.hasOneNonDBGUse(Cond) ||
!mi_match(Cond, MRI,
m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
return false;
Info.True = MI.getOperand(2).getReg();
Info.False = MI.getOperand(3).getReg();
if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
!(Info.LHS == Info.False && Info.RHS == Info.True))
return false;
switch (Info.Pred) {
case CmpInst::FCMP_FALSE:
case CmpInst::FCMP_OEQ:
case CmpInst::FCMP_ONE:
case CmpInst::FCMP_ORD:
case CmpInst::FCMP_UNO:
case CmpInst::FCMP_UEQ:
case CmpInst::FCMP_UNE:
case CmpInst::FCMP_TRUE:
return false;
default:
return true;
}
}
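
// Illustrative rewrite performed by the apply function below (register names
// are hypothetical). For an unordered less-than where the LHS is selected on
// true:
//   %c:_(s1)  = G_FCMP floatpred(ult), %x:_(s32), %y:_(s32)
//   %d:_(s32) = G_SELECT %c, %x, %y
// becomes
//   %d:_(s32) = G_AMDGPU_FMIN_LEGACY %y, %x
// The legacy min/max return their second operand whenever the underlying
// compare fails (as it does for NaN inputs), so the operands are swapped
// where needed to preserve the select's NaN behavior.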
void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
B.setInstrAndDebugLoc(MI);
auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
};
switch (Info.Pred) {
case CmpInst::FCMP_ULT:
case CmpInst::FCMP_ULE:
if (Info.LHS == Info.True)
buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
else
buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
break;
case CmpInst::FCMP_OLE:
case CmpInst::FCMP_OLT: {
// We need to permute the operands to get the correct NaN behavior. The
// selected operand is the second one based on the failing compare with NaN,
// so permute it based on the compare type the hardware uses.
if (Info.LHS == Info.True)
buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
else
buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
break;
}
case CmpInst::FCMP_UGE:
case CmpInst::FCMP_UGT: {
if (Info.LHS == Info.True)
buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
else
buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
break;
}
case CmpInst::FCMP_OGT:
case CmpInst::FCMP_OGE: {
if (Info.LHS == Info.True)
buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
else
buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
break;
}
default:
llvm_unreachable("predicate should not have matched");
}
MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
Register DstReg = MI.getOperand(0).getReg();
// TODO: We could try to match extracting the higher bytes, which would be
// easier if i8 vectors weren't promoted to i32 vectors, particularly after
// types are legalized. v4i8 -> v4f32 is probably the only case to worry
// about in practice.
LLT Ty = MRI.getType(DstReg);
if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
Register SrcReg = MI.getOperand(1).getReg();
unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
}
return false;
}
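
// A sketch of the match above (illustrative MIR, hypothetical registers): if
// the top 24 bits of %x:_(s32) are known to be zero, then
//   %f:_(s32) = G_UITOFP %x
// only ever converts the low byte, and applyUCharToFloat below rewrites it as
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %x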
void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
B.setInstrAndDebugLoc(MI);
const LLT S32 = LLT::scalar(32);
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(DstReg);
LLT SrcTy = MRI.getType(SrcReg);
if (SrcTy != S32)
SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
if (Ty == S32) {
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
{SrcReg}, MI.getFlags());
} else {
auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
{SrcReg}, MI.getFlags());
B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
}
MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
auto getRcpSrc = [=](const MachineInstr &MI) {
MachineInstr *ResMI = nullptr;
if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
ResMI = MRI.getVRegDef(MI.getOperand(2).getReg());
return ResMI;
};
auto getSqrtSrc = [=](const MachineInstr &MI) {
MachineInstr *SqrtSrcMI = nullptr;
auto Match =
mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
(void)Match;
return SqrtSrcMI;
};
MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
// rcp(sqrt(x))
if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
.addUse(SqrtSrcMI->getOperand(0).getReg())
.setMIFlags(MI.getFlags());
};
return true;
}
// sqrt(rcp(x))
if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
.addUse(RcpSrcMI->getOperand(0).getReg())
.setMIFlags(MI.getFlags());
};
return true;
}
return false;
}
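
// Both compositions fold to the same intrinsic (illustrative, %x is
// hypothetical):
//   rcp(sqrt(%x)) -> rsq(%x)   since 1.0 / sqrt(x) == rsq(x)
//   sqrt(rcp(%x)) -> rsq(%x)   since sqrt(1.0 / x) == 1.0 / sqrt(x) for x > 0
// MI's flags are propagated onto the new G_INTRINSIC via setMIFlags.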
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
Register SrcReg = MI.getOperand(1).getReg();
  // Look through G_ZEXT; on a successful match this rewrites SrcReg in place
  // to the pre-extension value, so the result is intentionally discarded.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
unsigned ShiftOffset = 8 * Offset;
if (IsShr)
ShiftOffset += ShiftAmt;
else
ShiftOffset -= ShiftAmt;
MatchInfo.CvtVal = Src0;
MatchInfo.ShiftOffset = ShiftOffset;
return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
}
// TODO: Simplify demanded bits.
return false;
}
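
// Worked example of the offset arithmetic above (illustrative MIR,
// hypothetical registers): G_AMDGPU_CVT_F32_UBYTE1 selects byte 1, i.e. a
// base offset of 8 bits, and looking through a right-shift by 16 adds 16
// more, so
//   %s:_(s32) = G_LSHR %x:_(s32), 16
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE1 %s
// folds to
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE3 %x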
void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
B.setInstrAndDebugLoc(MI);
unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
const LLT S32 = LLT::scalar(32);
Register CvtSrc = MatchInfo.CvtVal;
LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
if (SrcTy != S32) {
assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
}
assert(MI.getOpcode() != NewOpc);
B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
MachineInstr &MI, Register &Reg) {
const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
MF.getSubtarget().getTargetLowering());
Reg = MI.getOperand(1).getReg();
return TLI->isCanonicalized(Reg, MF);
}
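
// In other words (illustrative, hypothetical registers): when the source of
//   %y:_(s32) = G_FCANONICALIZE %x:_(s32)
// is already canonical according to SITargetLowering::isCanonicalized, the
// canonicalize is a no-op and every use of %y can simply use %x.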
class AMDGPUPostLegalizerCombinerHelperState {
protected:
AMDGPUCombinerHelper &Helper;
AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;
  // Note: a pointer is necessary here because the generated Target
  // Predicates spell their checks as "Subtarget->...".
  const GCNSubtarget *Subtarget;
public:
AMDGPUPostLegalizerCombinerHelperState(
AMDGPUCombinerHelper &Helper,
AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper,
const GCNSubtarget &Subtarget)
: Helper(Helper), PostLegalizerHelper(PostLegalizerHelper),
Subtarget(&Subtarget) {}
};
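
// A minimal sketch of why that pointer matters (hypothetical feature name):
// the rule-matching code emitted into AMDGPUGenPostLegalizeGICombiner.inc
// checks a rule's Predicates with expressions written against "Subtarget->",
// of roughly the form
//   if (!Subtarget->hasSomeFeature())
//     return false;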
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
GISelKnownBits *KB;
MachineDominatorTree *MDT;
const GCNSubtarget &Subtarget;
public:
AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
AMDGPUPostLegalizerCombinerInfo(const GCNSubtarget &Subtarget, bool EnableOpt,
bool OptSize, bool MinSize,
const AMDGPULegalizerInfo *LI,
GISelKnownBits *KB, MachineDominatorTree *MDT)
: CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
/*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
KB(KB), MDT(MDT), Subtarget(Subtarget) {
if (!GeneratedRuleCfg.parseCommandLineOption())
report_fatal_error("Invalid rule identifier");
}
bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
MachineInstr &MI,
MachineIRBuilder &B) const {
AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ false, KB, MDT,
LInfo);
AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
AMDGPUGenPostLegalizerCombinerHelper Generated(
GeneratedRuleCfg, Helper, PostLegalizerHelper, Subtarget);
if (Generated.tryCombineAll(Observer, MI, B))
return true;
switch (MI.getOpcode()) {
case TargetOpcode::G_SHL:
case TargetOpcode::G_LSHR:
case TargetOpcode::G_ASHR:
// On some subtargets, 64-bit shift is a quarter rate instruction. In the
// common case, splitting this into a move and a 32-bit shift is faster and
// the same code size.
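    // For instance (illustrative, hypothetical registers), on such targets
    //   %d:_(s64) = G_SHL %x:_(s64), 33
    // becomes an unmerge, a 32-bit shift of the low half, and a remerge:
    //   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %x
    //   %nhi:_(s32) = G_SHL %lo, 1
    //   %zero:_(s32) = G_CONSTANT i32 0
    //   %d:_(s64) = G_MERGE_VALUES %zero, %nhi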
return Helper.tryCombineShiftToUnmerge(MI, 32);
}
return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================
class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
static char ID;
AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
StringRef getPassName() const override {
return "AMDGPUPostLegalizerCombiner";
}
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetPassConfig>();
AU.setPreservesCFG();
getSelectionDAGFallbackAnalysisUsage(AU);
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
if (!IsOptNone) {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
}
MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
: MachineFunctionPass(ID), IsOptNone(IsOptNone) {
initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
if (MF.getProperties().hasProperty(
MachineFunctionProperties::Property::FailedISel))
return false;
auto *TPC = &getAnalysis<TargetPassConfig>();
const Function &F = MF.getFunction();
bool EnableOpt =
MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const AMDGPULegalizerInfo *LI
= static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
MachineDominatorTree *MDT =
IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
AMDGPUPostLegalizerCombinerInfo PCInfo(ST, EnableOpt, F.hasOptSize(),
F.hasMinSize(), LI, KB, MDT);
Combiner C(PCInfo, TPC);
return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
"Combine AMDGPU machine instrs after legalization",
false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
"Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm