clang-p2996/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
Petar Avramovic 0ee037b861 AMDGPU/GlobalISel: AMDGPURegBankLegalize (#112864)
Lower G_ instructions that can't be inst-selected with the register bank
assignment from AMDGPURegBankSelect, which is based on uniformity analysis.
- Lower the instruction so that it can be performed on the assigned
  register bank
- Put a uniform value in vgpr when no SALU instruction is available
  (sketched below)
- Execute a divergent instruction on the SALU via a "waterfall loop"
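As a rough illustration of the second bullet, here is a minimal,
hypothetical MachineIRBuilder sketch (not the pass's actual code; the
helper name and surrounding setup are assumptions): uniform f32 inputs
stay in sgpr, but since there is no scalar FP add, they are copied into
vgpr so a VALU G_FADD can consume them.

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
using namespace llvm;

// Hypothetical helper: the inputs are uniform and live in sgpr, but the
// G_FADD must execute on the VALU, so copy the inputs to vgpr first.
static Register buildUniformFAddOnVALU(MachineIRBuilder &B,
                                       const RegisterBank &VgprRB,
                                       Register SgprLHS, Register SgprRHS) {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT S32 = LLT::scalar(32);

  Register VgprLHS = MRI.createGenericVirtualRegister(S32);
  Register VgprRHS = MRI.createGenericVirtualRegister(S32);
  Register VgprDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegBank(VgprLHS, VgprRB);
  MRI.setRegBank(VgprRHS, VgprRB);
  MRI.setRegBank(VgprDst, VgprRB);

  // Uniform values moved to vgpr because no SALU instruction is available.
  B.buildCopy(VgprLHS, SgprLHS);
  B.buildCopy(VgprRHS, SgprRHS);
  return B.buildFAdd(VgprDst, VgprLHS, VgprRHS).getReg(0);
}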

Given the LLTs on all operands after the legalizer, some register bank
assignments require lowering while others do not.
Note: cases where all register bank assignments would require lowering
are lowered in the legalizer.

AMDGPURegBankLegalize goals:
- Define Rules: when and how to perform lowering
- Goal of defining Rules it to provide high level table-like brief
  overview of how to lower generic instructions based on available
  target features and uniformity info (uniform vs divergent).
- Fast search of Rules, depends on how complicated Rule.Predicate is
- For some opcodes there would be too many Rules that are essentially
  all the same just for different combinations of types and banks.
  Write custom function that handles all cases.
- Rules are made from enum IDs that correspond to each operand.
  Names of IDs are meant to give brief description what lowering does
  for each operand or the whole instruction.
- AMDGPURegBankLegalizeHelper implements lowering algorithms
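The in-tree Rule definitions live alongside AMDGPURegBankLegalize; the
following is only an illustrative, self-contained sketch of the
table-like idea (all names here are hypothetical, not the actual API):

#include <cstdint>
#include <vector>

// Hypothetical operand IDs: each name briefly says what the lowering does
// for that operand (keep in sgpr, keep in vgpr, widen-then-truncate, ...).
enum class OperandLoweringID : uint8_t {
  Sgpr32,      // assign a 32-bit sgpr
  Vgpr32,      // assign a 32-bit vgpr
  Sgpr32Trunc, // compute in a 32-bit sgpr, then truncate to the narrow type
  VccPred,     // lane-mask (vcc) predicate
};

// Hypothetical Rule: a predicate over uniformity and type, plus one ID per
// def/use operand describing how to lower it.
struct Rule {
  bool Uniform;                            // predicate: is the def uniform?
  unsigned SizeInBits;                     // predicate: scalar size of the def
  std::vector<OperandLoweringID> Operands; // mapping applied on a match
};

// One small table per opcode; the first matching Rule wins, so lookup cost
// depends on how many Rules, and how complex a predicate, an opcode needs.
using RuleSet = std::vector<Rule>;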

Since this is the first patch that actually enables -new-reg-bank-select,
here is a summary of the regression tests that were added earlier:
- if an instruction is uniform, always select the SALU instruction if
  available
- eliminate back-to-back vgpr-to-sgpr-to-vgpr copies of uniform values
- fast rules: small differences between standard and vector instructions
- enabling a Rule based on a target feature - salu_float
- how to specify a lowering algorithm - vgpr S64 AND to S32 (a sketch
  follows this list)
- for G_TRUNC in reg, it is up to the user to deal with the truncated
  bits; G_TRUNC in reg is treated as a no-op
- dealing with truncated high bits - ABS S16 to S32
- sgpr S1 phi lowering
- new opcodes for vcc-to-scc and scc-to-vcc copies
- lowering for the vgpr-S1-to-vcc copy (formally, this is a vgpr-to-vcc
  G_TRUNC)
- S1 zext and sext lowering to select
- uniform and divergent S1 AND (OR and XOR) lowering - inst-selected into
  a SALU instruction
- divergent phi with uniform inputs
- divergent instruction with a temporal divergent use, where the source
  instruction is defined as uniform (AMDGPURegBankSelect) - missing
  temporal divergence lowering
- uniform phi that, because of an undef incoming value, is assigned to
  vgpr. Will be fixed in AMDGPURegBankSelect via another fix in machine
  uniformity analysis.
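For the "vgpr S64 AND to S32" item above, here is a minimal sketch of the
lowering, assuming the builder's insertion point is at the original G_AND
(simplified: register-bank bookkeeping is omitted, and the helper name is
an assumption; the in-tree algorithm lives in AMDGPURegBankLegalizeHelper):

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
using namespace llvm;

// Hypothetical helper: split a vgpr S64 G_AND into two S32 G_ANDs, since
// the VALU only has a 32-bit v_and_b32.
static void lowerVgprS64And(MachineIRBuilder &B, MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT S32 = LLT::scalar(32);

  // Split both 64-bit operands into 32-bit halves.
  auto UnmergeLHS = B.buildUnmerge(S32, LHS);
  auto UnmergeRHS = B.buildUnmerge(S32, RHS);

  // AND the low and high halves separately.
  auto Lo = B.buildAnd(S32, UnmergeLHS.getReg(0), UnmergeRHS.getReg(0));
  auto Hi = B.buildAnd(S32, UnmergeLHS.getReg(1), UnmergeRHS.getReg(1));

  // Recombine into the original 64-bit destination and drop the old G_AND.
  B.buildMergeLikeInstr(Dst, {Lo.getReg(0), Hi.getReg(0)});
  MI.eraseFromParent();
}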

//===- AMDGPUGlobalISelUtils.cpp ---------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;
using namespace AMDGPU;
using namespace MIPatternMatch;

std::pair<Register, unsigned>
AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
                                  GISelKnownBits *KnownBits, bool CheckNUW) {
  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
  if (Def->getOpcode() == TargetOpcode::G_CONSTANT) {
    unsigned Offset;
    const MachineOperand &Op = Def->getOperand(1);
    if (Op.isImm())
      Offset = Op.getImm();
    else
      Offset = Op.getCImm()->getZExtValue();

    return std::pair(Register(), Offset);
  }

  int64_t Offset;
  if (Def->getOpcode() == TargetOpcode::G_ADD) {
    // A 32-bit (address + offset) should not cause unsigned 32-bit integer
    // wraparound, because s_load instructions perform the addition in 64 bits.
    if (CheckNUW && !Def->getFlag(MachineInstr::NoUWrap)) {
      assert(MRI.getType(Reg).getScalarSizeInBits() == 32);
      return std::pair(Reg, 0);
    }

    // TODO: Handle G_OR used for add case
    if (mi_match(Def->getOperand(2).getReg(), MRI, m_ICst(Offset)))
      return std::pair(Def->getOperand(1).getReg(), Offset);

    // FIXME: matcher should ignore copies
    if (mi_match(Def->getOperand(2).getReg(), MRI, m_Copy(m_ICst(Offset))))
      return std::pair(Def->getOperand(1).getReg(), Offset);
  }

  Register Base;
  if (KnownBits && mi_match(Reg, MRI, m_GOr(m_Reg(Base), m_ICst(Offset))) &&
      KnownBits->maskedValueIsZero(Base, APInt(32, Offset)))
    return std::pair(Base, Offset);

  // Handle G_PTRTOINT (G_PTR_ADD base, const) case
  if (Def->getOpcode() == TargetOpcode::G_PTRTOINT) {
    MachineInstr *Base;
    if (mi_match(Def->getOperand(1).getReg(), MRI,
                 m_GPtrAdd(m_MInstr(Base), m_ICst(Offset)))) {
      // If Base was int converted to pointer, simply return int and offset.
      if (Base->getOpcode() == TargetOpcode::G_INTTOPTR)
        return std::pair(Base->getOperand(1).getReg(), Offset);

      // Register returned here will be of pointer type.
      return std::pair(Base->getOperand(0).getReg(), Offset);
    }
  }

  return std::pair(Reg, 0);
}

IntrinsicLaneMaskAnalyzer::IntrinsicLaneMaskAnalyzer(MachineFunction &MF)
    : MRI(MF.getRegInfo()) {
  initLaneMaskIntrinsics(MF);
}

bool IntrinsicLaneMaskAnalyzer::isS32S64LaneMask(Register Reg) const {
  return S32S64LaneMask.contains(Reg);
}

void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI);
      if (GI && GI->is(Intrinsic::amdgcn_if_break)) {
        S32S64LaneMask.insert(MI.getOperand(3).getReg());
        findLCSSAPhi(MI.getOperand(0).getReg());
      }

      if (MI.getOpcode() == AMDGPU::SI_IF ||
          MI.getOpcode() == AMDGPU::SI_ELSE) {
        findLCSSAPhi(MI.getOperand(0).getReg());
      }
    }
  }
}

void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
  S32S64LaneMask.insert(Reg);
  for (const MachineInstr &LCSSAPhi : MRI.use_instructions(Reg)) {
    if (LCSSAPhi.isPHI())
      S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
  }
}

static LLT getReadAnyLaneSplitTy(LLT Ty) {
  if (Ty.isVector()) {
    LLT ElTy = Ty.getElementType();
    if (ElTy.getSizeInBits() == 16)
      return LLT::fixed_vector(2, ElTy);
    // S32, S64 or pointer
    return ElTy;
  }

  // Large scalars and 64-bit pointers
  return LLT::scalar(32);
}

static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
                                 const RegisterBankInfo &RBI);

static void unmergeReadAnyLane(MachineIRBuilder &B,
                               SmallVectorImpl<Register> &SgprDstParts,
                               LLT UnmergeTy, Register VgprSrc,
                               const RegisterBankInfo &RBI) {
  const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID);
  auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc);
  for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
    SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI));
  }
}

static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
                                 const RegisterBankInfo &RBI) {
  LLT Ty = B.getMRI()->getType(VgprSrc);
  const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID);
  if (Ty.getSizeInBits() == 32) {
    return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc})
        .getReg(0);
  }

  SmallVector<Register, 8> SgprDstParts;
  unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
  return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0);
}

void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
                              Register VgprSrc, const RegisterBankInfo &RBI) {
  LLT Ty = B.getMRI()->getType(VgprSrc);
  if (Ty.getSizeInBits() == 32) {
    B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
    return;
  }

  SmallVector<Register, 8> SgprDstParts;
  unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
  B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0);
}