243 lines
8.6 KiB
C++
243 lines
8.6 KiB
C++
//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU
/// instructions that produce single-use VGPR values. If the value is forwarded
/// to the consumer instruction prior to VGPR writeback, the hardware can
/// then skip (kill) the VGPR write.
//
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "SIInstrInfo.h"
|
|
#include "SIRegisterInfo.h"
|
|
#include "llvm/ADT/DenseMap.h"
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/ADT/StringRef.h"
|
|
#include "llvm/CodeGen/MachineBasicBlock.h"
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
#include "llvm/CodeGen/MachineInstr.h"
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
#include "llvm/CodeGen/MachineOperand.h"
|
|
#include "llvm/CodeGen/Register.h"
|
|
#include "llvm/IR/DebugLoc.h"
|
|
#include "llvm/MC/MCRegister.h"
|
|
#include "llvm/MC/MCRegisterInfo.h"
|
|
#include "llvm/Pass.h"
|
|
#include <array>
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "amdgpu-insert-single-use-vdst"
|
|
|
|
namespace {
|
|
class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
|
|
private:
|
|
const SIInstrInfo *SII;
|
|
class SingleUseInstruction {
|
|
private:
|
|
static const unsigned MaxSkipRange = 0b111;
|
|
static const unsigned MaxNumberOfSkipRegions = 2;
|
|
|
|
unsigned LastEncodedPositionEnd;
|
|
MachineInstr *ProducerInstr;
|
|
|
|
std::array<unsigned, MaxNumberOfSkipRegions + 1> SingleUseRegions;
|
|
SmallVector<unsigned, MaxNumberOfSkipRegions> SkipRegions;
|
|
|
|
// Adds a skip region into the instruction.
|
|
void skip(const unsigned ProducerPosition) {
|
|
while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) {
|
|
SkipRegions.push_back(MaxSkipRange);
|
|
LastEncodedPositionEnd += MaxSkipRange;
|
|
}
|
|
SkipRegions.push_back(ProducerPosition - LastEncodedPositionEnd);
|
|
LastEncodedPositionEnd = ProducerPosition;
|
|
}
|
|
|
|
bool currentRegionHasSpace() {
|
|
const auto Region = SkipRegions.size();
|
|
// The first region has an extra bit of encoding space.
|
|
return SingleUseRegions[Region] <
|
|
((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U);
|
|
}
|
|
|
|
unsigned encodeImm() {
|
|
// Handle the first Single Use Region separately as it has an extra bit
|
|
// of encoding space.
|
|
unsigned Imm = SingleUseRegions[SkipRegions.size()];
|
|
unsigned ShiftAmount = 4;
|
|
for (unsigned i = SkipRegions.size(); i > 0; i--) {
|
|
Imm |= SkipRegions[i - 1] << ShiftAmount;
|
|
ShiftAmount += 3;
|
|
Imm |= SingleUseRegions[i - 1] << ShiftAmount;
|
|
ShiftAmount += 3;
|
|
}
|
|
return Imm;
|
|
}
|
|
|
|
public:
|
|
SingleUseInstruction(const unsigned ProducerPosition,
|
|
MachineInstr *Producer)
|
|
: LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer),
|
|
SingleUseRegions({1, 0, 0}) {}
|
|
|
|
// Returns false if adding a new single use producer failed. This happens
|
|
// because it could not be encoded, either because there is no room to
|
|
// encode another single use producer region or that this single use
|
|
// producer is too far away to encode the amount of instructions to skip.
|
|
bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) {
|
|
// Producer is too far away to encode into this instruction or another
|
|
// skip region is needed and SkipRegions.size() = 2 so there's no room for
|
|
// another skip region, therefore a new instruction is needed.
|
|
if (LastEncodedPositionEnd +
|
|
(MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) <
|
|
ProducerPosition)
|
|
return false;
|
|
|
|
// If a skip region is needed.
|
|
if (LastEncodedPositionEnd != ProducerPosition ||
|
|
!currentRegionHasSpace()) {
|
|
// If the current region is out of space therefore a skip region would
|
|
// be needed, but there is no room for another skip region.
|
|
if (SkipRegions.size() == MaxNumberOfSkipRegions)
|
|
return false;
|
|
skip(ProducerPosition);
|
|
}
|
|
|
|
SingleUseRegions[SkipRegions.size()]++;
|
|
LastEncodedPositionEnd = ProducerPosition + 1;
|
|
ProducerInstr = MI;
|
|
return true;
|
|
}
|
|
|
|
auto emit(const SIInstrInfo *SII) {
|
|
return BuildMI(*ProducerInstr->getParent(), ProducerInstr, DebugLoc(),
|
|
SII->get(AMDGPU::S_SINGLEUSE_VDST))
|
|
.addImm(encodeImm());
|
|
}
|
|
};
|
|
|
|
public:
|
|
static char ID;
|
|
|
|
// Constructs the pass, forwarding its static ID to the MachineFunctionPass
// base class.
AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
|
|
|
|
// Groups the given (position, instruction) producers into as few
// SingleUseInstruction objects as possible, in the order supplied, and then
// emits one instruction per group using SII.
void insertSingleUseInstructions(
    ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const {
  SmallVector<SingleUseInstruction> Pending;

  for (const auto &Entry : SingleUseProducers) {
    const unsigned Position = Entry.first;
    MachineInstr *const Producer = Entry.second;

    // Prefer extending the most recent group; open a new one only when there
    // is none yet or the producer cannot be encoded into it.
    const bool Merged =
        !Pending.empty() && Pending.back().tryAddProducer(Position, Producer);
    if (!Merged)
      Pending.emplace_back(Position, Producer);
  }

  for (SingleUseInstruction &Hint : Pending)
    Hint.emit(SII);
}
|
|
|
|
// Walks every basic block bottom-up, counting reads of each register unit,
// and collects VALU instructions whose defined register units are all read
// at most once. The collected producers are then handed to
// insertSingleUseInstructions(). Returns true if any producer was found.
bool runOnMachineFunction(MachineFunction &MF) override {
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  if (!Subtarget.hasVGPRSingleUseHintInsts())
    return false;

  SII = Subtarget.getInstrInfo();
  const auto *TRI = &SII->getRegisterInfo();
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    // Number of reads seen so far (scanning backwards) per register unit.
    DenseMap<MCRegUnit, unsigned> UseCounts;

    // Register units that are live out of the block are assumed to have
    // further uses later on, so they start out as "used more than once".
    for (const auto &LiveOut : MBB.liveouts()) {
      for (MCRegUnitMaskIterator UnitIt(LiveOut.PhysReg, TRI);
           UnitIt.isValid(); ++UnitIt) {
        const auto [Unit, Mask] = *UnitIt;
        if (!(Mask & LiveOut.LaneMask).any())
          continue;
        UseCounts[Unit] = 2;
      }
    }

    // (position, instruction) for each single-use producer; positions count
    // VALU instructions from the end of the block.
    SmallVector<std::pair<unsigned, MachineInstr *>> Producers;
    unsigned VALUIndex = 0;

    for (MachineInstr &MI : reverse(MBB.instrs())) {
      // An instruction qualifies only if none of the register units it
      // defines has been read more than once below it.
      bool DefsAreSingleUse = true;
      for (const auto &Def : MI.all_defs()) {
        for (const MCRegUnit Unit : TRI->regunits(Def.getReg())) {
          if (UseCounts.lookup(Unit) > 1)
            DefsAreSingleUse = false;
          // The value defined here is not live above this point, so its
          // read count starts afresh.
          UseCounts.erase(Unit);
        }
      }

      // Collect the distinct register units read by this instruction first,
      // so a unit appearing in several operands is only counted once.
      SmallVector<MCRegUnit> UnitsRead;
      for (const auto &Use : MI.all_uses())
        for (const MCRegUnit Unit : TRI->regunits(Use.getReg()))
          if (!is_contained(UnitsRead, Unit))
            UnitsRead.push_back(Unit);
      for (const MCRegUnit Unit : UnitsRead)
        ++UseCounts[Unit];

      // Do not attempt to optimise across exec mask changes: mark everything
      // currently tracked as having multiple uses.
      if (MI.modifiesRegister(AMDGPU::EXEC, TRI))
        for (auto &Entry : UseCounts)
          Entry.second = 2;

      // Only VALU instructions are candidates and only they advance the
      // position counter.
      if (SIInstrInfo::isVALU(MI)) {
        if (DefsAreSingleUse) {
          Producers.push_back({VALUIndex, &MI});
          Changed = true;
        }
        ++VALUIndex;
      }
    }

    insertSingleUseInstructions(Producers);
  }

  return Changed;
}
|
|
};
|
|
} // namespace
|
|
|
|
char AMDGPUInsertSingleUseVDST::ID = 0;
|
|
|
|
char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;
|
|
|
|
INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
|
|
"AMDGPU Insert SingleUseVDST", false, false)
|