[AMDGPU] Instruction Type Pipeline

This patch implements a DAG mutation which adds edges between different groups of instructions. The purpose is to try to generate code that conforms to a pipeline (groupA instructions occur before groupB instructions, groupB before groupC, and so on). Currently the pipeline order is hardcoded as VMEM->DSRead->MFMA->DSWrite, but the patch was designed to be easily extensible. Alias analysis is problematic for pipelining, as memory instructions usually cannot be reordered with respect to one another.

Differential Revision: https://reviews.llvm.org/D125997
This commit is contained in:
jeff
2022-05-16 11:13:20 -07:00
committed by jef
parent a0ef52cc10
commit 2e61dfb124
8 changed files with 411 additions and 607 deletions

View File

@@ -1,173 +0,0 @@
//===--- AMDGPUMFMAClustering.cpp - AMDGPU MFMA Clustering -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains a DAG scheduling mutation to cluster MFMA
/// instructions.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUMFMAClustering.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-mfma-clustering"
namespace {
// Gates creation of the MFMA clustering mutation in
// createMFMAClusterDAGMutation(); disabled unless explicitly requested.
static cl::opt<bool> EnableMFMACluster("amdgpu-mfma-cluster",
cl::desc("Enable MFMA clustering"),
cl::init(false));
// Upper bound on how many MFMAs clusterNeighboringMFMAs() chains into a
// single cluster (defaults to 5).
static cl::opt<unsigned>
MaxMFMAClusterSize("amdgpu-mfma-cluster-size", cl::init(5), cl::Hidden,
cl::desc("The maximum number of MFMA instructions to "
"attempt to cluster together."));
/// Scheduling DAG mutation that chains independent MFMA instructions into
/// clusters. All work happens in apply().
class MFMAClusterDAGMutation : public ScheduleDAGMutation {
  // Both pointers are assigned by apply(); they start out null so that a
  // default-initialized mutation never carries indeterminate values.
  const SIInstrInfo *TII = nullptr;
  ScheduleDAGMI *DAG = nullptr;

public:
  MFMAClusterDAGMutation() = default;

  /// Entry point invoked with the DAG to mutate.
  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};
/// Appends every MFMA scheduling unit of \p DAG to \p MFMASUnits and leaves
/// the vector ordered by ascending NodeNum.
///
/// The AGPR read/write moves are classified as MAI instructions by \p TII but
/// are not MFMAs, so they are filtered out.
static void collectMFMASUnits(SmallVectorImpl<SUnit *> &MFMASUnits,
                              const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) {
  for (SUnit &Unit : DAG->SUnits) {
    MachineInstr &Inst = *Unit.getInstr();
    const bool IsAccMove =
        Inst.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
        Inst.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
    if (TII->isMAI(Inst) && !IsAccMove) {
      MFMASUnits.push_back(&Unit);
      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(Unit););
    }
  }

  // Sorting the MFMAs in NodeNum order results in a good clustering order
  auto ByNodeNum = [](SUnit *Lhs, SUnit *Rhs) {
    return Lhs->NodeNum < Rhs->NodeNum;
  };
  std::sort(MFMASUnits.begin(), MFMASUnits.end(), ByNodeNum);
}
/// Copies the aggregated dependencies of a cluster onto each of its members.
///
/// \param SUnit2ClusterInfo maps a member's NodeNum to its cluster number.
/// \param ClusterPreds      combined predecessor edges of the cluster.
/// \param ClusterSuccs      combined successor edges of the cluster.
/// \param ClusterNum        the cluster whose members are updated.
/// \param DAG               the DAG that receives the artificial edges.
static void propagateDeps(DenseMap<unsigned, unsigned> &SUnit2ClusterInfo,
                          llvm::ArrayRef<SDep> ClusterPreds,
                          llvm::ArrayRef<SDep> ClusterSuccs,
                          unsigned ClusterNum, ScheduleDAGInstrs *DAG) {
  for (auto Entry : SUnit2ClusterInfo) {
    // Only add the combined deps to members of the current cluster
    if (Entry.second == ClusterNum) {
      LLVM_DEBUG(dbgs() << "Copying Deps To SU(" << Entry.first << ")\n");
      SUnit *Member = &DAG->SUnits[Entry.first];

      // Every successor of the cluster must come after this member.
      for (const SDep &SuccDep : ClusterSuccs) {
        SUnit *SuccSU = SuccDep.getSUnit();
        LLVM_DEBUG(dbgs() << "Copying Succ SU(" << SuccSU->NodeNum
                          << ")\n");
        DAG->addEdge(SuccSU, SDep(Member, SDep::Artificial));
      }

      // Every predecessor of the cluster must come before this member,
      // except the unit whose NodeNum names the cluster.
      for (const SDep &PredDep : ClusterPreds) {
        SUnit *PredSU = PredDep.getSUnit();
        LLVM_DEBUG(dbgs() << "Copying Pred SU(" << PredSU->NodeNum
                          << ")\n");
        if (PredSU->NodeNum != ClusterNum)
          DAG->addEdge(Member, SDep(PredSU, SDep::Artificial));
      }
    }
  }
}
// Builds clusters out of the MFMA units in MFMASUnits. Each unit that is not
// yet in a cluster becomes the base of a new chain; later units that are
// unclustered and not reachable to/from the base are linked to the chain's
// current tail with a Cluster edge plus an Artificial ordering edge, up to
// MaxMFMAClusterSize members. Afterwards the aggregated preds/succs are
// copied onto the cluster members by propagateDeps().
//
// The loop bound `End - 1` is unsigned, so this relies on MFMASUnits being
// non-empty; the only caller visible here returns early when it holds fewer
// than two units.
static void clusterNeighboringMFMAs(llvm::ArrayRef<SUnit *> MFMASUnits,
ScheduleDAGInstrs *DAG) {
// Maps the NodeNum of a clustered unit to its cluster number, which is the
// NodeNum of that cluster's base unit.
DenseMap<unsigned, unsigned> SUnit2ClusterInfo;
for (unsigned Idx = 0, End = MFMASUnits.size(); Idx < (End - 1); ++Idx) {
if (SUnit2ClusterInfo.count(MFMASUnits[Idx]->NodeNum))
continue; // We don't want to cluster against a different cluster
// MFMAOpa is the current tail of the chain; it starts at the base.
auto MFMAOpa = MFMASUnits[Idx];
auto ClusterBase = MFMAOpa;
unsigned ClusterNum = ClusterBase->NodeNum;
SmallVector<SDep, 4> ClusterSuccs(MFMAOpa->Succs);
SmallVector<SDep, 4> ClusterPreds(MFMAOpa->Preds);
unsigned NextIdx = Idx + 1;
unsigned ClusterSize = 1;
// Attempt to cluster all the remaining MFMASunits in a chain
// starting at ClusterBase/MFMAOpa.
for (; NextIdx < End; ++NextIdx) {
if (ClusterSize >= MaxMFMAClusterSize || NextIdx >= End)
break;
// Only add independent MFMAs that have not been previously clustered
if (SUnit2ClusterInfo.count(MFMASUnits[NextIdx]->NodeNum) ||
DAG->IsReachable(MFMASUnits[NextIdx], ClusterBase) ||
DAG->IsReachable(ClusterBase, MFMASUnits[NextIdx]))
continue;
auto MFMAOpb = MFMASUnits[NextIdx];
// Aggregate the cluster inst dependencies for dep propagation
// NOTE(review): the deps are appended before the cluster edge below is
// known to succeed, so a rejected candidate's deps are still propagated
// to the cluster -- confirm this is intended.
ClusterPreds.append(MFMAOpb->Preds);
ClusterSuccs.append(MFMAOpb->Succs);
if (!DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Cluster)))
continue;
// Enforce ordering to ensure root/leaf of cluster chain gets
// scheduled first/last
DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Artificial));
LLVM_DEBUG(dbgs() << "Cluster MFMA SU(" << MFMAOpa->NodeNum << ") - SU("
<< MFMAOpb->NodeNum << ")\n");
SUnit2ClusterInfo[MFMAOpb->NodeNum] = ClusterNum;
SUnit2ClusterInfo[MFMAOpa->NodeNum] = ClusterNum;
++ClusterSize;
// The newly linked unit becomes the tail for the next candidate.
MFMAOpa = MFMAOpb;
}
propagateDeps(SUnit2ClusterInfo, ClusterPreds, ClusterSuccs, ClusterNum,
DAG);
}
}
/// Clusters the MFMA instructions of \p DAGInstrs. Does nothing when the
/// subtarget has no MAI instructions, when no scheduling model is available,
/// when the DAG is empty, or when fewer than two MFMAs are found.
void MFMAClusterDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const GCNSubtarget &Subtarget = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = Subtarget.getInstrInfo();
  if (!Subtarget.hasMAIInsts())
    return;

  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  if (!DAGInstrs->getSchedModel() || DAG->SUnits.empty())
    return;

  SmallVector<SUnit *, 32> MFMASUnits;
  collectMFMASUnits(MFMASUnits, TII, DAG);

  // A cluster needs at least two members.
  if (MFMASUnits.size() >= 2)
    clusterNeighboringMFMAs(MFMASUnits, DAG);
}
} // namespace
namespace llvm {
std::unique_ptr<ScheduleDAGMutation> createMFMAClusterDAGMutation() {
return EnableMFMACluster ? std::make_unique<MFMAClusterDAGMutation>()
: nullptr;
}
} // end namespace llvm

View File

@@ -0,0 +1,219 @@
//===--- AMDGPUMFMAIGroupLP.cpp - AMDGPU MFMA IGroupLP ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains a DAG scheduling mutation which tries to coerce
/// the scheduler into generating an ordering based on ordering of groups
/// of instructions.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUMFMAIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-MFMA-IGroupLP"
namespace {
// Gates creation of the mutation in createMFMAIGroupLPDAGMutation();
// disabled unless explicitly requested.
static cl::opt<bool>
EnableMFMAIGroupLP("amdgpu-mfma-igrouplp",
cl::desc("Enable construction of Instruction Groups and "
"their ordering for scheduling"),
cl::init(false));
// Per-group size caps. Each defaults to -1; InstructionClass::IsFull()
// treats any value <= 0 as "no limit".
static cl::opt<int>
VMEMGroupMaxSize("amdgpu-mfma-igrouplp-vmem-group-size", cl::init(-1),
cl::Hidden,
cl::desc("The maximum number of instructions to include "
"in VMEM group."));
static cl::opt<int>
MFMAGroupMaxSize("amdgpu-mfma-igrouplp-mfma-group-size", cl::init(-1),
cl::Hidden,
cl::desc("The maximum number of instructions to include "
"in MFMA group."));
static cl::opt<int>
LDRGroupMaxSize("amdgpu-mfma-igrouplp-ldr-group-size", cl::init(-1),
cl::Hidden,
cl::desc("The maximum number of instructions to include "
"in lds/gds read group."));
static cl::opt<int>
LDWGroupMaxSize("amdgpu-mfma-igrouplp-ldw-group-size", cl::init(-1),
cl::Hidden,
cl::desc("The maximum number of instructions to include "
"in lds/gds write group."));
/// Predicate deciding whether a machine instruction belongs to a group.
///
/// function_ref does not own the callable it refers to: whatever is bound to
/// an IsInstructionType must stay alive for as long as the predicate may be
/// invoked.
using IsInstructionType = function_ref<bool(const MachineInstr &)>;

/// One stage of the instruction pipeline: a membership predicate, an optional
/// size cap, and the scheduling units collected for the stage so far.
struct InstructionClass {
  /// Units assigned to this stage, in the order they were collected.
  SmallVector<SUnit *, 32> Collection;

  /// Returns true for instructions that belong to this stage.
  const IsInstructionType isInstructionClass;

  // MaxSize is initialized to -1 by default. If MaxSize is <= 0 the
  // collection has no size limit (note that 0 also means "unlimited").
  const int MaxSize;

  InstructionClass(IsInstructionType IsInstructionClass, int maxSize)
      : isInstructionClass(IsInstructionClass), MaxSize(maxSize) {}

  /// True once a positive MaxSize has been reached; never true for an
  /// unlimited stage.
  bool IsFull() const {
    return MaxSize > 0 && static_cast<int>(Collection.size()) >= MaxSize;
  }
};
/// Scheduling DAG mutation that groups instructions by type and adds
/// artificial edges so that the groups are scheduled in pipeline order.
class MFMAIGroupLPDAGMutation : public ScheduleDAGMutation {
public:
  // Both pointers are assigned by apply(); they start out null so that a
  // default-initialized mutation never carries indeterminate values.
  const SIInstrInfo *TII = nullptr;
  ScheduleDAGMI *DAG = nullptr;

  MFMAIGroupLPDAGMutation() = default;

  /// Entry point invoked with the DAG to mutate.
  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};
/// Distributes the scheduling units of \p DAG over the stages listed in
/// \p PipelineOrder.
///
/// A plain instruction is appended to every stage whose predicate accepts it
/// and which is not full. A BUNDLE is appended (as a single unit) to the
/// first non-full stage that accepts its first bundled instruction, but only
/// if every instruction inside the bundle is accepted by that same stage.
///
/// \param PipelineOrder stages to fill; their Collection vectors are extended.
/// \param TII           target instruction info (not used directly here).
/// \param DAG           the DAG whose SUnits are classified.
static void collectSUnits(SmallVectorImpl<InstructionClass *> &PipelineOrder,
                          const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
    MachineInstr *MI = SU.getInstr();

    // Presently, a bundle only counts as one instruction towards
    // the group's maximum size
    if (MI->getOpcode() == TargetOpcode::BUNDLE) {
      // Step from the BUNDLE header to the first bundled instruction.
      MachineBasicBlock::instr_iterator BundledMI = MI->getIterator();
      ++BundledMI;
      LLVM_DEBUG(dbgs() << "Checking bundled insts\n";);

      // The first bundled instruction selects the candidate stage.
      InstructionClass *MatchingStage = nullptr;
      for (InstructionClass *Stage : PipelineOrder) {
        if (Stage->isInstructionClass(*BundledMI) && !Stage->IsFull()) {
          MatchingStage = Stage;
          break;
        }
      }

      if (MatchingStage != nullptr) {
        // Check the remaining bundled instructions. Reaching the end of the
        // bundle is not enough: the last instruction has to match as well,
        // otherwise a bundle such as [A, A, B] would be accepted as "all A".
        bool AllSameType = true;
        while (BundledMI->isBundledWithSucc()) {
          ++BundledMI;
          if (!MatchingStage->isInstructionClass(*BundledMI)) {
            AllSameType = false;
            break;
          }
        }

        if (AllSameType) {
          LLVM_DEBUG(dbgs() << "Bundle is all of same type\n";);
          MatchingStage->Collection.push_back(&SU);
        }
      }

      // The bundle has been classified through its contents; do not also
      // classify (and possibly double count) the BUNDLE header itself.
      continue;
    }

    for (InstructionClass *Stage : PipelineOrder) {
      if (Stage->isInstructionClass(*MI) && !Stage->IsFull())
        Stage->Collection.push_back(&SU);
    }
  }
}
/// Adds artificial edges so that every unit of an earlier stage in
/// \p PipelineOrder precedes every unit of each later stage, wherever the
/// DAG reports that the edge can be added.
///
/// \param PipelineOrder stages in the desired execution order.
/// \param DAG           the DAG that receives the edges.
static void
addPipelineEdges(const llvm::ArrayRef<InstructionClass *> PipelineOrder,
                 ScheduleDAGInstrs *DAG) {
  const unsigned NumStages = PipelineOrder.size();
  for (unsigned I = 0; I + 1 < NumStages; ++I) {
    InstructionClass *StageA = PipelineOrder[I];
    for (unsigned J = I + 1; J < NumStages; ++J) {
      InstructionClass *StageB = PipelineOrder[J];
      for (SUnit *SUnitA : StageA->Collection) {
        LLVM_DEBUG(dbgs() << "Adding edges for: "; DAG->dumpNode(*SUnitA););
        for (SUnit *SUnitB : StageB->Collection) {
          // A unit accepted by two stage predicates (for example a DS
          // instruction that both loads and stores) is present in both
          // collections; it must never be made to depend on itself.
          if (SUnitA == SUnitB)
            continue;

          if (DAG->canAddEdge(SUnitB, SUnitA)) {
            DAG->addEdge(SUnitB, SDep(SUnitA, SDep::Artificial));
            LLVM_DEBUG(dbgs() << "Added edge to: "; DAG->dumpNode(*SUnitB););
          } else {
            LLVM_DEBUG(dbgs() << "Can't add edge to: ";
                       DAG->dumpNode(*SUnitB););
          }
        }
      }
    }
  }
}
/// Builds the VMEM-read, DS-read, MFMA and DS-write instruction groups for
/// \p DAGInstrs and adds the edges that order them. Does nothing when the
/// subtarget has no MAI instructions, when no scheduling model is available,
/// or when the DAG is empty.
void MFMAIGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  if (!ST.hasMAIInsts())
    return;

  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAG->SUnits.empty())
    return;

  // The predicates are kept as named lambda objects. IsInstructionType is a
  // non-owning function_ref, so binding it directly to a lambda expression
  // would leave it referring to a temporary that is destroyed at the end of
  // the declaration. Each lambda below is declared before, and therefore
  // outlives, the InstructionClass that refers to it.
  const auto isMFMAFn = [this](const MachineInstr &MI) {
    // The AGPR read/write moves are MAI instructions but not MFMAs.
    if (TII->isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
        MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64) {
      LLVM_DEBUG(dbgs() << "Found MFMA\n";);
      return true;
    }
    return false;
  };
  InstructionClass MFMASUnits(isMFMAFn, MFMAGroupMaxSize);

  const auto isVMEMReadFn = [this](const MachineInstr &MI) {
    if (((TII->isFLAT(MI) && !TII->isDS(MI)) || TII->isVMEM(MI)) &&
        MI.mayLoad()) {
      LLVM_DEBUG(dbgs() << "Found VMEM read\n";);
      return true;
    }
    return false;
  };
  InstructionClass VMEMReadSUnits(isVMEMReadFn, VMEMGroupMaxSize);

  const auto isDSWriteFn = [this](const MachineInstr &MI) {
    if (TII->isDS(MI) && MI.mayStore()) {
      LLVM_DEBUG(dbgs() << "Found DS Write\n";);
      return true;
    }
    return false;
  };
  InstructionClass DSWriteSUnits(isDSWriteFn, LDWGroupMaxSize);

  const auto isDSReadFn = [this](const MachineInstr &MI) {
    if (TII->isDS(MI) && MI.mayLoad()) {
      LLVM_DEBUG(dbgs() << "Found DS Read\n";);
      return true;
    }
    return false;
  };
  InstructionClass DSReadSUnits(isDSReadFn, LDRGroupMaxSize);

  // The order of InstructionClasses in this vector defines the
  // order in which edges will be added. In other words, given the
  // present ordering, we will try to make each VMEMRead instruction
  // a predecessor of each DSRead instruction, and so on.
  SmallVector<InstructionClass *, 4> PipelineOrder = {
      &VMEMReadSUnits, &DSReadSUnits, &MFMASUnits, &DSWriteSUnits};

  collectSUnits(PipelineOrder, TII, DAG);
  addPipelineEdges(PipelineOrder, DAG);
}
} // namespace
namespace llvm {
std::unique_ptr<ScheduleDAGMutation> createMFMAIGroupLPDAGMutation() {
return EnableMFMAIGroupLP ? std::make_unique<MFMAIGroupLPDAGMutation>()
: nullptr;
}
} // end namespace llvm

View File

@@ -1,4 +1,4 @@
//===- AMDGPUMFMAClustering.h - AMDGPU MFMA Clustering ------*- C++ -*-===//
//===- AMDGPUMFMAIGroupLP.h - AMDGPU MFMA IGroupLP --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,16 +6,16 @@
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include <memory>
namespace llvm {
std::unique_ptr<ScheduleDAGMutation> createMFMAClusterDAGMutation();
std::unique_ptr<ScheduleDAGMutation> createMFMAIGroupLPDAGMutation();
} // namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H

View File

@@ -16,7 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUMFMAClustering.h"
#include "AMDGPUMFMAIGroupLP.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
@@ -399,7 +399,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createMFMAClusterDAGMutation());
DAG->addMutation(createMFMAIGroupLPDAGMutation());
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@@ -881,7 +881,7 @@ public:
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
DAG->addMutation(createMFMAClusterDAGMutation());
DAG->addMutation(createMFMAIGroupLPDAGMutation());
return DAG;
}

View File

@@ -75,7 +75,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMachineModuleInfo.cpp
AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
AMDGPUMFMAClustering.cpp
AMDGPUMFMAIGroupLP.cpp
AMDGPUMIRFormatter.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
AMDGPUPerfHintAnalysis.cpp

View File

@@ -1,71 +0,0 @@
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering 2>&1 | FileCheck -check-prefix=PRERA %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 --debug-only=amdgpu-mfma-clustering 2>&1 | FileCheck -check-prefix=TWOLIMIT %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering 2>&1| FileCheck -check-prefix=POSTRA %s
# REQUIRES: asserts
# PRERA: Cluster MFMA SU(2) - SU(6)
# PRERA-NEXT: Cluster MFMA SU(6) - SU(10)
# PRERA-NEXT: Cluster MFMA SU(10) - SU(12)
# TWOLIMIT: Cluster MFMA SU(2) - SU(6)
# TWOLIMIT: Cluster MFMA SU(10) - SU(11)
# POSTRA: Cluster MFMA SU(2) - SU(6)
# POSTRA-NEXT: Cluster MFMA SU(6) - SU(10)
# POSTRA-NEXT: Cluster MFMA SU(10) - SU(12)
---
name: basic_cluster
tracksRegLiveness: true
body: |
bb.0:
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
$vgpr4 = V_MOV_B32_e32 1, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr5 = V_MOV_B32_e32 1, implicit $exec
$vgpr6 = V_MOV_B32_e32 1, implicit $exec
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
...
# PRERA: Cluster MFMA SU(12) - SU(16)
# PRERA-NEXT: Cluster MFMA SU(16) - SU(20)
# POSTRA: Cluster MFMA SU(12) - SU(16)
# POSTRA-NEXT: Cluster MFMA SU(16) - SU(20)
---
name: complex_cluster
tracksRegLiveness: true
body: |
bb.0:
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
$vgpr8 = V_MOV_B32_e32 0, implicit $exec
$vgpr9 = V_MOV_B32_e32 9, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
$vgpr4 = V_MOV_B32_e32 1, implicit $exec
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
$vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
$vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr5 = V_MOV_B32_e32 1, implicit $exec
$vgpr6 = V_MOV_B32_e32 1, implicit $exec
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
...

View File

@@ -1,354 +0,0 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=PRERA %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=BOTHSCHEDPASS %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 2>&1 | FileCheck -check-prefix=TWOLIMIT %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-mfma-cluster=1 2>&1| FileCheck -check-prefix=POSTRA %s
---
name: no_cluster
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0, $vgpr10_vgpr11
; PRERA-LABEL: name: no_cluster
; PRERA: liveins: $sgpr0, $vgpr10_vgpr11
; PRERA-NEXT: {{ $}}
; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; PRERA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; PRERA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
; DEFAULT-LABEL: name: no_cluster
; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11
; DEFAULT-NEXT: {{ $}}
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
; BOTHSCHEDPASS-LABEL: name: no_cluster
; BOTHSCHEDPASS: liveins: $sgpr0, $vgpr10_vgpr11
; BOTHSCHEDPASS-NEXT: {{ $}}
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
; TWOLIMIT-LABEL: name: no_cluster
; TWOLIMIT: liveins: $sgpr0, $vgpr10_vgpr11
; TWOLIMIT-NEXT: {{ $}}
; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; TWOLIMIT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
; POSTRA-LABEL: name: no_cluster
; POSTRA: liveins: $sgpr0, $vgpr10_vgpr11
; POSTRA-NEXT: {{ $}}
; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
; POSTRA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
$vgpr8 = V_MOV_B32_e32 0, implicit $exec
$vgpr9 = V_MOV_B32_e32 9, implicit $exec
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
$vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
$vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
...
---
name: basic_cluster
tracksRegLiveness: true
body: |
bb.0:
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
; PRERA-LABEL: name: basic_cluster
; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
; PRERA-NEXT: {{ $}}
; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-LABEL: name: basic_cluster
; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
; DEFAULT-NEXT: {{ $}}
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-LABEL: name: basic_cluster
; BOTHSCHEDPASS: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
; BOTHSCHEDPASS-NEXT: {{ $}}
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-LABEL: name: basic_cluster
; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
; TWOLIMIT-NEXT: {{ $}}
; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-LABEL: name: basic_cluster
; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
; POSTRA-NEXT: {{ $}}
; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
$vgpr4 = V_MOV_B32_e32 1, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr5 = V_MOV_B32_e32 1, implicit $exec
$vgpr6 = V_MOV_B32_e32 1, implicit $exec
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
...
---
# complex_cluster: five V_MFMA_F32_4X4X1F32 instructions interleaved with
# V_MOV/VALU ops, one GLOBAL_STORE_DWORD, one DS_READ_U16 and one
# GLOBAL_LOAD_USHORT. The expected schedule is pinned separately for the
# PRERA, DEFAULT, BOTHSCHEDPASS, TWOLIMIT and POSTRA check prefixes; the RUN
# lines that define those prefixes are not part of this section.
# NOTE(review): the check lines look like update_mir_test_checks.py output, so
# presumably they should be regenerated rather than edited by hand -- confirm
# against the file header.
name: complex_cluster
tracksRegLiveness: true
body: |
bb.0:
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
; PRERA-LABEL: name: complex_cluster
; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
; PRERA-NEXT: {{ $}}
; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; PRERA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; DEFAULT-LABEL: name: complex_cluster
; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
; DEFAULT-NEXT: {{ $}}
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; BOTHSCHEDPASS-LABEL: name: complex_cluster
; BOTHSCHEDPASS: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $vgpr10_vgpr11
; BOTHSCHEDPASS-NEXT: {{ $}}
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
; TWOLIMIT-LABEL: name: complex_cluster
; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
; TWOLIMIT-NEXT: {{ $}}
; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; POSTRA-LABEL: name: complex_cluster
; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
; POSTRA-NEXT: {{ $}}
; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
; POSTRA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
$vgpr8 = V_MOV_B32_e32 0, implicit $exec
$vgpr9 = V_MOV_B32_e32 9, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
$vgpr4 = V_MOV_B32_e32 1, implicit $exec
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
$vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
$vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr5 = V_MOV_B32_e32 1, implicit $exec
$vgpr6 = V_MOV_B32_e32 1, implicit $exec
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
...

View File

@@ -0,0 +1,183 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-mfma-igrouplp=1 2>&1 | FileCheck -check-prefix=PIPELINE %s
---
# no_pipeline: negative test. The body contains no V_MFMA instruction (only
# V_MOV/VALU ops, one GLOBAL_STORE_DWORD and one DS_READ_U16), and the DEFAULT
# and PIPELINE expectations below are identical, i.e. the schedule checked
# under the PIPELINE prefix is expected to match the default one here.
name: no_pipeline
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0, $vgpr10_vgpr11
; DEFAULT-LABEL: name: no_pipeline
; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11
; DEFAULT-NEXT: {{ $}}
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
; DEFAULT-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; PIPELINE-LABEL: name: no_pipeline
; PIPELINE: liveins: $sgpr0, $vgpr10_vgpr11
; PIPELINE-NEXT: {{ $}}
; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
; PIPELINE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; PIPELINE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
; PIPELINE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
; PIPELINE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
; PIPELINE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
$vgpr8 = V_MOV_B32_e32 0, implicit $exec
$vgpr9 = V_MOV_B32_e32 9, implicit $exec
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
...
---
# full_pipe: mixes GLOBAL_LOAD_USHORT, DS_READ_U16, V_MFMA and DS_WRITE_B32
# instructions (six MFMAs, four DS_WRITEs) with V_MOV/VALU ops.
#   DEFAULT  - the six MFMAs stay interleaved with VALU ops and DS_WRITEs.
#   PIPELINE - the six MFMAs are emitted back to back after the DS_READ and
#              GLOBAL_LOAD bundles; three of the four DS_WRITEs follow the
#              MFMA run, while the first DS_WRITE stays ahead of it.
# NOTE(review): every DS_WRITE_B32 reads `implicit $m0`, but $m0 is neither in
# liveins nor defined in this body; likewise $vgpr10_vgpr11 is listed as
# live-in although $vgpr10/$vgpr11 are only written (DS_READ) before use.
# This may matter if the test is ever run with the machine verifier enabled --
# confirm before relying on it.
name: full_pipe
tracksRegLiveness: true
body: |
bb.0:
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $sgpr0, $vgpr10_vgpr11
; DEFAULT-LABEL: name: full_pipe
; DEFAULT: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11
; DEFAULT-NEXT: {{ $}}
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec
; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec
; DEFAULT-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec
; DEFAULT-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec
; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec
; DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec {
; DEFAULT-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
; DEFAULT-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
; DEFAULT-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
; DEFAULT-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
; DEFAULT-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
; DEFAULT-NEXT: }
; DEFAULT-NEXT: DS_WRITE_B32 $vgpr3, killed $vgpr1, 0, 16, implicit $m0, implicit $exec
; DEFAULT-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec {
; DEFAULT-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec
; DEFAULT-NEXT: }
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec
; DEFAULT-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec
; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec
; PIPELINE-LABEL: name: full_pipe
; PIPELINE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11
; PIPELINE-NEXT: {{ $}}
; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec
; PIPELINE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec
; PIPELINE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; PIPELINE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
; PIPELINE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec
; PIPELINE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec
; PIPELINE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
; PIPELINE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec
; PIPELINE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec
; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
; PIPELINE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec
; PIPELINE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; PIPELINE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; PIPELINE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec
; PIPELINE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec
; PIPELINE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
; PIPELINE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec
; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec {
; PIPELINE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
; PIPELINE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
; PIPELINE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
; PIPELINE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
; PIPELINE-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
; PIPELINE-NEXT: }
; PIPELINE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec
; PIPELINE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec {
; PIPELINE-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
; PIPELINE-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec
; PIPELINE-NEXT: }
; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; PIPELINE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; PIPELINE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
; PIPELINE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; PIPELINE-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit killed $vgpr3 {
; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec
; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec
; PIPELINE-NEXT: }
; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
$vgpr2 = V_MOV_B32_e32 2, implicit $exec
$vgpr3 = V_MOV_B32_e32 3, implicit $exec
$vgpr4 = V_MOV_B32_e32 4, implicit $exec
$vgpr5 = V_MOV_B32_e32 5, implicit $exec
$vgpr30 = V_MOV_B32_e32 30, implicit $exec
$vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
$vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
$vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
$vgpr9 = V_MOV_B32_e32 1, implicit $exec
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr24 = V_MOV_B32_e32 1, implicit $exec
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
$vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
$vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
$vgpr26 = V_MOV_B32_e32 1, implicit $exec
$vgpr27 = V_MOV_B32_e32 1, implicit $exec
$vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr21 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
$vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
$vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec
$vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
$vgpr17 = V_MOV_B32_e32 1, implicit $exec
$vgpr18 = V_MOV_B32_e32 1, implicit $exec
$vgpr20 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
DS_WRITE_B32 $vgpr0, $vgpr7, 0, 16, implicit $m0, implicit $exec
$agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 $vgpr10, $vgpr11, $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
DS_WRITE_B32 $vgpr23, $vgpr3, 0, 16, implicit $m0, implicit $exec
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
DS_WRITE_B32 $vgpr9, $vgpr24, 0, 16, implicit $m0, implicit $exec
...