[AMDGPU] Instruction Type Pipeline
This patch implements a DAG mutation which adds edges between different groups of instructions. The purpose is to try to generate code that conforms to a pipeline (groupA instructions occur before groupB instructions, groupB before groupC, and so on). Currently the pipeline order is hardcoded as VMEM->DSRead->MFMA->DSWrite, but the patch was designed to be easily extensible. Alias analysis is problematic for pipelining, as memory instructions usually cannot be reordered w.r.t. one another. Differential Revision: https://reviews.llvm.org/D125997
This commit is contained in:
@@ -1,173 +0,0 @@
|
||||
//===--- AMDGPUMFMAClustering.cpp - AMDGPU MFMA Clustering -----------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
/// \file This file contains a DAG scheduling mutation to cluster MFMA
|
||||
/// instructions.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPUMFMAClustering.h"
|
||||
#include "AMDGPUTargetMachine.h"
|
||||
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||||
#include "SIInstrInfo.h"
|
||||
#include "SIMachineFunctionInfo.h"
|
||||
#include "llvm/CodeGen/MachineScheduler.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "amdgpu-mfma-clustering"
|
||||
|
||||
namespace {
|
||||
|
||||
// Master switch for the MFMA clustering mutation; it is off unless the flag
// is passed explicitly.
static cl::opt<bool> EnableMFMACluster("amdgpu-mfma-cluster", cl::init(false),
                                       cl::desc("Enable MFMA clustering"));

// Upper bound on the number of MFMAs chained into a single cluster.
static cl::opt<unsigned> MaxMFMAClusterSize(
    "amdgpu-mfma-cluster-size", cl::Hidden, cl::init(5),
    cl::desc("The maximum number of MFMA instructions to "
             "attempt to cluster together."));
|
||||
|
||||
class MFMAClusterDAGMutation : public ScheduleDAGMutation {
|
||||
const SIInstrInfo *TII;
|
||||
ScheduleDAGMI *DAG;
|
||||
|
||||
public:
|
||||
MFMAClusterDAGMutation() = default;
|
||||
void apply(ScheduleDAGInstrs *DAGInstrs) override;
|
||||
};
|
||||
|
||||
/// Append to \p MFMASUnits every scheduling unit of \p DAG whose instruction
/// is an MAI instruction other than V_ACCVGPR_WRITE_B32_e64 /
/// V_ACCVGPR_READ_B32_e64, then order the list by ascending NodeNum.
static void collectMFMASUnits(SmallVectorImpl<SUnit *> &MFMASUnits,
                              const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) {
  for (SUnit &Unit : DAG->SUnits) {
    MachineInstr &Inst = *Unit.getInstr();

    const bool IsMFMA = TII->isMAI(Inst) &&
                        Inst.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
                        Inst.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
    if (IsMFMA) {
      MFMASUnits.push_back(&Unit);

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(Unit););
    }
  }

  // Sorting the MFMAs in NodeNum order results in a good clustering order
  std::sort(MFMASUnits.begin(), MFMASUnits.end(),
            [](const SUnit *LHS, const SUnit *RHS) {
              return LHS->NodeNum < RHS->NodeNum;
            });
}
|
||||
|
||||
/// For every unit recorded in \p SUnit2ClusterInfo as belonging to cluster
/// \p ClusterNum, add artificial edges so that the unit becomes a predecessor
/// of each unit in \p ClusterSuccs and a successor of each unit in
/// \p ClusterPreds (except the unit whose NodeNum equals \p ClusterNum).
static void propagateDeps(DenseMap<unsigned, unsigned> &SUnit2ClusterInfo,
                          llvm::ArrayRef<SDep> ClusterPreds,
                          llvm::ArrayRef<SDep> ClusterSuccs,
                          unsigned ClusterNum, ScheduleDAGInstrs *DAG) {

  for (const auto &Entry : SUnit2ClusterInfo) {
    // Only add the combined succs to the current cluster
    if (Entry.second == ClusterNum) {
      const unsigned MemberNum = Entry.first;

      LLVM_DEBUG(dbgs() << "Copying Deps To SU(" << MemberNum << ")\n");

      for (const SDep &Succ : ClusterSuccs) {
        SUnit *SuccSU = Succ.getSUnit();
        LLVM_DEBUG(dbgs() << "Copying Succ SU(" << SuccSU->NodeNum << ")\n");
        DAG->addEdge(SuccSU, SDep(&DAG->SUnits[MemberNum], SDep::Artificial));
      }

      for (const SDep &Pred : ClusterPreds) {
        SUnit *PredSU = Pred.getSUnit();
        LLVM_DEBUG(dbgs() << "Copying Pred SU(" << PredSU->NodeNum << ")\n");
        if (PredSU->NodeNum != ClusterNum)
          DAG->addEdge(&DAG->SUnits[MemberNum],
                       SDep(PredSU, SDep::Artificial));
      }
    }
  }
}
|
||||
|
||||
/// Build chains ("clusters") out of the MFMAs in \p MFMASUnits.
///
/// Each not-yet-clustered unit becomes the base of a new cluster. Later units
/// that are neither clustered already nor connected to the base in either
/// direction are linked to the current chain tail with a Cluster edge plus an
/// Artificial edge, up to MaxMFMAClusterSize members. The predecessors and
/// successors of the visited candidates are then propagated to every member
/// of the cluster via propagateDeps().
static void clusterNeighboringMFMAs(llvm::ArrayRef<SUnit *> MFMASUnits,
                                    ScheduleDAGInstrs *DAG) {

  // Maps the NodeNum of a clustered unit to the NodeNum of its cluster base.
  DenseMap<unsigned, unsigned> SUnit2ClusterInfo;

  // `Idx + 1 < End` instead of `Idx < End - 1`: End is unsigned, so the
  // subtraction would wrap around if the list were ever empty.
  for (unsigned Idx = 0, End = MFMASUnits.size(); Idx + 1 < End; ++Idx) {
    if (SUnit2ClusterInfo.count(MFMASUnits[Idx]->NodeNum))
      continue; // We don't want to cluster against a different cluster

    SUnit *MFMAOpa = MFMASUnits[Idx];
    SUnit *ClusterBase = MFMAOpa;
    unsigned ClusterNum = ClusterBase->NodeNum;
    SmallVector<SDep, 4> ClusterSuccs(MFMAOpa->Succs);
    SmallVector<SDep, 4> ClusterPreds(MFMAOpa->Preds);
    unsigned ClusterSize = 1;

    // Attempt to cluster all the remaining MFMASunits in a chain
    // starting at ClusterBase/MFMAOpa.
    for (unsigned NextIdx = Idx + 1;
         NextIdx < End && ClusterSize < MaxMFMAClusterSize; ++NextIdx) {
      // Only add independent MFMAs that have not been previously clustered
      if (SUnit2ClusterInfo.count(MFMASUnits[NextIdx]->NodeNum) ||
          DAG->IsReachable(MFMASUnits[NextIdx], ClusterBase) ||
          DAG->IsReachable(ClusterBase, MFMASUnits[NextIdx]))
        continue;

      SUnit *MFMAOpb = MFMASUnits[NextIdx];
      // Aggregate the cluster inst dependencies for dep propagation.
      // NOTE(review): this happens before the addEdge() below is known to
      // succeed, so the deps of a rejected candidate are aggregated too.
      // The order is kept as-is: appending after addEdge() would also pick
      // up the freshly added cluster edges.
      ClusterPreds.append(MFMAOpb->Preds);
      ClusterSuccs.append(MFMAOpb->Succs);
      if (!DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Cluster)))
        continue;

      // Enforce ordering to ensure root/leaf of cluster chain gets
      // scheduled first/last
      DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Artificial));

      LLVM_DEBUG(dbgs() << "Cluster MFMA SU(" << MFMAOpa->NodeNum << ") - SU("
                        << MFMAOpb->NodeNum << ")\n");

      SUnit2ClusterInfo[MFMAOpb->NodeNum] = ClusterNum;
      SUnit2ClusterInfo[MFMAOpa->NodeNum] = ClusterNum;
      ++ClusterSize;
      MFMAOpa = MFMAOpb; // The new member becomes the tail of the chain.
    }

    propagateDeps(SUnit2ClusterInfo, ClusterPreds, ClusterSuccs, ClusterNum,
                  DAG);
  }
}
|
||||
|
||||
/// Cluster the MFMAs of \p DAGInstrs. Does nothing when the subtarget has no
/// MAI instructions, when no scheduling model is available, when the DAG is
/// empty, or when fewer than two MFMAs are found.
void MFMAClusterDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const GCNSubtarget &Subtarget = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = Subtarget.getInstrInfo();
  if (!Subtarget.hasMAIInsts())
    return;

  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const TargetSchedModel *Model = DAGInstrs->getSchedModel();
  if (Model == nullptr || DAG->SUnits.empty())
    return;

  SmallVector<SUnit *, 32> MFMASUnits;
  collectMFMASUnits(MFMASUnits, TII, DAG);

  // A cluster needs at least two members.
  if (MFMASUnits.size() >= 2)
    clusterNeighboringMFMAs(MFMASUnits, DAG);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace llvm {
|
||||
|
||||
std::unique_ptr<ScheduleDAGMutation> createMFMAClusterDAGMutation() {
|
||||
return EnableMFMACluster ? std::make_unique<MFMAClusterDAGMutation>()
|
||||
: nullptr;
|
||||
}
|
||||
|
||||
} // end namespace llvm
|
||||
219
llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp
Normal file
219
llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp
Normal file
@@ -0,0 +1,219 @@
|
||||
//===--- AMDGPUMFMAIGroupLP.cpp - AMDGPU MFMA IGroupLP ------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
/// \file This file contains a DAG scheduling mutation which tries to coerce
/// the scheduler into generating an ordering based on ordering of groups
/// of instructions.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPUMFMAIGroupLP.h"
|
||||
#include "AMDGPUTargetMachine.h"
|
||||
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||||
#include "SIInstrInfo.h"
|
||||
#include "SIMachineFunctionInfo.h"
|
||||
#include "llvm/CodeGen/MachineScheduler.h"
|
||||
#include "llvm/CodeGen/TargetOpcodes.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "amdgpu-MFMA-IGroupLP"
|
||||
|
||||
namespace {
|
||||
|
||||
// Master switch: createMFMAIGroupLPDAGMutation() only builds the mutation
// when this is set.
static cl::opt<bool> EnableMFMAIGroupLP(
    "amdgpu-mfma-igrouplp", cl::init(false),
    cl::desc("Enable construction of Instruction Groups and "
             "their ordering for scheduling"));

// Per-group size limits. Each defaults to -1, which InstructionClass::IsFull
// treats as "no limit".
static cl::opt<int> VMEMGroupMaxSize(
    "amdgpu-mfma-igrouplp-vmem-group-size", cl::Hidden, cl::init(-1),
    cl::desc("The maximum number of instructions to include "
             "in VMEM group."));

static cl::opt<int> MFMAGroupMaxSize(
    "amdgpu-mfma-igrouplp-mfma-group-size", cl::Hidden, cl::init(-1),
    cl::desc("The maximum number of instructions to include "
             "in MFMA group."));

static cl::opt<int> LDRGroupMaxSize(
    "amdgpu-mfma-igrouplp-ldr-group-size", cl::Hidden, cl::init(-1),
    cl::desc("The maximum number of instructions to include "
             "in lds/gds read group."));

static cl::opt<int> LDWGroupMaxSize(
    "amdgpu-mfma-igrouplp-ldw-group-size", cl::Hidden, cl::init(-1),
    cl::desc("The maximum number of instructions to include "
             "in lds/gds write group."));
|
||||
|
||||
typedef function_ref<bool(const MachineInstr &)> IsInstructionType;
|
||||
|
||||
struct InstructionClass {
|
||||
SmallVector<SUnit *, 32> Collection;
|
||||
const IsInstructionType isInstructionClass;
|
||||
// MaxSize is initialized to -1 by default, if MaxSize is < 0, then
|
||||
// the collection will not have a size limit
|
||||
const int MaxSize;
|
||||
|
||||
InstructionClass(IsInstructionType IsInstructionClass, int maxSize)
|
||||
: isInstructionClass(IsInstructionClass), MaxSize(maxSize){};
|
||||
|
||||
bool IsFull() { return !(MaxSize <= 0) && (int)Collection.size() >= MaxSize; }
|
||||
};
|
||||
|
||||
class MFMAIGroupLPDAGMutation : public ScheduleDAGMutation {
|
||||
public:
|
||||
const SIInstrInfo *TII;
|
||||
ScheduleDAGMI *DAG;
|
||||
|
||||
MFMAIGroupLPDAGMutation() = default;
|
||||
void apply(ScheduleDAGInstrs *DAGInstrs) override;
|
||||
};
|
||||
|
||||
/// Distribute the scheduling units of \p DAG over the stages listed in
/// \p PipelineOrder.
///
/// A BUNDLE unit is assigned to the first non-full stage whose predicate
/// accepts its first bundled instruction, and only if every instruction of
/// the bundle satisfies that same predicate. Any other unit is added to each
/// non-full stage whose predicate accepts its instruction.
///
/// \p TII is currently unused; the stage predicates carry what they need.
static void collectSUnits(SmallVectorImpl<InstructionClass *> &PipelineOrder,
                          const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));

    MachineInstr *MI = SU.getInstr();

    // Presently, a bundle only counts as one instruction towards
    // the group's maximum size
    if (MI->getOpcode() == TargetOpcode::BUNDLE) {
      // A header without a bundled successor has nothing to classify, and
      // stepping past it would leave the bundle.
      if (!MI->isBundledWithSucc())
        continue;

      MachineBasicBlock::instr_iterator BundledMI = MI->getIterator();
      ++BundledMI;

      LLVM_DEBUG(dbgs() << "Checking bundled insts\n";);

      InstructionClass *MatchingStage = nullptr;
      for (InstructionClass *Stage : PipelineOrder) {
        if (Stage->isInstructionClass(*BundledMI) && !Stage->IsFull()) {
          MatchingStage = Stage;
          break;
        }
      }

      if (MatchingStage != nullptr) {
        // Every bundled instruction, including the last one, has to match.
        // Previously the last instruction was never tested, so a bundle
        // ending in a foreign instruction was still accepted.
        bool AllMatch = true;
        while (true) {
          if (!MatchingStage->isInstructionClass(*BundledMI)) {
            AllMatch = false;
            break;
          }
          if (!BundledMI->isBundledWithSucc())
            break;
          ++BundledMI;
        }

        if (AllMatch) {
          LLVM_DEBUG(dbgs() << "Bundle is all of same type\n";);
          MatchingStage->Collection.push_back(&SU);
        }
      }

      // The bundle has been handled as a whole; do not classify the BUNDLE
      // header itself a second time below.
      continue;
    }

    // NOTE(review): there is no break here, so an instruction accepted by
    // several predicates is collected into every matching stage. Confirm
    // whether that is intended.
    for (InstructionClass *Stage : PipelineOrder) {
      if (Stage->isInstructionClass(*MI) && !Stage->IsFull()) {
        Stage->Collection.push_back(&SU);
      }
    }
  }
}
|
||||
|
||||
static void
|
||||
addPipelineEdges(const llvm::ArrayRef<InstructionClass *> PipelineOrder,
|
||||
ScheduleDAGInstrs *DAG) {
|
||||
for (int i = 0; i < (int)PipelineOrder.size() - 1; i++) {
|
||||
auto StageA = PipelineOrder[i];
|
||||
for (int j = i + 1; j < (int)PipelineOrder.size(); j++) {
|
||||
auto StageB = PipelineOrder[j];
|
||||
for (auto SUnitA : StageA->Collection) {
|
||||
LLVM_DEBUG(dbgs() << "Adding edges for: "; DAG->dumpNode(*SUnitA););
|
||||
for (auto SUnitB : StageB->Collection) {
|
||||
if (DAG->canAddEdge(SUnitB, SUnitA)) {
|
||||
DAG->addEdge(SUnitB, SDep(SUnitA, SDep::Artificial));
|
||||
LLVM_DEBUG(dbgs() << "Added edge to: "; DAG->dumpNode(*SUnitB););
|
||||
} else {
|
||||
LLVM_DEBUG(dbgs() << "Can't add edge to: ";
|
||||
DAG->dumpNode(*SUnitB););
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect the VMEM-read, DS-read, MFMA and DS-write instructions of
/// \p DAGInstrs into groups and add edges that order the groups as a
/// pipeline. Does nothing when the subtarget has no MAI instructions, when no
/// scheduling model is available, or when the DAG is empty.
void MFMAIGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  if (!ST.hasMAIInsts())
    return;
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAG->SUnits.empty())
    return;

  // The predicates are kept as named lambda objects. IsInstructionType is a
  // non-owning function_ref, so initialising it straight from a lambda
  // expression would leave it pointing at a temporary that is destroyed at
  // the end of that declaration. The named objects below live until this
  // function returns, i.e. for as long as the InstructionClass objects that
  // refer to them are in use.
  auto isMFMAFn = [this](const MachineInstr &MI) {
    if (TII->isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
        MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64) {
      LLVM_DEBUG(dbgs() << "Found MFMA\n";);
      return true;
    }
    return false;
  };
  InstructionClass MFMASUnits(isMFMAFn, MFMAGroupMaxSize);

  auto isVMEMReadFn = [this](const MachineInstr &MI) {
    if (((TII->isFLAT(MI) && !TII->isDS(MI)) || TII->isVMEM(MI)) &&
        MI.mayLoad()) {
      LLVM_DEBUG(dbgs() << "Found VMEM read\n";);
      return true;
    }
    return false;
  };
  InstructionClass VMEMReadSUnits(isVMEMReadFn, VMEMGroupMaxSize);

  auto isDSWriteFn = [this](const MachineInstr &MI) {
    if (TII->isDS(MI) && MI.mayStore()) {
      LLVM_DEBUG(dbgs() << "Found DS Write\n";);
      return true;
    }
    return false;
  };
  InstructionClass DSWriteSUnits(isDSWriteFn, LDWGroupMaxSize);

  auto isDSReadFn = [this](const MachineInstr &MI) {
    if (TII->isDS(MI) && MI.mayLoad()) {
      LLVM_DEBUG(dbgs() << "Found DS Read\n";);
      return true;
    }
    return false;
  };
  InstructionClass DSReadSUnits(isDSReadFn, LDRGroupMaxSize);

  // The order of InstructionClasses in this vector defines the
  // order in which edges will be added. In other words, given the
  // present ordering, we will try to make each VMEMRead instruction
  // a predecessor of each DSRead instruction, and so on.
  SmallVector<InstructionClass *, 4> PipelineOrder = {
      &VMEMReadSUnits, &DSReadSUnits, &MFMASUnits, &DSWriteSUnits};

  collectSUnits(PipelineOrder, TII, DAG);

  addPipelineEdges(PipelineOrder, DAG);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace llvm {
|
||||
|
||||
std::unique_ptr<ScheduleDAGMutation> createMFMAIGroupLPDAGMutation() {
|
||||
return EnableMFMAIGroupLP ? std::make_unique<MFMAIGroupLPDAGMutation>()
|
||||
: nullptr;
|
||||
}
|
||||
|
||||
} // end namespace llvm
|
||||
@@ -1,4 +1,4 @@
|
||||
//===- AMDGPUMFMAClustering.h - AMDGPU MFMA Clustering ------*- C++ -*-===//
|
||||
//===- AMDGPUMFMAIGroupLP.h - AMDGPU MFMA IGroupLP --------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
@@ -6,16 +6,16 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
|
||||
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
|
||||
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
|
||||
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
|
||||
|
||||
#include "llvm/CodeGen/ScheduleDAGMutation.h"
|
||||
#include <memory>
|
||||
|
||||
namespace llvm {
|
||||
|
||||
std::unique_ptr<ScheduleDAGMutation> createMFMAClusterDAGMutation();
|
||||
std::unique_ptr<ScheduleDAGMutation> createMFMAIGroupLPDAGMutation();
|
||||
|
||||
} // namespace llvm
|
||||
|
||||
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
|
||||
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
|
||||
@@ -16,7 +16,7 @@
|
||||
#include "AMDGPU.h"
|
||||
#include "AMDGPUAliasAnalysis.h"
|
||||
#include "AMDGPUExportClustering.h"
|
||||
#include "AMDGPUMFMAClustering.h"
|
||||
#include "AMDGPUMFMAIGroupLP.h"
|
||||
#include "AMDGPUMacroFusion.h"
|
||||
#include "AMDGPUTargetObjectFile.h"
|
||||
#include "AMDGPUTargetTransformInfo.h"
|
||||
@@ -399,7 +399,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
|
||||
ScheduleDAGMILive *DAG =
|
||||
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
|
||||
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
|
||||
DAG->addMutation(createMFMAClusterDAGMutation());
|
||||
DAG->addMutation(createMFMAIGroupLPDAGMutation());
|
||||
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
|
||||
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
|
||||
return DAG;
|
||||
@@ -881,7 +881,7 @@ public:
|
||||
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
|
||||
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
|
||||
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
|
||||
DAG->addMutation(createMFMAClusterDAGMutation());
|
||||
DAG->addMutation(createMFMAIGroupLPDAGMutation());
|
||||
return DAG;
|
||||
}
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ add_llvm_target(AMDGPUCodeGen
|
||||
AMDGPUMachineModuleInfo.cpp
|
||||
AMDGPUMacroFusion.cpp
|
||||
AMDGPUMCInstLower.cpp
|
||||
AMDGPUMFMAClustering.cpp
|
||||
AMDGPUMFMAIGroupLP.cpp
|
||||
AMDGPUMIRFormatter.cpp
|
||||
AMDGPUOpenCLEnqueuedBlockLowering.cpp
|
||||
AMDGPUPerfHintAnalysis.cpp
|
||||
|
||||
@@ -1,71 +0,0 @@
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering 2>&1 | FileCheck -check-prefix=PRERA %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 --debug-only=amdgpu-mfma-clustering 2>&1 | FileCheck -check-prefix=TWOLIMIT %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering 2>&1| FileCheck -check-prefix=POSTRA %s
|
||||
# REQUIRES: asserts
|
||||
|
||||
# PRERA: Cluster MFMA SU(2) - SU(6)
|
||||
# PRERA-NEXT: Cluster MFMA SU(6) - SU(10)
|
||||
# PRERA-NEXT: Cluster MFMA SU(10) - SU(12)
|
||||
|
||||
# TWOLIMIT: Cluster MFMA SU(2) - SU(6)
|
||||
# TWOLIMIT: Cluster MFMA SU(10) - SU(11)
|
||||
|
||||
# POSTRA: Cluster MFMA SU(2) - SU(6)
|
||||
# POSTRA-NEXT: Cluster MFMA SU(6) - SU(10)
|
||||
# POSTRA-NEXT: Cluster MFMA SU(10) - SU(12)
|
||||
|
||||
---
|
||||
name: basic_cluster
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
|
||||
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
$vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
# PRERA: Cluster MFMA SU(12) - SU(16)
|
||||
# PRERA-NEXT: Cluster MFMA SU(16) - SU(20)
|
||||
|
||||
# POSTRA: Cluster MFMA SU(12) - SU(16)
|
||||
# POSTRA-NEXT: Cluster MFMA SU(16) - SU(20)
|
||||
|
||||
---
|
||||
name: complex_cluster
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
|
||||
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
$vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
$vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
|
||||
$vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
$vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
$vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
|
||||
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
...
|
||||
@@ -1,354 +0,0 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=PRERA %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=BOTHSCHEDPASS %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 2>&1 | FileCheck -check-prefix=TWOLIMIT %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-mfma-cluster=1 2>&1| FileCheck -check-prefix=POSTRA %s
|
||||
|
||||
|
||||
---
|
||||
name: no_cluster
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $sgpr0, $vgpr10_vgpr11
|
||||
; PRERA-LABEL: name: no_cluster
|
||||
; PRERA: liveins: $sgpr0, $vgpr10_vgpr11
|
||||
; PRERA-NEXT: {{ $}}
|
||||
; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
|
||||
; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
; PRERA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
|
||||
; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; PRERA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
|
||||
; DEFAULT-LABEL: name: no_cluster
|
||||
; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11
|
||||
; DEFAULT-NEXT: {{ $}}
|
||||
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
|
||||
; BOTHSCHEDPASS-LABEL: name: no_cluster
|
||||
; BOTHSCHEDPASS: liveins: $sgpr0, $vgpr10_vgpr11
|
||||
; BOTHSCHEDPASS-NEXT: {{ $}}
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
|
||||
; TWOLIMIT-LABEL: name: no_cluster
|
||||
; TWOLIMIT: liveins: $sgpr0, $vgpr10_vgpr11
|
||||
; TWOLIMIT-NEXT: {{ $}}
|
||||
; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
|
||||
; POSTRA-LABEL: name: no_cluster
|
||||
; POSTRA: liveins: $sgpr0, $vgpr10_vgpr11
|
||||
; POSTRA-NEXT: {{ $}}
|
||||
; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
|
||||
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
$vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
|
||||
$vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
$vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
|
||||
$vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
|
||||
---
|
||||
name: basic_cluster
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
|
||||
; PRERA-LABEL: name: basic_cluster
|
||||
; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
|
||||
; PRERA-NEXT: {{ $}}
|
||||
; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-LABEL: name: basic_cluster
|
||||
; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
|
||||
; DEFAULT-NEXT: {{ $}}
|
||||
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-LABEL: name: basic_cluster
|
||||
; BOTHSCHEDPASS: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
|
||||
; BOTHSCHEDPASS-NEXT: {{ $}}
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-LABEL: name: basic_cluster
|
||||
; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
|
||||
; TWOLIMIT-NEXT: {{ $}}
|
||||
; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-LABEL: name: basic_cluster
|
||||
; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
|
||||
; POSTRA-NEXT: {{ $}}
|
||||
; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
$vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
|
||||
---
|
||||
name: complex_cluster
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
|
||||
; PRERA-LABEL: name: complex_cluster
|
||||
; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
|
||||
; PRERA-NEXT: {{ $}}
|
||||
; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
|
||||
; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; PRERA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
|
||||
; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
; DEFAULT-LABEL: name: complex_cluster
|
||||
; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
|
||||
; DEFAULT-NEXT: {{ $}}
|
||||
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
; BOTHSCHEDPASS-LABEL: name: complex_cluster
|
||||
; BOTHSCHEDPASS: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $vgpr10_vgpr11
|
||||
; BOTHSCHEDPASS-NEXT: {{ $}}
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
|
||||
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
|
||||
; TWOLIMIT-LABEL: name: complex_cluster
|
||||
; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
|
||||
; TWOLIMIT-NEXT: {{ $}}
|
||||
; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
; POSTRA-LABEL: name: complex_cluster
|
||||
; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
|
||||
; POSTRA-NEXT: {{ $}}
|
||||
; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
|
||||
; POSTRA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
|
||||
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
$vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
$vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
|
||||
$vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
$vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr6 = V_MOV_B32_e32 1, implicit $exec
|
||||
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
$vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
|
||||
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
...
|
||||
183
llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir
Normal file
183
llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir
Normal file
@@ -0,0 +1,183 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-mfma-igrouplp=1 2>&1 | FileCheck -check-prefix=PIPELINE %s
|
||||
|
||||
---
|
||||
name: no_pipeline
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $sgpr0, $vgpr10_vgpr11
|
||||
; DEFAULT-LABEL: name: no_pipeline
|
||||
; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11
|
||||
; DEFAULT-NEXT: {{ $}}
|
||||
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
; PIPELINE-LABEL: name: no_pipeline
|
||||
; PIPELINE: liveins: $sgpr0, $vgpr10_vgpr11
|
||||
; PIPELINE-NEXT: {{ $}}
|
||||
; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; PIPELINE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr8 = V_MOV_B32_e32 0, implicit $exec
|
||||
$vgpr9 = V_MOV_B32_e32 9, implicit $exec
|
||||
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
|
||||
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
|
||||
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
$vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
|
||||
...
|
||||
|
||||
|
||||
---
|
||||
name: full_pipe
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $sgpr0, $vgpr10_vgpr11
|
||||
; DEFAULT-LABEL: name: full_pipe
|
||||
; DEFAULT: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11
|
||||
; DEFAULT-NEXT: {{ $}}
|
||||
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec
|
||||
; DEFAULT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec {
|
||||
; DEFAULT-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
|
||||
; DEFAULT-NEXT: }
|
||||
; DEFAULT-NEXT: DS_WRITE_B32 $vgpr3, killed $vgpr1, 0, 16, implicit $m0, implicit $exec
|
||||
; DEFAULT-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec {
|
||||
; DEFAULT-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: }
|
||||
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec
|
||||
; DEFAULT-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec
|
||||
; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec
|
||||
; PIPELINE-LABEL: name: full_pipe
|
||||
; PIPELINE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11
|
||||
; PIPELINE-NEXT: {{ $}}
|
||||
; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec
|
||||
; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec {
|
||||
; PIPELINE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
|
||||
; PIPELINE-NEXT: }
|
||||
; PIPELINE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec
|
||||
; PIPELINE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec {
|
||||
; PIPELINE-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
|
||||
; PIPELINE-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec
|
||||
; PIPELINE-NEXT: }
|
||||
; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PIPELINE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PIPELINE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PIPELINE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; PIPELINE-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit killed $vgpr3 {
|
||||
; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec
|
||||
; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec
|
||||
; PIPELINE-NEXT: }
|
||||
; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr2 = V_MOV_B32_e32 2, implicit $exec
|
||||
$vgpr3 = V_MOV_B32_e32 3, implicit $exec
|
||||
$vgpr4 = V_MOV_B32_e32 4, implicit $exec
|
||||
$vgpr5 = V_MOV_B32_e32 5, implicit $exec
|
||||
$vgpr30 = V_MOV_B32_e32 30, implicit $exec
|
||||
$vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
$vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
$vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
|
||||
$vgpr9 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
|
||||
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
$vgpr24 = V_MOV_B32_e32 1, implicit $exec
|
||||
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
|
||||
$vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
|
||||
$vgpr26 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr27 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
|
||||
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
|
||||
$vgpr21 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
|
||||
$vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
|
||||
$vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
|
||||
DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec
|
||||
$vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
|
||||
$vgpr17 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr18 = V_MOV_B32_e32 1, implicit $exec
|
||||
$vgpr20 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
|
||||
DS_WRITE_B32 $vgpr0, $vgpr7, 0, 16, implicit $m0, implicit $exec
|
||||
$agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 $vgpr10, $vgpr11, $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
|
||||
DS_WRITE_B32 $vgpr23, $vgpr3, 0, 16, implicit $m0, implicit $exec
|
||||
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
DS_WRITE_B32 $vgpr9, $vgpr24, 0, 16, implicit $m0, implicit $exec
|
||||
...
|
||||
Reference in New Issue
Block a user