From 16cda01d22c0ac1713f667d501bdca91594a4e13 Mon Sep 17 00:00:00 2001
From: Carl Ritson
Date: Thu, 5 Sep 2024 14:39:28 +0900
Subject: [PATCH] [AMDGPU] V_SET_INACTIVE optimizations (#98864)

Optimize V_SET_INACTIVE by allowing it to run in WWM.
Hence WWM sections are not broken up for inactive lane setting.
WWM V_SET_INACTIVE can typically be lowered to V_CNDMASK.
Some cases still require exec manipulation and V_MOVs, as in the
previous code.
GFX9 sees a slight instruction count increase in edge cases due to
its smaller constant bus.

Additionally, avoid introducing exec manipulation and V_MOVs where
a source of V_SET_INACTIVE is also the destination.
This is a common pattern, as WWM register pre-allocation often
assigns the same register.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 187 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 89 +-
 .../GlobalISel/llvm.amdgcn.set.inactive.ll | 405 ++--
 .../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 30 +-
 .../AMDGPU/amdgpu-cs-chain-preserve-cc.ll | 29 +-
 .../atomic_optimizations_global_pointer.ll | 552 ++---
 .../atomic_optimizations_local_pointer.ll | 1917 ++++++-----------
 .../atomic_optimizations_pixelshader.ll | 64 +-
 llvm/test/CodeGen/AMDGPU/cse-convergent.ll | 14 +-
 llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll | 18 +-
 .../AMDGPU/global_atomics_scan_fadd.ll | 607 ++----
 .../AMDGPU/global_atomics_scan_fmax.ll | 440 ++--
 .../AMDGPU/global_atomics_scan_fmin.ll | 440 ++--
 .../AMDGPU/global_atomics_scan_fsub.ll | 607 ++----
 .../llvm.amdgcn.set.inactive.chain.arg.ll | 389 ++--
 .../AMDGPU/llvm.amdgcn.set.inactive.ll | 382 ++--
 .../AMDGPU/set-inactive-wwm-overwrite.ll | 12 +-
 .../AMDGPU/should-not-hoist-set-inactive.ll | 5 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll | 64 +-
 llvm/test/CodeGen/AMDGPU/wqm.ll | 52 +-
 llvm/test/CodeGen/AMDGPU/wqm.mir | 25 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 706 +++---
 llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 516 +++--
 .../MIR/AMDGPU/machine-function-info.ll | 7 +-
 25 files changed, 3243 insertions(+), 4316 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index a857bdba53c3..844f62abc267 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2098,8 +2098,22 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
   }
 }
 
+Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
+  assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+         MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
+  for (auto &Op : MI.implicit_operands()) {
+    if (Op.isDef())
+      continue;
+    Register OpReg = Op.getReg();
+    if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+        OpReg == AMDGPU::SCC)
+      continue;
+    return OpReg;
+  }
+  return Register();
+}
+
 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MBB.findDebugLoc(MI);
   switch (MI.getOpcode()) {
@@ -2273,37 +2287,147 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
-    // optimizations (mainly Register Coalescer) aware of WWM register liveness. 
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) - .add(MI.getOperand(1)); - auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); - FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) - .add(MI.getOperand(2)); - BuildMI(MBB, MI, DL, get(NotOpc), Exec) - .addReg(Exec); - MI.eraseFromParent(); - break; - } + case AMDGPU::V_SET_INACTIVE_B32: case AMDGPU::V_SET_INACTIVE_B64: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), - MI.getOperand(0).getReg()) - .add(MI.getOperand(1)); - expandPostRAPseudo(*Copy); - auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); - FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten - Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), - MI.getOperand(0).getReg()) - .add(MI.getOperand(2)); - expandPostRAPseudo(*Copy); - BuildMI(MBB, MI, DL, get(NotOpc), Exec) - .addReg(Exec); + unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64 + ? AMDGPU::V_MOV_B64_PSEUDO + : AMDGPU::V_MOV_B32_e32; + Register ExecReg = RI.getExec(); + Register DstReg = MI.getOperand(0).getReg(); + MachineOperand &ActiveSrc = MI.getOperand(1); + MachineOperand &InactiveSrc = MI.getOperand(2); + + // Find implicit register defining lanes active outside WWM. + Register ExecSrcReg = findSetInactiveMask(MI); + assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region"); + // Note: default here is set to ExecReg so that functional MIR is still + // generated if implicit def is not found and assertions are disabled. + if (!ExecSrcReg) + ExecSrcReg = ExecReg; + + // Ideally in WWM this operation is lowered to V_CNDMASK; however, + // constant bus constraints and the presence of literal constants + // present an issue. + // Fallback to V_MOV base lowering in all but the common cases. + const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32; + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64; + const MCInstrDesc &Desc = get(Opcode); + + const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0); + const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0); + const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue()); + const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue()); + const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue()); + const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue()); + + int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + + int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64); + int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; + int ConstantBusUses = + 1 + // Starts at 1 for ExecSrcReg + (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) + + (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0); + int LiteralConstants = + ((ActiveSrc.isReg() || + (ActiveSrc.isImm() && isInlineConstant(ActiveImm))) + ? 0 + : 1) + + ((InactiveSrc.isReg() || + (InactiveSrc.isImm() && isInlineConstant(InactiveImm))) + ? 
0 + : 1); + + bool UseVCndMask = + ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit; + if (VMov64 && UseVCndMask) { + // Decomposition must not introduce new literals. + UseVCndMask &= + ActiveSrc.isReg() || + (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) || + (!isInlineConstant(ActiveImm)); + UseVCndMask &= InactiveSrc.isReg() || + (isInlineConstant(InactiveImmLo) && + isInlineConstant(InactiveImmHi)) || + (!isInlineConstant(InactiveImm)); + } + + if (UseVCndMask && VMov64) { + // Dual V_CNDMASK_B32 + MachineOperand ActiveLo = buildExtractSubRegOrImm( + MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr); + MachineOperand ActiveHi = buildExtractSubRegOrImm( + MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr); + MachineOperand InactiveLo = buildExtractSubRegOrImm( + MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr); + MachineOperand InactiveHi = buildExtractSubRegOrImm( + MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr); + if (ActiveSrc.isReg()) + ActiveHi.setIsKill(ActiveSrc.isKill()); + if (InactiveSrc.isReg()) + InactiveHi.setIsKill(InactiveSrc.isKill()); + BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0)) + .addImm(0) + .add(InactiveLo) + .addImm(0) + .add(ActiveLo) + .addReg(ExecSrcReg) + .addReg(DstReg, RegState::ImplicitDefine); + BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1)) + .addImm(0) + .add(InactiveHi) + .addImm(0) + .add(ActiveHi) + .addReg(ExecSrcReg) + .addReg(DstReg, RegState::ImplicitDefine); + } else if (UseVCndMask) { + // Single V_CNDMASK_B32 + BuildMI(MBB, MI, DL, Desc, DstReg) + .addImm(0) + .add(InactiveSrc) + .addImm(0) + .add(ActiveSrc) + .addReg(ExecSrcReg); + } else { + // Fallback V_MOV case. + // Avoid unnecessary work if a source VGPR is also the destination. + // This can happen if WWM register allocation was efficient. + // Note: this assumes WWM execution. + bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg; + bool DstIsInactive = + InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg; + if (!DstIsInactive) { + // Set exec mask to inactive lanes, + // but only if active lanes would be overwritten. 
+ if (DstIsActive) { + BuildMI(MBB, MI, DL, get(NotOpc), ExecReg) + .addReg(ExecSrcReg) + .setOperandDead(3); // Dead scc + } + // Copy inactive lanes + MachineInstr *VMov = + BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc); + if (VMov64) + expandPostRAPseudo(*VMov); + } + if (!DstIsActive) { + // Set exec mask to active lanes + BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg); + // Copy active lanes + MachineInstr *VMov = + BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg()) + .add(ActiveSrc); + if (VMov64) + expandPostRAPseudo(*VMov); + } + // Restore WWM + BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1); + } MI.eraseFromParent(); break; } @@ -5647,6 +5771,9 @@ unsigned SIInstrInfo::buildExtractSubReg( MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const { + if (!SuperReg.getReg().isVirtual()) + return RI.getSubReg(SuperReg.getReg(), SubIdx); + MachineBasicBlock *MBB = MI->getParent(); DebugLoc DL = MI->getDebugLoc(); Register SubReg = MRI.createVirtualRegister(SubRC); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 4fd9b4366159..71432510fdee 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1437,6 +1437,8 @@ public: // This is used if an operand is a 32 bit register but needs to be aligned // regardless. void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const; + + static Register findSetInactiveMask(const MachineInstr &MI); }; /// \brief Returns true if a reg:subreg pair P has a TRC class diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 9a51cbbb9f6b..bc4b1936cb7e 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -128,6 +128,7 @@ struct InstrInfo { char Needs = 0; char Disabled = 0; char OutNeeds = 0; + char MarkedStates = 0; }; struct BlockInfo { @@ -175,9 +176,10 @@ private: SmallVector LiveMaskQueries; SmallVector LowerToMovInstrs; - SmallVector LowerToCopyInstrs; + SmallSetVector LowerToCopyInstrs; SmallVector KillInstrs; SmallVector InitExecInstrs; + SmallVector SetInactiveInstrs; void printInfo(); @@ -295,6 +297,9 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, assert(!(Flag & StateExact) && Flag != 0); + // Capture all states requested in marking including disabled ones. + II.MarkedStates |= Flag; + // Remove any disabled states from the flag. The user that required it gets // an undefined value in the helper lanes. For example, this can happen if // the result of an atomic is used by instruction that requires WQM, where @@ -478,7 +483,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, std::vector &Worklist) { char GlobalFlags = 0; bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs"); - SmallVector SetInactiveInstrs; SmallVector SoftWQMInstrs; bool HasImplicitDerivatives = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; @@ -512,9 +516,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, // The WQM intrinsic requires its output to have all the helper lanes // correct, so we need it to be in WQM. 
         Flags = StateWQM;
-        LowerToCopyInstrs.push_back(&MI);
+        LowerToCopyInstrs.insert(&MI);
       } else if (Opcode == AMDGPU::SOFT_WQM) {
-        LowerToCopyInstrs.push_back(&MI);
+        LowerToCopyInstrs.insert(&MI);
         SoftWQMInstrs.push_back(&MI);
       } else if (Opcode == AMDGPU::STRICT_WWM) {
         // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
@@ -555,16 +559,24 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         GlobalFlags |= StateStrictWQM;
       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
-        III.Disabled = StateStrict;
-        MachineOperand &Inactive = MI.getOperand(2);
-        if (Inactive.isReg()) {
-          if (Inactive.isUndef()) {
-            LowerToCopyInstrs.push_back(&MI);
-          } else {
-            markOperand(MI, Inactive, StateStrictWWM, Worklist);
+        // Ignore the V_SET_INACTIVE if it already has an exec src register.
+        // These are generated by an earlier pass which has separately ensured
+        // WWM and provided a mask of inactive lanes.
+        Register ExecSrc = TII->findSetInactiveMask(MI);
+        if (!ExecSrc) {
+          // Disable strict states; StrictWQM will be added as required later.
+          III.Disabled = StateStrict;
+          MachineOperand &Inactive = MI.getOperand(2);
+          if (Inactive.isReg()) {
+            if (Inactive.isUndef()) {
+              LowerToCopyInstrs.insert(&MI);
+            } else {
+              markOperand(MI, Inactive, StateStrictWWM, Worklist);
+            }
           }
+          SetInactiveInstrs.push_back(&MI);
+          BBI.NeedsLowering = true;
         }
-        SetInactiveInstrs.push_back(&MI);
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
@@ -1042,6 +1054,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
 
   SmallVector SplitPoints;
+  Register ActiveLanesReg = 0;
   char State = BI.InitialState;
 
   for (MachineInstr &MI : llvm::make_early_inc_range(
@@ -1058,6 +1071,21 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       SplitPoint = lowerKillF32(MBB, MI);
       break;
+    case AMDGPU::ENTER_STRICT_WWM:
+      ActiveLanesReg = MI.getOperand(0).getReg();
+      break;
+    case AMDGPU::EXIT_STRICT_WWM:
+      ActiveLanesReg = 0;
+      break;
+    case AMDGPU::V_SET_INACTIVE_B32:
+    case AMDGPU::V_SET_INACTIVE_B64:
+      if (ActiveLanesReg) {
+        MI.addOperand(*MBB.getParent(),
+                      MachineOperand::CreateReg(ActiveLanesReg, false, true));
+      } else {
+        assert(State == StateExact || State == StateWQM);
+      }
+      break;
     default:
       break;
     }
@@ -1497,13 +1525,14 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
     }
   }
   for (MachineInstr *MI : LowerToCopyInstrs) {
+    LLVM_DEBUG(dbgs() << "simplify: " << *MI);
+
+    Register RecomputeReg = 0;
     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
         MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
       assert(MI->getNumExplicitOperands() == 3);
-      // the only reason we should be here is V_SET_INACTIVE has
-      // an undef input so it is being replaced by a simple copy.
-      // There should be a second undef source that we should remove.
-      assert(MI->getOperand(2).isUndef());
+      if (MI->getOperand(2).isReg())
+        RecomputeReg = MI->getOperand(2).getReg();
       MI->removeOperand(2);
       MI->untieRegOperand(1);
     } else {
@@ -1514,7 +1543,19 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
                           ? 
(unsigned)AMDGPU::COPY : TII->getMovOpcode(TRI->getRegClassForOperandReg( *MRI, MI->getOperand(0))); + int Index = MI->findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); + while (Index >= 0) { + MI->removeOperand(Index); + Index = MI->findRegisterUseOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); + } + MI->setDesc(TII->get(CopyOp)); + LLVM_DEBUG(dbgs() << " -> " << *MI); + + if (RecomputeReg) { + LIS->removeInterval(RecomputeReg); + LIS->createAndComputeVirtRegInterval(RecomputeReg); + } } return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty(); } @@ -1656,6 +1697,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LowerToMovInstrs.clear(); KillInstrs.clear(); InitExecInstrs.clear(); + SetInactiveInstrs.clear(); StateTransition.clear(); ST = &MF.getSubtarget(); @@ -1712,6 +1754,21 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { Changed = true; } + // Check if V_SET_INACTIVE was touched by a strict state mode. + // If so, promote to WWM; otherwise lower to COPY. + for (MachineInstr *MI : SetInactiveInstrs) { + if (LowerToCopyInstrs.contains(MI)) + continue; + if (Instructions[MI].MarkedStates & StateStrict) { + Instructions[MI].Needs |= StateStrictWWM; + Instructions[MI].Disabled &= ~StateStrictWWM; + Blocks[MI->getParent()].Needs |= StateStrictWWM; + } else { + LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI); + LowerToCopyInstrs.insert(MI); + } + } + LLVM_DEBUG(printInfo()); Changed |= lowerLiveMaskQueries(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 8f88aaedf7e9..137366a45cbd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -4,18 +4,39 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: s_endpgm + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) + store i32 %tmp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { +; GCN-LABEL: set_inactive_imm_poison: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) store i32 %tmp, ptr addrspace(1) %out ret void } @@ -24,18 +45,42 @@ define amdgpu_kernel void @set_inactive_64(ptr 
addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) + store i64 %tmp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { +; GCN-LABEL: set_inactive_imm_poison_64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) store i64 %tmp, ptr addrspace(1) %out ret void } @@ -45,39 +90,43 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 -; GCN-NEXT: s_load_dword s5, s[2:3], 0x2c +; GCN-NEXT: s_buffer_load_dword s6, s[4:7], 0x0 +; GCN-NEXT: s_load_dword s7, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 56 -; GCN-NEXT: s_cselect_b32 s3, 1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s6, 56 +; GCN-NEXT: s_cselect_b32 s3, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 42 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_cmp_lg_u32 s3, 0 -; GCN-NEXT: s_cbranch_scc0 .LBB2_2 +; GCN-NEXT: s_cbranch_scc0 .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %.one -; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: .LBB2_2: ; %Flow +; GCN-NEXT: .LBB4_2: ; %Flow ; GCN-NEXT: s_xor_b32 s2, s2, 1 ; GCN-NEXT: s_and_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cbranch_scc1 .LBB2_4 +; GCN-NEXT: s_cbranch_scc1 .LBB4_4 ; GCN-NEXT: ; %bb.3: ; %.zero ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: .LBB2_4: ; %.exit +; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: .LBB4_4: ; %.exit ; GCN-NEXT: s_endpgm %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) %cmp = icmp eq i32 %val, 56 - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) br i1 %cmp, label %.zero, label %.one .zero: @@ -96,19 +145,22 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0 + %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0 + %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0) store float %tmp, ptr addrspace(1) %out ret void } @@ -117,20 +169,23 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v2, 0xcccccccd -; GCN-NEXT: v_mov_b32_e32 v3, 0x4010cccc +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0xcccccccd +; GCN-NEXT: v_mov_b32_e32 v1, 0x4010cccc +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0 + %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0 + %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0) store double %tmp, ptr addrspace(1) %out ret void } @@ -138,19 +193,22 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0x10001 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b64 exec, 
s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> ) #0 + %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> ) #0 + %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0) store <2 x i16> %tmp, ptr addrspace(1) %out ret void } @@ -158,19 +216,22 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> ) #0 + %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> ) #0 + %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2i16(<2 x half> %tmp.0) store <2 x half> %tmp, ptr addrspace(1) %out ret void } @@ -179,22 +240,25 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 1 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 1 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> ) #0 + %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> ) #0 + %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0) store <2 x i32> %tmp, ptr addrspace(1) %out ret void } @@ -203,22 +267,25 @@ define amdgpu_kernel void 
@set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 1.0 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 1.0 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> ) #0 + %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> ) #0 + %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0) store <2 x float> %tmp, ptr addrspace(1) %out ret void } @@ -226,19 +293,22 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> ) #0 + %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> ) #0 + %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0) store <2 x bfloat> %tmp, ptr addrspace(1) %out ret void } @@ -247,22 +317,25 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x10001 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 0x10001 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; 
GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 + %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 + %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0) store <4 x i16> %tmp, ptr addrspace(1) %out ret void } @@ -271,22 +344,25 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 0x3c003c00 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 + %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 + %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0) store <4 x half> %tmp, ptr addrspace(1) %out ret void } @@ -295,22 +371,25 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x3f803f80 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 0x3f803f80 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 + %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 + %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0) store <4 x bfloat> %tmp, ptr addrspace(1) %out ret void } @@ -319,18 +398,23 @@ define amdgpu_kernel void @set_inactive_p0(ptr 
addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0 + %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0 + %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0) store ptr %tmp, ptr addrspace(1) %out ret void } @@ -338,18 +422,22 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0 + %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0 + %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0) store ptr addrspace(2) %tmp, ptr addrspace(1) %out ret void } @@ -357,18 +445,22 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0 + %tmp.0 = call ptr addrspace(3) 
@llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0 + %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0) store ptr addrspace(3) %tmp, ptr addrspace(1) %out ret void } @@ -376,18 +468,22 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0 + %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0 + %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) %tmp.0) store ptr addrspace(5) %tmp, ptr addrspace(1) %out ret void } @@ -395,24 +491,31 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0 + %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0 + %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0) store ptr addrspace(6) %tmp, ptr addrspace(1) %out ret void } declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0 +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1 +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) attributes #0 = { convergent readnone } +attributes #1 = { convergent nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll index c92b78cd4557..e34ae52fc673 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -571,11 
+571,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 -; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 -; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop @@ -591,10 +590,9 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 -; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 -; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, s0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop @@ -609,11 +607,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b ; DAGISEL-GFX11-LABEL: chain_to_chain_wwm: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 -; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 -; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART @@ -629,11 +626,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b ; DAGISEL-GFX10-LABEL: chain_to_chain_wwm: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 -; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 -; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll index 8d9ed9bb4343..320268564f4d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll @@ -329,10 +329,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 -; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX11-NEXT: 
v_mov_b32_e32 v1, 4 -; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND @@ -351,10 +351,9 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 -; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 -; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, s0 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND @@ -371,11 +370,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 -; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 -; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND @@ -393,11 +391,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 -; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 -; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index cc7050d08541..5a8df7b84bf2 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1147,11 +1147,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 
row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1200,11 +1198,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1247,13 +1243,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_DPP-LABEL: add_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1310,11 +1303,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_DPP-LABEL: add_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1364,27 +1354,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-LABEL: add_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -1438,35 +1425,33 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-LABEL: add_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -1500,27 +1485,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-LABEL: add_i32_varying: ; GFX1264_DPP: ; %bb.0: ; %entry ; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -1578,33 +1560,30 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-LABEL: add_i32_varying: ; GFX1232_DPP: ; %bb.0: ; %entry ; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: 
v_mov_b32_e32 v1, v0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 @@ -2918,15 +2897,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3018,15 +2991,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 
v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3115,47 +3082,41 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 @@ -3228,40 +3189,34 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -3323,21 +3278,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -3441,22 +3390,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3537,56 +3481,50 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1264_DPP-NEXT: 
v_mov_b32_e32 v5, v9 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: 
v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 @@ -3658,35 +3596,29 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4894,11 +4826,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4947,11 +4877,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4994,13 +4922,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_DPP-LABEL: sub_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; 
GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5057,11 +4982,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_DPP-LABEL: sub_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5111,27 +5033,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-LABEL: sub_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -5185,35 +5104,33 @@ define amdgpu_kernel void @sub_i32_varying(ptr 
addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-LABEL: sub_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -5247,27 +5164,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-LABEL: sub_i32_varying: ; GFX1264_DPP: ; %bb.0: ; %entry ; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; 
GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -5325,33 +5239,30 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-LABEL: sub_i32_varying: ; GFX1232_DPP: ; %bb.0: ; %entry ; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; 
GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 @@ -6707,15 +6618,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6807,15 +6712,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6904,47 +6803,41 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 @@ -7017,40 +6910,34 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf 
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -7112,21 +6999,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -7230,22 +7111,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; 
GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7326,56 +7202,50 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc 
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 @@ -7447,35 +7317,29 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1232_DPP-NEXT: s_mov_b32 
exec_lo, s0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll 
b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 6d0e0cc7869b..6bf03a202c14 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -780,14 +780,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -827,14 +825,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -870,13 +866,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: add_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -925,13 +918,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: add_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 
-; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -971,33 +961,30 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: add_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -1037,27 +1024,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; 
GFX1132_DPP-LABEL: add_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -1315,11 +1299,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1350,11 +1332,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 
row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1381,11 +1361,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1064_DPP-LABEL: add_i32_varying_nouse: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1414,11 +1391,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1032_DPP-LABEL: add_i32_varying_nouse: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1442,34 +1416,32 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_DPP-LABEL: add_i32_varying_nouse: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; 
GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164_DPP-NEXT: ; %bb.1: @@ -1482,26 +1454,24 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_DPP-LABEL: add_i32_varying_nouse: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132_DPP-NEXT: ; %bb.1: @@ -2398,15 +2368,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], 
-1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2493,15 +2457,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2585,47 +2543,41 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: 
v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 @@ -2689,43 +2641,37 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf @@ -2734,10 +2680,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -2779,21 +2725,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -2891,23 +2831,18 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3235,15 +3170,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3312,15 +3241,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3386,23 +3309,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 
row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3451,22 +3368,18 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3507,21 +3420,15 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v6, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v6 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v7 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 @@ -3575,22 +3482,17 @@ define amdgpu_kernel void 
@add_i64_varying_nouse() { ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v6 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v7 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s0 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4400,14 +4302,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4447,14 +4347,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4490,13 +4388,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: sub_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4545,13 +4440,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: sub_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4591,33 +4483,30 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: sub_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -4657,27 +4546,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: sub_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; 
GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -4935,11 +4821,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4970,11 +4854,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5001,11 +4883,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1064_DPP-LABEL: sub_i32_varying_nouse: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5034,11 +4913,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1032_DPP-LABEL: sub_i32_varying_nouse: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5062,34 +4938,32 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_DPP-LABEL: sub_i32_varying_nouse: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: 
s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1164_DPP-NEXT: ; %bb.1: @@ -5102,26 +4976,24 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_DPP-LABEL: sub_i32_varying_nouse: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132_DPP-NEXT: ; %bb.1: @@ -6044,15 +5916,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6139,15 +6005,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6231,47 +6091,41 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 @@ -6335,43 +6189,37 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: 
v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -6380,10 +6228,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -6425,21 +6273,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 
row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -6537,23 +6379,18 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -6943,13 +6780,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -6992,13 +6825,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 
v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7037,13 +6866,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: and_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -7092,13 +6918,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: and_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -7138,33 +6961,30 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: and_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: 
v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -7204,31 +7024,29 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: and_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 
row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -7619,16 +7437,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 @@ -7683,16 +7495,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 @@ -7741,19 +7547,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: and_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1] ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp 
v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf @@ -7822,19 +7624,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: and_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf @@ -7846,11 +7644,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -7885,47 +7683,43 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: and_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v5, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -7981,43 +7775,39 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: and_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1132_DPP-NEXT: 
s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 @@ -8375,14 +8165,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8422,14 +8210,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8465,13 +8251,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: or_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8520,13 +8303,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: or_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 @@ -8566,33 +8346,30 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: or_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -8632,27 +8409,24 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: or_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; 
GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -9047,16 +8821,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 @@ -9111,16 +8879,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 @@ -9169,19 +8931,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: 
or_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9250,19 +9008,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: or_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9274,11 +9028,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -9313,47 +9067,43 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; 
GFX1164_DPP-LABEL: or_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 
s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -9409,43 +9159,39 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: or_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; 
GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 @@ -9803,14 +9549,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9850,14 +9594,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9893,13 +9635,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: xor_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9948,13 +9687,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: xor_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9994,33 +9730,30 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: xor_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc 
bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -10060,27 +9793,24 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: xor_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -10475,16 +10205,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: 
v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 @@ -10539,16 +10263,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 @@ -10597,19 +10315,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: xor_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10678,19 +10392,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: xor_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 
v5, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10702,11 +10412,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -10741,47 +10451,43 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: xor_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -10837,43 +10543,39 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: xor_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 
row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 @@ -11232,12 +10934,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, -1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11281,12 +10982,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9_DPP-NEXT: 
v_bfrev_b32_e32 v2, 1 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, -1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11325,13 +11025,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: max_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -11380,13 +11077,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: max_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -11426,33 +11120,30 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: max_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -11492,31 +11183,29 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: max_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 
exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -12195,19 +11884,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: max_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX8_DPP-NEXT: s_mov_b32 s0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: s_brev_b32 s1, 1 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX8_DPP-NEXT: s_mov_b64 exec, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -12294,19 +11983,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: max_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: s_mov_b32 s0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: s_brev_b32 s1, 1 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX9_DPP-NEXT: s_mov_b64 exec, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -12393,20 +12082,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-LABEL: max_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: s_mov_b32 s0, 0 -; GFX1064_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1064_DPP-NEXT: s_mov_b32 s4, 0 +; GFX1064_DPP-NEXT: s_brev_b32 s5, 1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s5 +; GFX1064_DPP-NEXT: 
v_cndmask_b32_e64 v3, s4, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s5, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -12515,20 +12198,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s0, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s1, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo @@ -12610,77 +12287,70 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: 
v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_readlane_b32 s0, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s1, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: 
v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -12744,55 +12414,48 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 
:: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 @@ -13158,12 +12821,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8_DPP-NEXT: 
s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, -1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13207,12 +12869,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, -1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13251,13 +12912,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: min_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -13306,13 +12964,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: min_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -13352,33 +13007,30 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: min_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -13418,31 +13070,29 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: min_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -14124,16 +13774,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: s_mov_b32 s6, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_mov_b32 s6, -1 ; GFX8_DPP-NEXT: s_brev_b32 s7, -2 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX8_DPP-NEXT: s_mov_b64 exec, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -14221,16 +13871,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: s_mov_b32 s6, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 ; GFX9_DPP-NEXT: s_brev_b32 s7, -2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX9_DPP-NEXT: s_mov_b64 exec, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -14321,14 +13971,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s6 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14433,17 +14077,11 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s6 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s7 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14530,77 +14168,70 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s6 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s7 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; 
GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -14663,56 +14294,49 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s6 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s7 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -15076,14 +14700,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15123,14 +14745,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15166,13 +14786,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: umax_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15221,13 +14838,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umax_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15267,33 +14881,30 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: umax_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; 
GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -15333,27 +14944,24 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: umax_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -16033,14 +15641,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -16131,14 +15733,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -16227,13 +15823,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -16337,20 +15929,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: 
s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -16433,77 +16019,70 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -16564,56 +16143,49 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 
exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 @@ -16978,13 +16550,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17027,13 +16595,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 
row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17072,13 +16636,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: umin_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -17127,13 +16688,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umin_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -17173,33 +16731,30 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: umin_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -17239,31 +16794,29 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: umin_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; 
GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -17939,14 +17492,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -18037,14 +17584,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -18133,13 +17674,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -18243,20 +17780,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ 
-18339,77 +17870,70 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] 
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -18470,56 +17994,49 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 22eb8d05b5ff..429e6c489bf6 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -277,11 +277,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX8-NEXT: s_mov_b64 exec, s[10:11] ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[10:11] +; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -334,11 +332,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX9-NEXT: s_mov_b64 exec, s[10:11] ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[10:11] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -386,13 +382,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX1064-NEXT: s_cbranch_execz .LBB1_4 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[10:11] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -449,13 +442,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 ; GFX1032-NEXT: s_cbranch_execz .LBB1_4 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, v0, s9 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -503,32 +493,30 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX1164-NEXT: s_cbranch_execz .LBB1_4 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[10:11] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_permlanex16_b32 v2, v1, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_readlane_b32 s12, v1, 31 -; GFX1164-NEXT: v_mov_b32_e32 v2, s12 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s12 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 
row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s12, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s13, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_writelane_b32 v3, s12, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[10:11] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GFX1164-NEXT: v_readlane_b32 s12, v1, 63 @@ -577,31 +565,29 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: s_and_saveexec_b32 s8, s9 ; GFX1132-NEXT: s_cbranch_execz .LBB1_4 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, v0, s9 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v1, -1, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_readlane_b32 s11, v1, 31 ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_readlane_b32 s10, v1, 15 ; GFX1132-NEXT: s_mov_b32 exec_lo, s9 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_writelane_b32 v3, s10, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s9 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll index 0d74bd39b56f..7aca63d34f51 100644 --- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll +++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll @@ -12,12 +12,7 @@ define i32 @test(i32 %val, i32 %cond) { ; GCN-NEXT: s_mov_b32 exec_lo, s4 ; GCN-NEXT: s_or_saveexec_b32 s4, -1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_mov_b32 exec_lo, s4 -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: s_not_b32 
exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s4, -1 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 ; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s4 ; GCN-NEXT: v_mov_b32_e32 v5, 0 @@ -27,12 +22,7 @@ define i32 @test(i32 %val, i32 %cond) { ; GCN-NEXT: ; %bb.1: ; %if ; GCN-NEXT: s_or_saveexec_b32 s5, -1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_mov_b32 exec_lo, s5 -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s5, -1 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s5 ; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s5 ; GCN-NEXT: v_mov_b32_e32 v5, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll index 82dc6d21cfe3..310f32ce8f83 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll @@ -6,16 +6,13 @@ define amdgpu_hs void @wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) { ; GCN-LABEL: wwm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s7, s4 ; GCN-NEXT: s_mov_b32 s6, s3 ; GCN-NEXT: s_mov_b32 s5, s2 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: s_mov_b32 s7, s4 ; GCN-NEXT: s_mov_b32 s4, s1 ; GCN-NEXT: s_mov_b32 s1, 1 -; GCN-NEXT: v_mov_b32_e32 v0, 4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 4, s[2:3] ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s0, 0 @@ -63,16 +60,13 @@ work: define amdgpu_hs void @strict_wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) { ; GCN-LABEL: strict_wwm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s7, s4 ; GCN-NEXT: s_mov_b32 s6, s3 ; GCN-NEXT: s_mov_b32 s5, s2 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: s_mov_b32 s7, s4 ; GCN-NEXT: s_mov_b32 s4, s1 ; GCN-NEXT: s_mov_b32 s1, 1 -; GCN-NEXT: v_mov_b32_e32 v0, 4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 4, s[2:3] ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 584b280cefb8..311c60929188 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -816,12 +816,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -900,14 +898,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -981,14 +974,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1046,43 +1034,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -1113,15 +1096,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2049,12 +2027,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2133,14 +2109,9 @@ 
define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2214,14 +2185,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2279,43 +2245,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -2346,15 +2307,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3342,12 +3298,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf 
bank_mask:0xf @@ -3426,14 +3380,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3507,14 +3456,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3572,43 +3516,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -3639,15 +3578,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4131,12 +4065,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp 
v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -4215,14 +4147,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4296,14 +4223,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4361,43 +4283,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf 
bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -4428,15 +4345,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5449,12 +5361,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 
1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -5533,14 +5443,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -5614,14 +5519,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -5679,43 +5579,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 
row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -5759,15 +5654,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7442,14 +7332,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, 
v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -7579,15 +7463,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -7707,15 +7585,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -7819,17 +7691,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -7864,11 +7731,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -7946,16 +7813,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9047,14 +8909,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -9152,15 +9008,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf 
bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9246,15 +9096,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9324,17 +9168,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -9366,9 +9205,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -9419,16 +9259,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; 
GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10497,14 +10332,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -10602,15 +10431,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -10696,15 +10519,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -10774,17 +10591,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; 
GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -10816,9 +10628,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -10869,16 +10682,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11429,14 +11237,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 
v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -11534,15 +11336,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11628,15 +11424,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11706,17 +11496,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) @@ -11748,9 +11533,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -11801,16 +11587,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -13526,14 +13307,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -13663,15 +13438,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; 
GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -13791,15 +13560,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -13903,17 +13666,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -13948,11 +13706,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -14030,16 +13788,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 
v9, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 464ec088dc29..9dc82b17bd3f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -718,12 +718,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -811,15 +809,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -888,15 +881,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; 
GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -947,46 +935,41 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 
v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1023,42 +1006,38 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; 
GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -1777,12 +1756,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -1870,15 +1847,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1947,15 +1919,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2006,46 +1973,41 @@ define 
amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; 
GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2082,42 +2044,38 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2836,12 +2794,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2929,15 +2885,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3006,15 +2957,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3065,46 +3011,41 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; 
GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -3141,42 +3082,38 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -4807,14 +4744,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, 
s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -4953,15 +4884,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5091,15 +5016,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5211,17 +5130,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: 
v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -5266,11 +5180,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -5350,16 +5264,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -6281,14 +6190,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -6395,15 +6298,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6484,15 +6381,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6555,17 +6446,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -6607,9 +6493,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -6663,16 +6550,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: 
v_mov_b32_e32 v3, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -8358,14 +8240,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -8504,15 +8380,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8642,15 +8512,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8762,17 +8626,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -8817,11 +8676,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -8901,16 +8760,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] diff --git 
a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 26a0e34d18bd..945583c88ce2 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -718,12 +718,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -811,15 +809,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -888,15 +881,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -947,46 +935,41 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: 
v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1023,42 +1006,38 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -1777,12 +1756,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: 
v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -1870,15 +1847,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1947,15 +1919,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2006,46 +1973,41 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: 
v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2082,42 +2044,38 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2836,12 +2794,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2929,15 +2885,10 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3006,15 +2957,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3065,46 +3011,41 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 
row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -3141,42 +3082,38 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -4807,14 +4744,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -4953,15 +4884,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec 
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5091,15 +5016,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5211,17 +5130,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -5266,11 +5180,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -5350,16 +5264,11 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -6281,14 +6190,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -6395,15 +6298,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6484,15 +6381,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: 
v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6555,17 +6446,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -6607,9 +6493,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -6663,16 +6550,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; 
GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -8358,14 +8240,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -8504,15 +8380,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8642,15 +8512,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8762,17 +8626,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: 
v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -8817,11 +8676,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -8901,16 +8760,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index c158a8007bcc..3bc0f2546794 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -894,12 +894,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -978,14 +976,9 @@ define 
amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1059,14 +1052,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1124,43 +1112,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -1204,15 +1187,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2239,12 +2217,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ 
-2323,14 +2299,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2404,14 +2375,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2469,43 +2435,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -2549,15 +2510,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3584,12 +3540,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 
row_shr:1 row_mask:0xf bank_mask:0xf @@ -3668,14 +3622,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3749,14 +3698,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3814,43 +3758,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -3894,15 +3833,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4425,12 +4359,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; 
GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -4509,14 +4441,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4590,14 +4517,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4655,43 +4577,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 
row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -4735,15 +4652,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5769,12 +5681,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 
1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -5853,14 +5763,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -5934,14 +5839,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -5999,43 +5899,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -6079,15 +5974,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7762,14 +7652,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; 
GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -7899,15 +7783,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8027,15 +7905,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8139,17 +8011,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -8184,11 +8051,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -8266,16 +8133,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9366,14 +9228,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -9471,15 +9327,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, 
v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9565,15 +9415,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9643,17 +9487,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -9685,9 +9524,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -9738,16 +9578,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: 
v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10816,14 +10651,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -10921,15 +10750,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11015,15 +10838,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11093,17 +10910,12 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -11135,9 +10947,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -11188,16 +11001,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11748,14 +11556,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; 
GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -11853,15 +11655,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11947,15 +11743,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -12025,17 +11815,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -12067,9 +11852,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -12120,16 +11906,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -13844,14 +13625,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -13981,15 +13756,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, 
v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -14109,15 +13878,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -14221,17 +13984,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -14266,11 +14024,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -14348,16 +14106,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: 
v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll index b3acd4949301..c1b58f1795aa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll @@ -12,97 +12,204 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 % ; GFX11-LABEL: set_inactive_chain_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v11 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11-NEXT: global_store_b32 v[8:9], v0, off +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: global_store_b32 v[8:9], v1, off ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: set_inactive_chain_arg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, v10 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-NEXT: global_store_dword v[8:9], v0, off +; GFX10-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 +; GFX10-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: global_store_dword v[8:9], v1, off ; GFX10-NEXT: s_endpgm ; ; GFX11_W64-LABEL: set_inactive_chain_arg: ; GFX11_W64: ; %bb.0: ; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v11 -; GFX11_W64-NEXT: s_not_b64 exec, exec +; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10 -; GFX11_W64-NEXT: s_not_b64 exec, exec -; GFX11_W64-NEXT: global_store_b32 v[8:9], v0, off +; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] +; GFX11_W64-NEXT: 
s_mov_b64 exec, s[0:1] +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11_W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11_W64-NEXT: global_store_b32 v[8:9], v1, off ; GFX11_W64-NEXT: s_endpgm ; ; GFX10_W64-LABEL: set_inactive_chain_arg: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v11 -; GFX10_W64-NEXT: s_not_b64 exec, exec +; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10 -; GFX10_W64-NEXT: s_not_b64 exec, exec -; GFX10_W64-NEXT: global_store_dword v[8:9], v0, off +; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] +; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX10_W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX10_W64-NEXT: global_store_dword v[8:9], v1, off ; GFX10_W64-NEXT: s_endpgm %tmp = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) #0 - store i32 %tmp, ptr addrspace(1) %out + %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp) + store i32 %wwm, ptr addrspace(1) %out ret void } define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i64 %inactive, i64 %active) { -; GFX11-LABEL: set_inactive_chain_arg_64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v12 -; GFX11-NEXT: v_mov_b32_e32 v1, v13 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-NEXT: v_mov_b32_e32 v1, v11 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11-NEXT: global_store_b64 v[8:9], v[0:1], off -; GFX11-NEXT: s_endpgm +; GISEL11-LABEL: set_inactive_chain_arg_64: +; GISEL11: ; %bb.0: +; GISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_mov_b32 v1, v11 +; GISEL11-NEXT: s_mov_b32 exec_lo, s0 +; GISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 +; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; GISEL11-NEXT: s_mov_b32 exec_lo, s0 +; GISEL11-NEXT: v_mov_b32_e32 v2, v0 +; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL11-NEXT: v_mov_b32_e32 v3, v1 +; GISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off +; GISEL11-NEXT: s_endpgm ; -; GFX10-LABEL: set_inactive_chain_arg_64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v12 -; GFX10-NEXT: v_mov_b32_e32 v1, v13 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v10 -; GFX10-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-NEXT: global_store_dwordx2 v[8:9], v[0:1], off -; GFX10-NEXT: s_endpgm +; DAGISEL11-LABEL: set_inactive_chain_arg_64: +; DAGISEL11: ; %bb.0: +; DAGISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_mov_b32 v0, v10 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 +; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 +; 
DAGISEL11-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; DAGISEL11-NEXT: v_mov_b32_e32 v3, v1 +; DAGISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off +; DAGISEL11-NEXT: s_endpgm ; -; GFX11_W64-LABEL: set_inactive_chain_arg_64: -; GFX11_W64: ; %bb.0: -; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v12 -; GFX11_W64-NEXT: v_mov_b32_e32 v1, v13 -; GFX11_W64-NEXT: s_not_b64 exec, exec -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10 -; GFX11_W64-NEXT: v_mov_b32_e32 v1, v11 -; GFX11_W64-NEXT: s_not_b64 exec, exec -; GFX11_W64-NEXT: global_store_b64 v[8:9], v[0:1], off -; GFX11_W64-NEXT: s_endpgm +; GISEL10-LABEL: set_inactive_chain_arg_64: +; GISEL10: ; %bb.0: +; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL10-NEXT: v_mov_b32_e32 v0, v10 +; GISEL10-NEXT: v_mov_b32_e32 v1, v11 +; GISEL10-NEXT: s_mov_b32 exec_lo, s0 +; GISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 +; GISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; GISEL10-NEXT: s_mov_b32 exec_lo, s0 +; GISEL10-NEXT: v_mov_b32_e32 v2, v0 +; GISEL10-NEXT: v_mov_b32_e32 v3, v1 +; GISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; GISEL10-NEXT: s_endpgm ; -; GFX10_W64-LABEL: set_inactive_chain_arg_64: -; GFX10_W64: ; %bb.0: -; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v12 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, v13 -; GFX10_W64-NEXT: s_not_b64 exec, exec -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, v11 -; GFX10_W64-NEXT: s_not_b64 exec, exec -; GFX10_W64-NEXT: global_store_dwordx2 v[8:9], v[0:1], off -; GFX10_W64-NEXT: s_endpgm +; DAGISEL10-LABEL: set_inactive_chain_arg_64: +; DAGISEL10: ; %bb.0: +; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_mov_b32_e32 v1, v11 +; DAGISEL10-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL10-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL10-NEXT: v_mov_b32_e32 v3, v1 +; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; DAGISEL10-NEXT: s_endpgm +; +; GISEL11_W64-LABEL: set_inactive_chain_arg_64: +; GISEL11_W64: ; %bb.0: +; GISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v10 +; GISEL11_W64-NEXT: v_mov_b32_e32 v1, v11 +; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL11_W64-NEXT: v_mov_b32_e32 v2, v0 +; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL11_W64-NEXT: v_mov_b32_e32 v3, v1 +; GISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off +; GISEL11_W64-NEXT: s_endpgm +; +; DAGISEL11_W64-LABEL: set_inactive_chain_arg_64: +; DAGISEL11_W64: ; %bb.0: +; DAGISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; 
DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v11 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v3, v1 +; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off +; DAGISEL11_W64-NEXT: s_endpgm +; +; GISEL10_W64-LABEL: set_inactive_chain_arg_64: +; GISEL10_W64: ; %bb.0: +; GISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v10 +; GISEL10_W64-NEXT: v_mov_b32_e32 v1, v11 +; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; GISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL10_W64-NEXT: v_mov_b32_e32 v2, v0 +; GISEL10_W64-NEXT: v_mov_b32_e32 v3, v1 +; GISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; GISEL10_W64-NEXT: s_endpgm +; +; DAGISEL10_W64-LABEL: set_inactive_chain_arg_64: +; DAGISEL10_W64: ; %bb.0: +; DAGISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v11 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v3, v1 +; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; DAGISEL10_W64-NEXT: s_endpgm %tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0 - store i64 %tmp, ptr addrspace(1) %out + %wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp) + store i64 %wwm, ptr addrspace(1) %out ret void } @@ -113,16 +220,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: v_mov_b32_e32 v0, v10 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v11 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v0 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-NEXT: global_store_b32 v[8:9], v2, off ; GFX11-NEXT: s_endpgm @@ -133,11 +237,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX10-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-NEXT: v_mov_b32_e32 
v0, v10 ; GFX10-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v0 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX10-NEXT: s_mov_b32 exec_lo, s0 @@ -151,17 +252,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10 ; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v11 -; GFX11_W64-NEXT: s_not_b64 exec, exec -; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v0 -; GFX11_W64-NEXT: s_not_b64 exec, exec ; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11_W64-NEXT: s_waitcnt_depctr 0xfff +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11_W64-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] -; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11_W64-NEXT: v_mov_b32_e32 v2, v1 ; GFX11_W64-NEXT: global_store_b32 v[8:9], v2, off ; GFX11_W64-NEXT: s_endpgm @@ -172,11 +269,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10 ; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v11 -; GFX10_W64-NEXT: s_not_b64 exec, exec -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v0 -; GFX10_W64-NEXT: s_not_b64 exec, exec ; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] @@ -214,11 +308,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL11-NEXT: v_mov_b32_e32 v11, 0 ; GISEL11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL11-NEXT: v_mov_b32_e32 v12, v43 -; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL11-NEXT: v_mov_b32_e32 v12, v40 -; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; GISEL11-NEXT: s_mov_b32 exec_lo, s0 ; GISEL11-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11-NEXT: global_store_b32 v[41:42], v0, off ; GISEL11-NEXT: s_endpgm @@ -244,11 +337,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0 ; DAGISEL11-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11-NEXT: s_endpgm @@ -283,10 +375,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GISEL10-NEXT: v_mov_b32_e32 v12, v43 -; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL10-NEXT: v_mov_b32_e32 v12, v40 -; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; GISEL10-NEXT: s_mov_b32 exec_lo, s0 ; GISEL10-NEXT: v_mov_b32_e32 v0, v12 ; GISEL10-NEXT: global_store_dword v[41:42], v0, off ; GISEL10-NEXT: s_endpgm @@ -321,10 +412,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL10-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL10-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL10-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10-NEXT: s_endpgm @@ -357,11 +447,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 ; GISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 -; GISEL11_W64-NEXT: s_not_b64 exec, exec -; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 -; GISEL11_W64-NEXT: s_not_b64 exec, exec -; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; GISEL11_W64-NEXT: s_endpgm @@ -394,11 +483,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 ; DAGISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11_W64-NEXT: s_endpgm @@ -433,10 +521,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 -; 
GISEL10_W64-NEXT: s_not_b64 exec, exec -; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 -; GISEL10_W64-NEXT: s_not_b64 exec, exec +; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; GISEL10_W64-NEXT: s_endpgm @@ -471,10 +558,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL10_W64-NEXT: s_not_b64 exec, exec -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL10_W64-NEXT: s_not_b64 exec, exec +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10_W64-NEXT: s_endpgm @@ -511,11 +597,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0 ; GISEL11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL11-NEXT: v_mov_b32_e32 v12, v43 -; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL11-NEXT: v_mov_b32_e32 v12, v40 -; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; GISEL11-NEXT: s_mov_b32 exec_lo, s0 ; GISEL11-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11-NEXT: global_store_b32 v[41:42], v0, off ; GISEL11-NEXT: s_endpgm @@ -541,11 +626,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL11-NEXT: v_mov_b32_e32 v11, 0 ; DAGISEL11-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11-NEXT: s_endpgm @@ -580,10 +664,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GISEL10-NEXT: v_mov_b32_e32 v12, v43 -; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL10-NEXT: v_mov_b32_e32 v12, v40 -; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; GISEL10-NEXT: s_mov_b32 exec_lo, s0 ; GISEL10-NEXT: v_mov_b32_e32 v0, v12 ; GISEL10-NEXT: global_store_dword v[41:42], v0, off ; GISEL10-NEXT: s_endpgm @@ -618,10 +701,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL10-NEXT: 
s_waitcnt lgkmcnt(0) ; DAGISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL10-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL10-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10-NEXT: s_endpgm @@ -654,11 +736,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 ; GISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 -; GISEL11_W64-NEXT: s_not_b64 exec, exec -; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 -; GISEL11_W64-NEXT: s_not_b64 exec, exec -; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; GISEL11_W64-NEXT: s_endpgm @@ -691,11 +772,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 ; DAGISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11_W64-NEXT: s_endpgm @@ -730,10 +810,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 -; GISEL10_W64-NEXT: s_not_b64 exec, exec -; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 -; GISEL10_W64-NEXT: s_not_b64 exec, exec +; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; GISEL10_W64-NEXT: s_endpgm @@ -768,10 +847,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL10_W64-NEXT: s_not_b64 exec, exec -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL10_W64-NEXT: s_not_b64 exec, exec +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 ; 
DAGISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10_W64-NEXT: s_endpgm @@ -786,6 +864,7 @@ declare i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64) #0 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) declare i32 @llvm.amdgcn.strict.wwm.i32(i32) +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) declare amdgpu_gfx void @gfx_callee(<12 x i32>) attributes #0 = { convergent readnone willreturn nocallback nofree} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index 114d2d099ab7..6dc4a2ce0504 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -5,18 +5,22 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) store i32 %tmp, ptr addrspace(1) %out ret void } @@ -25,13 +29,15 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) store i32 %tmp, ptr addrspace(1) %out ret void } @@ -40,20 +46,25 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; 
GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) store i64 %tmp, ptr addrspace(1) %out ret void } @@ -63,13 +74,16 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) store i64 %tmp, ptr addrspace(1) %out ret void } @@ -82,12 +96,15 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, 56 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc1 .LBB4_3 ; GCN-NEXT: ; %bb.1: ; %Flow @@ -96,19 +113,20 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN-NEXT: .LBB4_2: ; %.exit ; GCN-NEXT: s_endpgm ; GCN-NEXT: .LBB4_3: ; %.one -; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GCN-NEXT: s_cbranch_execnz .LBB4_2 ; GCN-NEXT: .LBB4_4: ; %.zero ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) %cmp = icmp eq i32 %val, 56 - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) br i1 %cmp, label %.zero, label %.one .zero: @@ -127,19 +145,23 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s7, 0x40400000 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s5, 0x40400000 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: 
s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0 + %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0 + %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0) store float %tmp, ptr addrspace(1) %out ret void } @@ -148,22 +170,27 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mov_b32 s0, 0xcccccccd -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s1, 0x4010cccc -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0 + %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0 + %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0) store double %tmp, ptr addrspace(1) %out ret void } @@ -171,19 +198,23 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s7, 0x10001 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s5, 0x10001 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0 + %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0 + %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0) store <2 x i16> %tmp, ptr addrspace(1) %out ret void } @@ -191,19 +222,23 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x i16> % define 
amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s7, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s5, 0x3c003c00 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0 + %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0 + %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2i16(<2 x half> %tmp.0) store <2 x half> %tmp, ptr addrspace(1) %out ret void } @@ -212,22 +247,27 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 1 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 1 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0 + %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0 + %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0) store <2 x i32> %tmp, ptr addrspace(1) %out ret void } @@ -236,22 +276,27 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 1.0 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 1.0 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; 
GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0 + %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0 + %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0) store <2 x float> %tmp, ptr addrspace(1) %out ret void } @@ -259,19 +304,23 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s7, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s5, 0x3f803f80 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0 + %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0 + %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0) store <2 x bfloat> %tmp, ptr addrspace(1) %out ret void } @@ -280,22 +329,27 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 0x10001 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 0x10001 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0 + %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0 + %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0) store <4 x i16> %tmp, ptr addrspace(1) %out ret void } @@ -304,22 +358,27 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> ; GCN-LABEL: set_inactive_v4f16: ; 
GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 0x3c003c00 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 0x3c003c00 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0 + %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0 + %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0) store <4 x half> %tmp, ptr addrspace(1) %out ret void } @@ -328,22 +387,27 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 0x3f803f80 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 0x3f803f80 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0 + %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0 + %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0) store <4 x bfloat> %tmp, ptr addrspace(1) %out ret void } @@ -352,20 +416,25 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; 
GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0 + %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0 + %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0) store ptr %tmp, ptr addrspace(1) %out ret void } @@ -373,18 +442,22 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0 + %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0 + %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0) store ptr addrspace(2) %tmp, ptr addrspace(1) %out ret void } @@ -392,18 +465,22 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0 + %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0 + %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0) store ptr addrspace(3) %tmp, ptr addrspace(1) %out ret void } @@ -411,18 +488,22 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: 
s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0 + %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0 + %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) %tmp.0) store ptr addrspace(5) %tmp, ptr addrspace(1) %out ret void } @@ -430,24 +511,31 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0 + %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0 + %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0) store ptr addrspace(6) %tmp, ptr addrspace(1) %out ret void } declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0 +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1 +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) attributes #0 = { convergent readnone } +attributes #1 = { convergent nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index 81858bd3d29e..f60786c1bacb 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -15,11 +15,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %.then -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: s_or_saveexec_b32 s1, -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 @@ -82,12 +79,7 @@ 
define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrs ; GCN-NEXT: .LBB1_5: ; %.else ; GCN-NEXT: s_or_saveexec_b32 s1, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b32 exec_lo, s1 -; GCN-NEXT: v_mov_b32_e32 v2, v3 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s1, -1 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v3, s1 ; GCN-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 ; GCN-NEXT: v_mov_b32_e32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll index 09e342fe1906..90b32e29e98f 100644 --- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll @@ -23,11 +23,8 @@ define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 i ; GCN-NEXT: s_cbranch_execz .LBB0_1 ; GCN-NEXT: ; %bb.3: ; %bb1 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: s_or_saveexec_b32 s9, -1 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, s4, s9 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s9 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index c3a81771a279..ff692acda3c2 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1674,13 +1674,13 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v0, 42 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 42, s4, s2 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: global_store_dword v1, v2, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive: @@ -1688,15 +1688,16 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v0, 42 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 42, s4, s[2:3] +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: global_store_dword v1, v2, s[0:1] ; GFX1064-NEXT: s_endpgm - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) store i32 %tmp, ptr addrspace(1) %out ret void } @@ -1705,31 
+1706,32 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) ; GFX1032-LABEL: test_set_inactive_64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, s6 -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s7, s0 +; GFX1032-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive_64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064-NEXT: v_mov_b32_e32 v1, s7 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s7, s[0:1] +; GFX1064-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX1064-NEXT: s_endpgm - %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) store i64 %tmp, ptr addrspace(1) %out ret void } @@ -2921,6 +2923,8 @@ declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) declare float @llvm.amdgcn.strict.wwm.f32(float) +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) declare float @llvm.amdgcn.wwm.f32(float) declare i32 @llvm.amdgcn.wqm.i32(i32) declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 6b4c2da772cd..ab84c0c90577 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -835,12 +835,9 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) { ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: s_not_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1] ; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 @@ -851,12 +848,9 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg 
%idx) { ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 @@ -1317,7 +1311,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_nop 0 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen -; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec +; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -1334,7 +1328,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { ; GFX10-W32-NEXT: s_clause 0x1 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec +; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2263,11 +2257,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-W64-NEXT: s_not_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) @@ -2293,11 +2284,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) @@ -2744,12 +2732,9 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) { ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: s_not_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1] ; 
GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 @@ -2760,12 +2745,9 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) { ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 @@ -2799,11 +2781,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-W64-NEXT: s_not_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) @@ -2829,11 +2808,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir index ef6d0780f395..64a7c4457395 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm.mir @@ -40,6 +40,9 @@ define amdgpu_vs void @no_wqm_in_vs() { ret void } + define amdgpu_ps void @preloaded_set_inactive() { + ret void + } ... --- @@ -282,10 +285,10 @@ body: | # #CHECK-NOT: ENTER_STRICT_WWM #CHECK: BUFFER_LOAD_DWORDX2 -#CHECK-NOT: ENTER_STRICT_WWM -#CHECK: V_SET_INACTIVE_B32 -#CHECK: V_SET_INACTIVE_B32 #CHECK: ENTER_STRICT_WWM +#CHECK: V_SET_INACTIVE_B32 +#CHECK: V_SET_INACTIVE_B32 +#CHECK-NOT: ENTER_STRICT_WWM #CHECK: V_MAX name: test_wwm_set_inactive_propagation tracksRegLiveness: true @@ -443,3 +446,19 @@ body: | %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) ... + +--- +# Preserve V_SET_INACTIVE with exec mask already specified +#CHECK-LABEL: name: preloaded_set_inactive +#CHECK: V_SET_INACTIVE_B32 +name: preloaded_set_inactive +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1, $vgpr2 + + %0:vgpr_32 = COPY $vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %mask:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %value:vgpr_32 = V_SET_INACTIVE_B32 %0:vgpr_32, %1:vgpr_32, implicit $exec, implicit-def $scc, implicit %mask:sreg_64 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index e79cb66dcd77..47e1897f6b42 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -30,15 +30,15 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -80,17 +80,10 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[34:35] +; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 @@ -177,11 +170,11 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -208,12 +201,8 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[34:35] +; GFX9-O0-NEXT: s_nop 1 ; 
GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -270,34 +259,25 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[34:35] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[36:37] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB1_2: ; %merge +; GFX9-O3-NEXT: ; %bb.2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -378,26 +358,26 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-O0-NEXT: s_mov_b32 s40, s6 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: s_mov_b32 s42, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 -; GFX9-O0-NEXT: s_mov_b32 s41, s7 -; GFX9-O0-NEXT: s_mov_b32 s42, s41 -; GFX9-O0-NEXT: s_mov_b32 s43, s40 +; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 +; GFX9-O0-NEXT: s_mov_b32 s43, s7 +; GFX9-O0-NEXT: s_mov_b32 s44, s43 +; GFX9-O0-NEXT: s_mov_b32 s45, s42 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s5 -; GFX9-O0-NEXT: s_mov_b32 s44, s35 +; GFX9-O0-NEXT: s_mov_b32 s46, s35 ; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s37, s44 -; GFX9-O0-NEXT: s_mov_b32 s38, s43 -; GFX9-O0-NEXT: s_mov_b32 s39, s42 +; GFX9-O0-NEXT: s_mov_b32 s37, s46 +; GFX9-O0-NEXT: s_mov_b32 s38, s45 +; GFX9-O0-NEXT: s_mov_b32 s39, s44 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_getpc_b64 s[42:43] ; 
GFX9-O0-NEXT: s_add_u32 s42, s42, strict_wwm_called@rel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s43, s43, strict_wwm_called@rel32@hi+12 @@ -437,11 +417,11 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 ; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] ; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4 @@ -559,7 +539,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s48, s33 +; GFX9-O0-NEXT: s_mov_b32 s50, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill @@ -583,41 +563,41 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 -; GFX9-O0-NEXT: s_mov_b32 s38, s6 +; GFX9-O0-NEXT: s_mov_b32 s40, s6 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s39, s7 -; GFX9-O0-NEXT: s_mov_b32 s35, s39 -; GFX9-O0-NEXT: s_mov_b32 s44, s38 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 +; GFX9-O0-NEXT: s_mov_b32 s41, s7 +; GFX9-O0-NEXT: s_mov_b32 s35, s41 +; GFX9-O0-NEXT: s_mov_b32 s42, s40 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 -; GFX9-O0-NEXT: s_mov_b32 s45, s37 -; GFX9-O0-NEXT: s_mov_b32 s40, s36 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43 -; GFX9-O0-NEXT: s_mov_b32 s41, s45 -; GFX9-O0-NEXT: s_mov_b32 s42, s44 -; GFX9-O0-NEXT: s_mov_b32 s43, s35 -; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s42, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s43, 3 +; GFX9-O0-NEXT: s_mov_b32 s43, s37 +; GFX9-O0-NEXT: s_mov_b32 s44, s36 +; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45_sgpr46_sgpr47 +; GFX9-O0-NEXT: s_mov_b32 s45, s43 +; GFX9-O0-NEXT: s_mov_b32 s46, s42 +; GFX9-O0-NEXT: s_mov_b32 s47, s35 +; GFX9-O0-NEXT: v_writelane_b32 v0, s44, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s45, 3 +; GFX9-O0-NEXT: v_writelane_b32 v0, s46, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s47, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, s36 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: s_mov_b64 exec, s[38:39] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 @@ -634,20 +614,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 4 -; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 5 -; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 0 -; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 1 -; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2 -; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 3 +; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 0 +; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 1 +; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2 +; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3 +; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4 +; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 @@ -679,14 +659,14 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 -; GFX9-O0-NEXT: s_mov_b32 s33, s48 +; GFX9-O0-NEXT: s_mov_b32 s33, s50 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-O3-LABEL: strict_wwm_call_i64: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O3-NEXT: s_mov_b32 s40, s33 +; GFX9-O3-NEXT: s_mov_b32 s38, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill @@ -702,28 +682,26 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: s_getpc_b64 s[36:37] -; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 
s[36:37], s[36:37], 0x0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-O3-NEXT: s_getpc_b64 s[34:35] +; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[38:39], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc -; GFX9-O3-NEXT: s_mov_b64 exec, s[38:39] +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4 @@ -739,7 +717,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-O3-NEXT: s_mov_b32 s33, s40 +; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0) @@ -778,16 +756,18 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s40, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 -; GFX9-O0-NEXT: s_mov_b32 s41, s35 +; GFX9-O0-NEXT: s_mov_b32 s42, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 +; GFX9-O0-NEXT: s_mov_b32 s43, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 @@ -796,21 +776,25 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, 
v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 @@ -851,28 +835,30 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16 -; GFX9-O3-NEXT: s_mov_b32 s34, -1 -; GFX9-O3-NEXT: s_brev_b32 s35, -2 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: s_mov_b32 s36, -1 +; GFX9-O3-NEXT: s_brev_b32 s37, -2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, s36 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s37 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s35 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, s36 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, s37 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s34 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s35 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, s36 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s37 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s34 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s35 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 @@ -922,21 +908,9 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: 
buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -987,130 +961,110 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v42, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v46, s25 -; GFX9-O0-NEXT: v_mov_b32_e32 v45, s26 -; GFX9-O0-NEXT: v_mov_b32_e32 v44, s27 -; GFX9-O0-NEXT: v_mov_b32_e32 v43, s28 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29 +; GFX9-O0-NEXT: v_mov_b32_e32 v39, s17 +; GFX9-O0-NEXT: v_mov_b32_e32 v38, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v37, s19 +; GFX9-O0-NEXT: v_mov_b32_e32 v36, s20 +; GFX9-O0-NEXT: v_mov_b32_e32 v35, s21 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v46, s23 +; GFX9-O0-NEXT: v_mov_b32_e32 v45, s24 +; GFX9-O0-NEXT: v_mov_b32_e32 v44, s25 +; GFX9-O0-NEXT: v_mov_b32_e32 v43, s26 +; GFX9-O0-NEXT: v_mov_b32_e32 v42, s27 +; GFX9-O0-NEXT: v_mov_b32_e32 v41, s28 +; GFX9-O0-NEXT: v_mov_b32_e32 v40, s29 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 
offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v46 -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v45 -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v44 -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v39 +; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v38 +; 
GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v37 +; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v36 +; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v46 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v45 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v44 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v42 +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v41 +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v40 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v39 ; GFX9-O0-NEXT: s_waitcnt vmcnt(4) -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v26, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v27, v46 -; GFX9-O0-NEXT: v_mov_b32_e32 v28, v45 -; GFX9-O0-NEXT: v_mov_b32_e32 v29, v44 -; GFX9-O0-NEXT: v_mov_b32_e32 v30, v43 -; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr42 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v27, v38 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v37 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_mov_b32_e32 v29, v36 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v30, v35 +; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr34 killed $exec ; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -1150,62 +1104,82 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GFX9-O0-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v32, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v33, v11 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: 
v_mov_b32_e32 v32, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v34, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v35, v9 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v34, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v35, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v36, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v37, v7 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v36, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v37, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v38, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v39, v5 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v38, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v39, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v40, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v41, v3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v40, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v41, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v33 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v33 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v9 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v33 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v35 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:8 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v37 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20 -; GFX9-O0-NEXT: 
v_mov_b32_e32 v0, v36 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v39 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v41 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 @@ -1245,16 +1219,8 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -1265,73 +1231,56 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded 
Spill -; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; GFX9-O3-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:4 ; GFX9-O3-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:8 ; GFX9-O3-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12 ; GFX9-O3-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:16 ; GFX9-O3-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; GFX9-O3-NEXT: v_mov_b32_e32 v32, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v33, v2 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v32, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v33, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v34, v3 -; GFX9-O3-NEXT: v_mov_b32_e32 v35, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v34, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v35, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v36, v5 -; GFX9-O3-NEXT: v_mov_b32_e32 v37, v6 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v36, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v37, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v38, v7 -; GFX9-O3-NEXT: v_mov_b32_e32 v39, v8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v38, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v39, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v40, v9 -; GFX9-O3-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v40, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:4 -; GFX9-O3-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen -; GFX9-O3-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:12 -; GFX9-O3-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:8 -; GFX9-O3-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:20 -; GFX9-O3-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:16 -; GFX9-O3-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:28 -; GFX9-O3-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:24 -; GFX9-O3-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:36 -; GFX9-O3-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:32 -; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v2, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v33 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v4, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v33 +; GFX9-O3-NEXT: 
s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v5, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, v33 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v8, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v7, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v8, v33 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v10, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v9, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v10, v33 +; GFX9-O3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-O3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-O3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; GFX9-O3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; GFX9-O3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX9-O3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-O3-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 +; GFX9-O3-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; GFX9-O3-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 +; GFX9-O3-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s6 @@ -1359,24 +1308,21 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O3-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-O3-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] - %a2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %a, i64 0) - %b2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %b, i64 0) - %c2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %c, i64 0) - %d2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %d, i64 0) - %e2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %e, i64 0) + %a2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %a, i64 0) + %a2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %a2.i) + %b2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %b, i64 0) + %b2 = 
call i64 @llvm.amdgcn.strict.wwm.i64(i64 %b2.i) + %c2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %c, i64 0) + %c2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %c2.i) + %d2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %d, i64 0) + %d2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %d2.i) + %e2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %e, i64 0) + %e2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %e2.i) store i64 %a2, ptr addrspace(5) %ptr %ptr_b = getelementptr i64, ptr addrspace(5) %ptr, i32 1 store i64 %b2, ptr addrspace(5) %ptr_b diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index a74dbe1de0d3..7f0db3e362b3 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -26,15 +26,15 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -63,17 +63,10 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5] +; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 @@ -154,11 +147,11 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -185,12 +178,8 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; 
GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] +; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] @@ -236,34 +225,25 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB1_2: ; %merge +; GFX9-O3-NEXT: ; %bb.2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -361,35 +341,35 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s3, s7 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s9 -; GFX9-O0-NEXT: s_mov_b32 s16, s8 -; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s7 -; GFX9-O0-NEXT: s_mov_b32 s18, s6 -; GFX9-O0-NEXT: s_mov_b32 s19, s3 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: 
v_writelane_b32 v0, s7, 5 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_mov_b32 s3, s9 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s9, s17 +; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 +; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 +; GFX9-O0-NEXT: s_mov_b32 s17, s9 +; GFX9-O0-NEXT: s_mov_b32 s18, s8 +; GFX9-O0-NEXT: s_mov_b32 s19, s3 +; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 ; GFX9-O0-NEXT: s_mov_b32 s3, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 8 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -418,13 +398,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6 +; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7 +; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9 +; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4 +; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -454,12 +434,12 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 @@ -613,35 +593,35 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c ; 
GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: s_mov_b32 s8, s17 +; GFX9-O0-NEXT: s_mov_b32 s6, s19 +; GFX9-O0-NEXT: s_mov_b32 s7, s18 +; GFX9-O0-NEXT: s_mov_b32 s15, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s8 +; GFX9-O0-NEXT: s_mov_b32 s17, s15 ; GFX9-O0-NEXT: s_mov_b32 s18, s7 ; GFX9-O0-NEXT: s_mov_b32 s19, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9 +; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -678,12 +658,12 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6 +; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7 +; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8 +; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -721,14 +701,14 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-O3-NEXT: s_not_b64 
exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: s_getpc_b64 s[2:3] @@ -792,16 +772,18 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s6, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s5 +; GFX9-O0-NEXT: s_mov_b32 s8, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 @@ -810,21 +792,25 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 @@ -848,28 +834,30 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 -; GFX9-O3-NEXT: s_mov_b32 s4, -1 -; GFX9-O3-NEXT: s_brev_b32 s5, -2 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_mov_b32 s6, -1 +; GFX9-O3-NEXT: s_brev_b32 s7, -2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; 
GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 @@ -927,15 +915,15 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -964,17 +952,10 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5] +; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 @@ -1055,11 +1036,11 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], 
-1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -1086,12 +1067,8 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] +; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] @@ -1137,34 +1114,25 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB8_2: ; %merge +; GFX9-O3-NEXT: ; %bb.2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1262,35 +1230,35 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s3, s7 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s9 -; GFX9-O0-NEXT: s_mov_b32 s16, s8 -; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: 
s_mov_b32 s17, s7 -; GFX9-O0-NEXT: s_mov_b32 s18, s6 -; GFX9-O0-NEXT: s_mov_b32 s19, s3 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_mov_b32 s3, s9 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s9, s17 +; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 +; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 +; GFX9-O0-NEXT: s_mov_b32 s17, s9 +; GFX9-O0-NEXT: s_mov_b32 s18, s8 +; GFX9-O0-NEXT: s_mov_b32 s19, s3 +; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 ; GFX9-O0-NEXT: s_mov_b32 s3, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 8 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1319,13 +1287,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6 +; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7 +; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9 +; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4 +; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1355,12 +1323,12 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 
v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 @@ -1514,35 +1482,35 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: s_mov_b32 s8, s17 +; GFX9-O0-NEXT: s_mov_b32 s6, s19 +; GFX9-O0-NEXT: s_mov_b32 s7, s18 +; GFX9-O0-NEXT: s_mov_b32 s15, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s8 +; GFX9-O0-NEXT: s_mov_b32 s17, s15 ; GFX9-O0-NEXT: s_mov_b32 s18, s7 ; GFX9-O0-NEXT: s_mov_b32 s19, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9 +; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1579,12 +1547,12 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6 +; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7 +; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8 +; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1622,14 +1590,14 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] 
; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: s_getpc_b64 s[2:3] @@ -1693,16 +1661,18 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s6, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s5 +; GFX9-O0-NEXT: s_mov_b32 s8, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 @@ -1711,21 +1681,25 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 @@ -1749,28 +1723,30 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 
offen offset:16 -; GFX9-O3-NEXT: s_mov_b32 s4, -1 -; GFX9-O3-NEXT: s_brev_b32 s5, -2 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_mov_b32 s6, -1 +; GFX9-O3-NEXT: s_brev_b32 s7, -2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index b3ed7376a1ed..f73489b7db77 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -273,12 +273,15 @@ define amdgpu_cs void @wwm_reserved_regs(ptr addrspace(1) %ptr, <4 x i32> inreg %ld1 = load volatile i32, ptr addrspace(1) %ptr %inactive0 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %ld1, i32 0) %inactive1 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %ld0, i32 0) - store volatile i32 %inactive0, ptr addrspace(1) %ptr - store volatile i32 %inactive1, ptr addrspace(1) %ptr + %wwm0 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %inactive0) + %wwm1 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %inactive1) + store volatile i32 %wwm0, ptr addrspace(1) %ptr + store volatile i32 %wwm1, ptr addrspace(1) %ptr ret void } declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #6 +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #6 attributes #0 = { "no-signed-zeros-fp-math" = "true" } attributes #1 = { "amdgpu-dx10-clamp" = "false" }
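
For reference, the pattern exercised by the updated wwm_reserved_regs test (and the new llvm.amdgcn.strict.wwm.i32 declaration) can be reduced to a minimal standalone IR sketch. This is illustrative only and not part of the patch; the function name @set_inactive_wwm_example and the value names are placeholders. Each llvm.amdgcn.set.inactive result is fed through llvm.amdgcn.strict.wwm before it is stored, which keeps the set.inactive inside a recognized WWM region and permits the v_cndmask_b32 lowering seen in the GFX9-O3 checks above, rather than the removed s_not_b64 exec / v_mov sequences.

define amdgpu_cs void @set_inactive_wwm_example(ptr addrspace(1) %ptr) {
  ; Per-lane input value; inactive lanes will observe the constant 0 instead.
  %val = load volatile i32, ptr addrspace(1) %ptr
  ; Copy %val in active lanes, 0 in inactive lanes.
  %inactive = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %val, i32 0)
  ; Consume the result in whole-wave mode, as the updated test now does.
  %wwm = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %inactive)
  store volatile i32 %wwm, ptr addrspace(1) %ptr
  ret void
}

declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)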