[AArch64][SME2] Improve register allocation of multi-vector SME intrinsics (#116399)

The FORM_TRANSPOSED_REG_TUPLE pseudos have been created to
improve register allocation for intrinsics which use strided and
contiguous multi-vector registers, avoiding unnecessary copies.

If the operands of the pseudo are copies where the source register is in
the StridedOrContiguous class, the pseudo is used by getRegAllocationHints
to suggest a contiguous multi-vector register which matches the subregister
sequence used by the operands.
If the operands do not match this pattern, the pseudos are expanded
to a REG_SEQUENCE.
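
In short, the post-isel hook keeps the pseudo only when shouldUseFormStridedPseudo()
holds, i.e. when every operand is a copy from the same subregister index of a
strided/contiguous tuple; otherwise the pseudo is rewritten in place to an ordinary
REG_SEQUENCE. A condensed sketch of that decision, taken from the hook added in this
patch (surrounding setup omitted):

  if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
      MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
    if (shouldUseFormStridedPseudo(MI))
      return; // Keep the pseudo; getRegAllocationHints() will suggest a tuple.
    // Otherwise rebuild the operands as a plain REG_SEQUENCE.
    MachineInstrBuilder MIB =
        BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                TII->get(TargetOpcode::REG_SEQUENCE), MI.getOperand(0).getReg());
    for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
      MIB.add(MI.getOperand(I));
      MIB.addImm(AArch64::zsub0 + (I - 1));
    }
    MI.eraseFromParent();
  }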

Patch contains changes by Matthew Devereau.
Kerry McLaughlin
2024-12-12 10:00:24 +00:00
committed by GitHub
parent 98470c0b2e
commit 5ca26d769d
7 changed files with 805 additions and 161 deletions


@@ -67,6 +67,10 @@ private:
TargetRegisterClass ContiguousClass,
TargetRegisterClass StridedClass,
unsigned ContiguousOpc, unsigned StridedOpc);
bool expandFormTuplePseudo(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI,
unsigned Size);
bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned BitSize);
@@ -1142,6 +1146,32 @@ bool AArch64ExpandPseudo::expandMultiVecPseudo(
return true;
}
bool AArch64ExpandPseudo::expandFormTuplePseudo(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI, unsigned Size) {
assert((Size == 2 || Size == 4) && "Invalid Tuple Size");
MachineInstr &MI = *MBBI;
Register ReturnTuple = MI.getOperand(0).getReg();
const TargetRegisterInfo *TRI =
MBB.getParent()->getSubtarget().getRegisterInfo();
for (unsigned I = 0; I < Size; ++I) {
Register FormTupleOpReg = MI.getOperand(I + 1).getReg();
Register ReturnTupleSubReg =
TRI->getSubReg(ReturnTuple, AArch64::zsub0 + I);
// Add copies to ensure the subregisters remain in the correct order
// for any contiguous operation they are used by.
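// ORR_ZZZ with both sources equal is the expansion of the 'mov zd.d, zn.d'
// alias, so each operand that did not land in the expected subregister costs
// one vector MOV.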
if (FormTupleOpReg != ReturnTupleSubReg)
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORR_ZZZ))
.addReg(ReturnTupleSubReg, RegState::Define)
.addReg(FormTupleOpReg)
.addReg(FormTupleOpReg);
}
MI.eraseFromParent();
return true;
}
/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -1724,6 +1754,10 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return expandMultiVecPseudo(
MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
AArch64::LDNT1D_4Z, AArch64::LDNT1D_4Z_STRIDED);
case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 2);
case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 4);
}
return false;
}


@@ -8581,6 +8581,56 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
return ZExtBool;
}
// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
// input operands are copy nodes where the source register is in a
// StridedOrContiguous class. For example:
//
// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
//
bool shouldUseFormStridedPseudo(MachineInstr &MI) {
MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
const TargetRegisterClass *RegClass = nullptr;
switch (MI.getOpcode()) {
case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
RegClass = &AArch64::ZPR2StridedOrContiguousRegClass;
break;
case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
RegClass = &AArch64::ZPR4StridedOrContiguousRegClass;
break;
default:
llvm_unreachable("Unexpected opcode.");
}
MCRegister SubReg = MCRegister::NoRegister;
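// Every operand must be a copy from the same subregister index of a single
// def in RegClass; any mismatch falls back to the REG_SEQUENCE lowering.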
for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
MachineOperand &MO = MI.getOperand(I);
assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");
MachineOperand *Def = MRI.getOneDef(MO.getReg());
if (!Def || !Def->getParent()->isCopy())
return false;
const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
unsigned OpSubReg = CopySrc.getSubReg();
if (SubReg == MCRegister::NoRegister)
SubReg = OpSubReg;
MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
MRI.getRegClass(CopySrcOp->getReg()) != RegClass)
return false;
}
return true;
}
void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
// Live-in physreg copies that are glued to SMSTART are applied as
@@ -8606,6 +8656,27 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
}
}
if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
// If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
// from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
if (shouldUseFormStridedPseudo(MI))
return;
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
TII->get(TargetOpcode::REG_SEQUENCE),
MI.getOperand(0).getReg());
for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
MIB.add(MI.getOperand(I));
MIB.addImm(AArch64::zsub0 + (I - 1));
}
MI.eraseFromParent();
return;
}
// Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
// have nothing to do with VG, were it not that they are used to materialise a
// frame-address. If they contain a frame-index to a scalable vector, this


@@ -1081,6 +1081,58 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
}
// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation
// where a consecutive multi-vector tuple is constructed from the same indices
// of multiple strided loads. This may still result in unnecessary copies
// between the loads and the tuple. Here we try to return a hint to assign the
// contiguous ZPRMulReg starting at the same register as the first operand of
// the pseudo, which should be a subregister of the first strided load.
//
// For example, if the first strided load has been assigned $z16_z20_z24_z28
// and the operands of the pseudo are each accessing subregister zsub2, we
// should look through Order to find a contiguous register which
// begins with $z24 (i.e. $z24_z25_z26_z27).
//
bool AArch64RegisterInfo::getRegAllocationHints(
Register VirtReg, ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
if (MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO &&
MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO)
return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
MF, VRM);
unsigned FirstOpSubReg = MI.getOperand(1).getSubReg();
switch (FirstOpSubReg) {
case AArch64::zsub0:
case AArch64::zsub1:
case AArch64::zsub2:
case AArch64::zsub3:
break;
default:
continue;
}
// Look up the physical register mapped to the first operand of the pseudo.
Register FirstOpVirtReg = MI.getOperand(1).getReg();
if (!VRM->hasPhys(FirstOpVirtReg))
continue;
MCRegister TupleStartReg =
getSubReg(VRM->getPhys(FirstOpVirtReg), FirstOpSubReg);
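  // Scan the allocation order for a tuple register whose zsub0 subregister is
  // the physreg assigned to the first operand's lane, and hint it to the
  // allocator.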
for (unsigned I = 0; I < Order.size(); ++I)
if (MCRegister R = getSubReg(Order[I], AArch64::zsub0))
if (R == TupleStartReg)
Hints.push_back(Order[I]);
}
return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
VRM);
}
unsigned AArch64RegisterInfo::getLocalAddressRegister(
const MachineFunction &MF) const {
const auto &MFI = MF.getFrameInfo();


@@ -134,6 +134,11 @@ public:
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints,
const MachineFunction &MF, const VirtRegMap *VRM,
const LiveRegMatrix *Matrix) const override;
unsigned getLocalAddressRegister(const MachineFunction &MF) const;
bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const;


@@ -35,6 +35,30 @@ def tileslicerange0s4 : ComplexPattern<i32, 2, "SelectSMETileSlice<0, 4>", []>;
let WantsRoot = true in
def am_sme_indexed_b4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0, 15>">;
// The FORM_TRANSPOSED_REG_TUPLE pseudos defined below are intended to
// improve register allocation for intrinsics which use strided and contiguous
// multi-vector registers, avoiding unnecessary copies.
// If the operands of the pseudo are copies where the source register is in
// the StridedOrContiguous class, the pseudo is used to provide a hint to the
// register allocator suggesting a contiguous multi-vector register which
// matches the subregister sequence used by the operands.
// If the operands do not match this pattern, the pseudos are expanded
// to a REG_SEQUENCE using the post-isel hook.
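// The ZPR2Mul2/ZPR4Mul4 result classes only contain tuples starting at a
// multiple of two or four Z registers, which is what the consecutive
// multi-vector forms of the SME2 instructions expect.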
def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO :
Pseudo<(outs ZPR2Mul2:$tup),
(ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]> {
let hasSideEffects = 0;
let hasPostISelHook = 1;
}
def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO :
Pseudo<(outs ZPR4Mul4:$tup),
(ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]> {
let hasSideEffects = 0;
let hasPostISelHook = 1;
}
def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>;
def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore,
[SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>;
@@ -173,14 +197,14 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
(REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>;
(FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1, vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>;
class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
(REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
(FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
zpr_ty:$Zm, imm_ty:$i)>;
class SME2_Sat_Shift_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty>


@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
target triple="aarch64-linux-gnu"
@@ -26,18 +26,18 @@ define void @udot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: udot_multi_za32_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
@@ -68,18 +68,18 @@ define void @udot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
define void @udot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: udot_multi_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
<vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
@@ -110,18 +110,18 @@ define void @udot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: udot_multi_za64_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
@@ -152,18 +152,18 @@ define void @usdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @usdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: usdot_multi_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
<vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
@@ -197,18 +197,18 @@ define void @sdot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @sdot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: sdot_multi_za32_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
@@ -239,18 +239,18 @@ define void @sdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
define void @sdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: sdot_multi_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
<vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
@@ -281,18 +281,18 @@ define void @sdot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: sdot_multi_za64_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
@@ -309,9 +309,7 @@ define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
; CHECK-LABEL: udot_single_za32_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -324,11 +322,7 @@ define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: udot_single_za32_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -341,9 +335,7 @@ define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: udot_single_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
@@ -356,11 +348,7 @@ define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: udot_single_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
@@ -373,9 +361,7 @@ define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: udot_single_za64_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -388,11 +374,7 @@ define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
; CHECK-LABEL: udot_single_za64_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -405,9 +387,7 @@ define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: usdot_single_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
@@ -420,11 +400,7 @@ define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: usdot_single_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
@@ -440,9 +416,7 @@ define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
; CHECK-LABEL: sdot_single_za32_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -455,11 +429,7 @@ define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: sdot_single_za32_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -472,9 +442,7 @@ define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: sdot_single_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
@@ -487,11 +455,7 @@ define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: sdot_single_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
@@ -504,9 +468,7 @@ define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: sdot_single_za64_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -519,11 +481,7 @@ define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
; CHECK-LABEL: sdot_single_za64_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -536,9 +494,7 @@ define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: sudot_single_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
@@ -551,11 +507,7 @@ define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @sudot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: sudot_single_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: sudot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
@@ -571,8 +523,8 @@ define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: udot_lane_za32_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: ret
@@ -585,11 +537,7 @@ define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
define void @udot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: udot_lane_za32_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[3]
; CHECK-NEXT: ret
@@ -605,8 +553,8 @@ define void @udot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vs
; CHECK-LABEL: udot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
@@ -620,8 +568,8 @@ define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
; CHECK-LABEL: udot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
@@ -635,12 +583,86 @@ define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
ret void
}
define void @udot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: udot_form_2x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
ret void
}
define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: udot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
ret void
}
define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: udot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: ret
@@ -654,8 +676,8 @@ define void @udot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: udot_lane_za64_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
@@ -673,8 +695,8 @@ define void @usdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: usdot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
@@ -688,8 +710,8 @@ define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: usdot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
@@ -703,6 +725,79 @@ define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
ret void
}
define void @usdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: usdot_form_2x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
ret void
}
define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: usdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
ret void
}
; == Multi, indexed (signed) ==
@@ -710,8 +805,8 @@ define void @sdot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sdot_lane_za32_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: ret
@@ -725,8 +820,8 @@ define void @sdot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sdot_lane_za32_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
@@ -744,8 +839,8 @@ define void @sdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vs
; CHECK-LABEL: sdot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
@@ -759,8 +854,8 @@ define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
; CHECK-LABEL: sdot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
@@ -774,12 +869,86 @@ define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
ret void
}
define void @sdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: sdot_form_2x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
ret void
}
define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: sdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
ret void
}
define void @sdot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: sdot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: ret
@@ -793,8 +962,8 @@ define void @sdot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sdot_lane_za64_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
@@ -814,8 +983,8 @@ define void @sudot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sudot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
@@ -829,8 +998,8 @@ define void @sudot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sudot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
@@ -844,11 +1013,84 @@ define void @sudot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
ret void
}
define void @sudot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: sudot_form_2x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
ret void
}
define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: sudot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
ret void
}
attributes #0 = { nounwind "target-features"="+sme2" }
attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
; == Multi, multi (unsigned)
declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)

@@ -1,15 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -force-streaming -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s
; == FVDOT ==
define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) {
; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: fvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
@@ -25,9 +22,7 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half>
define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: bfvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: bfvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
@@ -43,9 +38,7 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, <vscale x 8 x bfloa
define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_svdot_lane_za32_vg1x2_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: svdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
@@ -58,11 +51,7 @@ define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %
define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_svdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: svdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
@@ -75,11 +64,7 @@ define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %
define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_svdot_lane_za64_vg1x4_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: svdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: svdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: ret
@@ -89,15 +74,87 @@ define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %
ret void
}
define void @svdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: svdot_form_2x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: add x9, x0, x1
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
%3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
%6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
ret void
}
define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: svdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
ret void
}
; == UVDOT ==
define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_uvdot_lane_za32_vg1x2_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: uvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
@@ -110,11 +167,7 @@ define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %
define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_uvdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: uvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
@@ -127,11 +180,7 @@ define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %
define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_uvdot_lane_za64_vg1x4_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: uvdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: uvdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: ret
@@ -141,17 +190,87 @@ define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %
ret void
}
define void @uvdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: uvdot_form_2x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: add x9, x0, x1
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
%3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
%6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
ret void
}
define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: uvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
ret void
}
; == SUVDOT ==
define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_suvdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: suvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
@@ -161,17 +280,62 @@ define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8>
ret void
}
define void @suvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: suvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
ret void
}
; == USVDOT ==
define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_usvdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: usvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
@@ -181,6 +345,58 @@ define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8>
ret void
}
define void @usvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: usvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
ret void
}
attributes #0 = { nounwind "target-features"="+sme2" }
attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
; == FVDOT ==
declare void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)