[AArch64][SME2] Improve register allocation of multi-vector SME intrinsics (#116399)
The FORM_TRANSPOSED_REG_TUPLE pseudos have been created to improve register allocation for intrinsics which use strided and contiguous multi-vector registers, avoiding unnecessary copies. If the operands of the pseudo are copies where the source register is in the StridedOrContiguous class, the pseudo is used by getRegAllocationHints to suggest a contiguous multi-vector register which matches the subregister sequence used by the operands. If the operands do not match this pattern, the pseudos are expanded to a REG_SEQUENCE. This patch contains changes by Matthew Devereau.
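As an illustration, the udot_form_2x_tuple test added by this patch (condensed below) builds each multi-vector operand of the udot intrinsic from the same element index of two strided loads; with the new pseudo the matching strided registers are hinted to the allocator, so no extra mov copies are needed:

  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  ; first strided load: a pair of vectors in a ZPR2StridedOrContiguous tuple
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  ; second strided load from ptr + stride
  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
  ; each udot consumes the same element index of both loads
  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)

With this patch the test expects the strided loads to feed the udot instructions directly, e.g.:

  ld1b { z16.b, z24.b }, pn8/z, [x0]
  ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
  udot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
  udot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]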
@@ -67,6 +67,10 @@ private:
                             TargetRegisterClass ContiguousClass,
                             TargetRegisterClass StridedClass,
                             unsigned ContiguousOpc, unsigned StridedOpc);
  bool expandFormTuplePseudo(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MBBI,
                             MachineBasicBlock::iterator &NextMBBI,
                             unsigned Size);
  bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                    unsigned BitSize);

@@ -1142,6 +1146,32 @@ bool AArch64ExpandPseudo::expandMultiVecPseudo(
  return true;
}

bool AArch64ExpandPseudo::expandFormTuplePseudo(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    MachineBasicBlock::iterator &NextMBBI, unsigned Size) {
  assert((Size == 2 || Size == 4) && "Invalid Tuple Size");
  MachineInstr &MI = *MBBI;
  Register ReturnTuple = MI.getOperand(0).getReg();

  const TargetRegisterInfo *TRI =
      MBB.getParent()->getSubtarget().getRegisterInfo();
  for (unsigned I = 0; I < Size; ++I) {
    Register FormTupleOpReg = MI.getOperand(I + 1).getReg();
    Register ReturnTupleSubReg =
        TRI->getSubReg(ReturnTuple, AArch64::zsub0 + I);
    // Add copies to ensure the subregisters remain in the correct order
    // for any contiguous operation they are used by.
    if (FormTupleOpReg != ReturnTupleSubReg)
      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORR_ZZZ))
          .addReg(ReturnTupleSubReg, RegState::Define)
          .addReg(FormTupleOpReg)
          .addReg(FormTupleOpReg);
  }

  MI.eraseFromParent();
  return true;
}

/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -1724,6 +1754,10 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
    return expandMultiVecPseudo(
        MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
        AArch64::LDNT1D_4Z, AArch64::LDNT1D_4Z_STRIDED);
  case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
    return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 2);
  case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
    return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 4);
  }
  return false;
}

@@ -8581,6 +8581,56 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
  return ZExtBool;
}

// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
// input operands are copy nodes where the source register is in a
// StridedOrContiguous class. For example:
//
//   %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
//   %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
//   %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
//   %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
//   %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
//   %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
//   %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
//
bool shouldUseFormStridedPseudo(MachineInstr &MI) {
  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();

  const TargetRegisterClass *RegClass = nullptr;
  switch (MI.getOpcode()) {
  case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
    RegClass = &AArch64::ZPR2StridedOrContiguousRegClass;
    break;
  case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
    RegClass = &AArch64::ZPR4StridedOrContiguousRegClass;
    break;
  default:
    llvm_unreachable("Unexpected opcode.");
  }

  MCRegister SubReg = MCRegister::NoRegister;
  for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
    MachineOperand &MO = MI.getOperand(I);
    assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");

    MachineOperand *Def = MRI.getOneDef(MO.getReg());
    if (!Def || !Def->getParent()->isCopy())
      return false;

    const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
    unsigned OpSubReg = CopySrc.getSubReg();
    if (SubReg == MCRegister::NoRegister)
      SubReg = OpSubReg;

    MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
    if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
        MRI.getRegClass(CopySrcOp->getReg()) != RegClass)
      return false;
  }

  return true;
}

void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                          SDNode *Node) const {
  // Live-in physreg copies that are glued to SMSTART are applied as
@@ -8606,6 +8656,27 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
    }
  }

  if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
      MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
    // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
    // from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
    if (shouldUseFormStridedPseudo(MI))
      return;

    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                                      TII->get(TargetOpcode::REG_SEQUENCE),
                                      MI.getOperand(0).getReg());

    for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
      MIB.add(MI.getOperand(I));
      MIB.addImm(AArch64::zsub0 + (I - 1));
    }

    MI.eraseFromParent();
    return;
  }

  // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
  // have nothing to do with VG, were it not that they are used to materialise a
  // frame-address. If they contain a frame-index to a scalable vector, this

@@ -1081,6 +1081,58 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
  }
}

// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation
// where a consecutive multi-vector tuple is constructed from the same indices
// of multiple strided loads. This may still result in unnecessary copies
// between the loads and the tuple. Here we try to return a hint to assign the
// contiguous ZPRMulReg starting at the same register as the first operand of
// the pseudo, which should be a subregister of the first strided load.
//
// For example, if the first strided load has been assigned $z16_z20_z24_z28
// and the operands of the pseudo are each accessing subregister zsub2, we
// should look through Order to find a contiguous register which
// begins with $z24 (i.e. $z24_z25_z26_z27).
//
bool AArch64RegisterInfo::getRegAllocationHints(
    Register VirtReg, ArrayRef<MCPhysReg> Order,
    SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
    const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
    if (MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO &&
        MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO)
      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
                                                       MF, VRM);

    unsigned FirstOpSubReg = MI.getOperand(1).getSubReg();
    switch (FirstOpSubReg) {
    case AArch64::zsub0:
    case AArch64::zsub1:
    case AArch64::zsub2:
    case AArch64::zsub3:
      break;
    default:
      continue;
    }

    // Look up the physical register mapped to the first operand of the pseudo.
    Register FirstOpVirtReg = MI.getOperand(1).getReg();
    if (!VRM->hasPhys(FirstOpVirtReg))
      continue;

    MCRegister TupleStartReg =
        getSubReg(VRM->getPhys(FirstOpVirtReg), FirstOpSubReg);
    for (unsigned I = 0; I < Order.size(); ++I)
      if (MCRegister R = getSubReg(Order[I], AArch64::zsub0))
        if (R == TupleStartReg)
          Hints.push_back(Order[I]);
  }

  return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                   VRM);
}

unsigned AArch64RegisterInfo::getLocalAddressRegister(
    const MachineFunction &MF) const {
  const auto &MFI = MF.getFrameInfo();

@@ -134,6 +134,11 @@ public:
  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
                               MachineFunction &MF) const override;

  bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
                             SmallVectorImpl<MCPhysReg> &Hints,
                             const MachineFunction &MF, const VirtRegMap *VRM,
                             const LiveRegMatrix *Matrix) const override;

  unsigned getLocalAddressRegister(const MachineFunction &MF) const;
  bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const;

@@ -35,6 +35,30 @@ def tileslicerange0s4 : ComplexPattern<i32, 2, "SelectSMETileSlice<0, 4>", []>;
let WantsRoot = true in
def am_sme_indexed_b4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0, 15>">;

// The FORM_TRANSPOSED_REG_TUPLE pseudos defined below are intended to
// improve register allocation for intrinsics which use strided and contiguous
// multi-vector registers, avoiding unnecessary copies.
// If the operands of the pseudo are copies where the source register is in
// the StridedOrContiguous class, the pseudo is used to provide a hint to the
// register allocator suggesting a contiguous multi-vector register which
// matches the subregister sequence used by the operands.
// If the operands do not match this pattern, the pseudos are expanded
// to a REG_SEQUENCE using the post-isel hook.

def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO :
  Pseudo<(outs ZPR2Mul2:$tup),
         (ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]> {
  let hasSideEffects = 0;
  let hasPostISelHook = 1;
}

def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO :
  Pseudo<(outs ZPR4Mul4:$tup),
         (ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]> {
  let hasSideEffects = 0;
  let hasPostISelHook = 1;
}

def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>;
def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore,
                           [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>;
@@ -173,14 +197,14 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic
                                        Operand imm_ty, ComplexPattern tileslice>
  : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
        (!cast<Instruction>(name # _PSEUDO) $base, $offset,
-       (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>;
+       (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1, vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>;

class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
                                        Operand imm_ty, ComplexPattern tileslice>
  : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
                   vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
        (!cast<Instruction>(name # _PSEUDO) $base, $offset,
-       (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
+       (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
        zpr_ty:$Zm, imm_ty:$i)>;

class SME2_Sat_Shift_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty>

@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s

 target triple = "aarch64-linux-gnu"

@@ -26,18 +26,18 @@ define void @udot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
|
||||
define void @udot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
|
||||
; CHECK-LABEL: udot_multi_za32_u16_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z31.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: ptrue p0.h
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z25.d, z6.d
|
||||
; CHECK-NEXT: mov z30.d, z3.d
|
||||
; CHECK-NEXT: mov z7.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z24.d, z5.d
|
||||
; CHECK-NEXT: mov z29.d, z2.d
|
||||
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
|
||||
; CHECK-NEXT: mov z28.d, z1.d
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: mov z6.d, z3.d
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: ret
|
||||
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
|
||||
call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
|
||||
@@ -68,18 +68,18 @@ define void @udot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
define void @udot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
|
||||
; CHECK-LABEL: udot_multi_za32_u8_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z31.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: ptrue p0.b
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z25.d, z6.d
|
||||
; CHECK-NEXT: mov z30.d, z3.d
|
||||
; CHECK-NEXT: mov z7.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z24.d, z5.d
|
||||
; CHECK-NEXT: mov z29.d, z2.d
|
||||
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
|
||||
; CHECK-NEXT: mov z28.d, z1.d
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
|
||||
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
|
||||
; CHECK-NEXT: mov z6.d, z3.d
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
|
||||
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
|
||||
; CHECK-NEXT: ret
|
||||
<vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
|
||||
call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
|
||||
@@ -110,18 +110,18 @@ define void @udot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
|
||||
define void @udot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
|
||||
; CHECK-LABEL: udot_multi_za64_u16_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z31.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: ptrue p0.h
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z25.d, z6.d
|
||||
; CHECK-NEXT: mov z30.d, z3.d
|
||||
; CHECK-NEXT: mov z7.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z24.d, z5.d
|
||||
; CHECK-NEXT: mov z29.d, z2.d
|
||||
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
|
||||
; CHECK-NEXT: mov z28.d, z1.d
|
||||
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: mov z6.d, z3.d
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: ret
|
||||
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
|
||||
call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
|
||||
@@ -152,18 +152,18 @@ define void @usdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
|
||||
define void @usdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
|
||||
; CHECK-LABEL: usdot_multi_za32_u8_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z31.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: ptrue p0.b
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z25.d, z6.d
|
||||
; CHECK-NEXT: mov z30.d, z3.d
|
||||
; CHECK-NEXT: mov z7.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z24.d, z5.d
|
||||
; CHECK-NEXT: mov z29.d, z2.d
|
||||
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
|
||||
; CHECK-NEXT: mov z28.d, z1.d
|
||||
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
|
||||
; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
|
||||
; CHECK-NEXT: mov z6.d, z3.d
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
|
||||
; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
|
||||
; CHECK-NEXT: ret
|
||||
<vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
|
||||
call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
|
||||
@@ -197,18 +197,18 @@ define void @sdot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
|
||||
define void @sdot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
|
||||
; CHECK-LABEL: sdot_multi_za32_u16_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z31.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: ptrue p0.h
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z25.d, z6.d
|
||||
; CHECK-NEXT: mov z30.d, z3.d
|
||||
; CHECK-NEXT: mov z7.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z24.d, z5.d
|
||||
; CHECK-NEXT: mov z29.d, z2.d
|
||||
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
|
||||
; CHECK-NEXT: mov z28.d, z1.d
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: mov z6.d, z3.d
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: ret
|
||||
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
|
||||
call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
|
||||
@@ -239,18 +239,18 @@ define void @sdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
define void @sdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
|
||||
; CHECK-LABEL: sdot_multi_za32_u8_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z31.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: ptrue p0.b
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z25.d, z6.d
|
||||
; CHECK-NEXT: mov z30.d, z3.d
|
||||
; CHECK-NEXT: mov z7.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z24.d, z5.d
|
||||
; CHECK-NEXT: mov z29.d, z2.d
|
||||
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
|
||||
; CHECK-NEXT: mov z28.d, z1.d
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
|
||||
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
|
||||
; CHECK-NEXT: mov z6.d, z3.d
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
|
||||
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
|
||||
; CHECK-NEXT: ret
|
||||
<vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
|
||||
call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
|
||||
@@ -281,18 +281,18 @@ define void @sdot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
|
||||
define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
|
||||
; CHECK-LABEL: sdot_multi_za64_u16_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z31.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: ptrue p0.h
|
||||
; CHECK-NEXT: mov z26.d, z7.d
|
||||
; CHECK-NEXT: mov z25.d, z6.d
|
||||
; CHECK-NEXT: mov z30.d, z3.d
|
||||
; CHECK-NEXT: mov z7.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z24.d, z5.d
|
||||
; CHECK-NEXT: mov z29.d, z2.d
|
||||
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
|
||||
; CHECK-NEXT: mov z28.d, z1.d
|
||||
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: mov z6.d, z3.d
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
|
||||
; CHECK-NEXT: ret
|
||||
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
|
||||
call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
|
||||
@@ -309,9 +309,7 @@ define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
|
||||
define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
|
||||
; CHECK-LABEL: udot_single_za32_u16_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
|
||||
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
|
||||
; CHECK-NEXT: ret
|
||||
@@ -324,11 +322,7 @@ define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
|
||||
define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
|
||||
; CHECK-LABEL: udot_single_za32_u16_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
|
||||
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
|
||||
; CHECK-NEXT: ret
|
||||
@@ -341,9 +335,7 @@ define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
|
||||
define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
|
||||
; CHECK-LABEL: udot_single_za32_u8_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
|
||||
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
|
||||
; CHECK-NEXT: ret
|
||||
@@ -356,11 +348,7 @@ define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
|
||||
define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
|
||||
; CHECK-LABEL: udot_single_za32_u8_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
|
||||
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
|
||||
; CHECK-NEXT: ret
|
||||
@@ -373,9 +361,7 @@ define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
|
||||
define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
|
||||
; CHECK-LABEL: udot_single_za64_u16_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
|
||||
; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
|
||||
; CHECK-NEXT: ret
|
||||
@@ -388,11 +374,7 @@ define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
|
||||
define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
|
||||
; CHECK-LABEL: udot_single_za64_u16_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
|
||||
; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
|
||||
; CHECK-NEXT: ret
|
||||
@@ -405,9 +387,7 @@ define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
|
||||
define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
|
||||
; CHECK-LABEL: usdot_single_za32_u8_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
|
||||
; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
|
||||
; CHECK-NEXT: ret
|
||||
@@ -420,11 +400,7 @@ define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
|
||||
define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
|
||||
; CHECK-LABEL: usdot_single_za32_u8_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
|
||||
; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
|
||||
; CHECK-NEXT: ret
|
||||
@@ -440,9 +416,7 @@ define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
|
||||
define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
|
||||
; CHECK-LABEL: sdot_single_za32_u16_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
|
||||
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
|
||||
; CHECK-NEXT: ret
|
||||
@@ -455,11 +429,7 @@ define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
|
||||
define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
|
||||
; CHECK-LABEL: sdot_single_za32_u16_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
|
||||
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
|
||||
; CHECK-NEXT: ret
|
||||
@@ -472,9 +442,7 @@ define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
|
||||
define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
|
||||
; CHECK-LABEL: sdot_single_za32_u8_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
|
||||
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
|
||||
; CHECK-NEXT: ret
|
||||
@@ -487,11 +455,7 @@ define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
|
||||
define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
|
||||
; CHECK-LABEL: sdot_single_za32_u8_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
|
||||
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
|
||||
; CHECK-NEXT: ret
|
||||
@@ -504,9 +468,7 @@ define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
|
||||
define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
|
||||
; CHECK-LABEL: sdot_single_za64_u16_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
|
||||
; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
|
||||
; CHECK-NEXT: ret
|
||||
@@ -519,11 +481,7 @@ define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
|
||||
define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
|
||||
; CHECK-LABEL: sdot_single_za64_u16_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
|
||||
; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
|
||||
; CHECK-NEXT: ret
|
||||
@@ -536,9 +494,7 @@ define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
|
||||
define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
|
||||
; CHECK-LABEL: sudot_single_za32_u8_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
|
||||
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
|
||||
; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
|
||||
; CHECK-NEXT: ret
|
||||
@@ -551,11 +507,7 @@ define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
|
||||
define void @sudot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
|
||||
; CHECK-LABEL: sudot_single_za32_u8_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
|
||||
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
|
||||
; CHECK-NEXT: sudot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
|
||||
; CHECK-NEXT: ret
|
||||
@@ -571,8 +523,8 @@ define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
; CHECK-LABEL: udot_lane_za32_u16_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
|
||||
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -585,11 +537,7 @@ define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
define void @udot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
|
||||
; CHECK-LABEL: udot_lane_za32_u16_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[3]
|
||||
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -605,8 +553,8 @@ define void @udot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vs
|
||||
; CHECK-LABEL: udot_lane_za32_u8_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
|
||||
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -620,8 +568,8 @@ define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
|
||||
; CHECK-LABEL: udot_lane_za32_u8_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z27.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z26.d, z3.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z25.d, z2.d
|
||||
; CHECK-NEXT: mov z24.d, z1.d
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
|
||||
@@ -635,12 +583,86 @@ define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @udot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: udot_form_2x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
|
||||
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
|
||||
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
|
||||
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: udot_form_4x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: lsl x9, x1, #1
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
|
||||
; CHECK-NEXT: add x10, x9, x1
|
||||
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
|
||||
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
|
||||
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
|
||||
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
|
||||
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
|
||||
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
|
||||
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
|
||||
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
|
||||
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
|
||||
%mul3 = shl i64 %stride, 1
|
||||
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
|
||||
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
|
||||
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
|
||||
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
|
||||
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
|
||||
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
|
||||
%mul5 = mul i64 %stride, 3
|
||||
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
|
||||
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
|
||||
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
|
||||
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
|
||||
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
|
||||
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
|
||||
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
|
||||
; CHECK-LABEL: udot_lane_za64_u16_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
|
||||
; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -654,8 +676,8 @@ define void @udot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
; CHECK-LABEL: udot_lane_za64_u16_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z27.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z26.d, z3.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z25.d, z2.d
|
||||
; CHECK-NEXT: mov z24.d, z1.d
|
||||
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
|
||||
@@ -673,8 +695,8 @@ define void @usdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
; CHECK-LABEL: usdot_lane_za32_u8_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
|
||||
; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -688,8 +710,8 @@ define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
; CHECK-LABEL: usdot_lane_za32_u8_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z27.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z26.d, z3.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z25.d, z2.d
|
||||
; CHECK-NEXT: mov z24.d, z1.d
|
||||
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
|
||||
@@ -703,6 +725,79 @@ define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @usdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: usdot_form_2x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
|
||||
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
|
||||
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
|
||||
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
|
||||
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
|
||||
tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: usdot_form_4x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: lsl x9, x1, #1
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
|
||||
; CHECK-NEXT: add x10, x9, x1
|
||||
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
|
||||
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
|
||||
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
|
||||
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
|
||||
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
|
||||
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
|
||||
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
|
||||
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
|
||||
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
|
||||
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
|
||||
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
|
||||
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
|
||||
%mul3 = shl i64 %stride, 1
|
||||
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
|
||||
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
|
||||
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
|
||||
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
|
||||
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
|
||||
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
|
||||
%mul5 = mul i64 %stride, 3
|
||||
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
|
||||
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
|
||||
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
|
||||
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
|
||||
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
|
||||
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
|
||||
tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; == Multi, indexed (signed) ==
|
||||
|
||||
@@ -710,8 +805,8 @@ define void @sdot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
; CHECK-LABEL: sdot_lane_za32_u16_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
|
||||
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -725,8 +820,8 @@ define void @sdot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
; CHECK-LABEL: sdot_lane_za32_u16_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z27.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z26.d, z3.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z25.d, z2.d
|
||||
; CHECK-NEXT: mov z24.d, z1.d
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
|
||||
@@ -744,8 +839,8 @@ define void @sdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vs
|
||||
; CHECK-LABEL: sdot_lane_za32_u8_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
|
||||
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -759,8 +854,8 @@ define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
|
||||
; CHECK-LABEL: sdot_lane_za32_u8_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z27.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z26.d, z3.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z25.d, z2.d
|
||||
; CHECK-NEXT: mov z24.d, z1.d
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
|
||||
@@ -774,12 +869,86 @@ define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @sdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: sdot_form_2x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
|
||||
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
|
||||
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
|
||||
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: sdot_form_4x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: lsl x9, x1, #1
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
|
||||
; CHECK-NEXT: add x10, x9, x1
|
||||
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
|
||||
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
|
||||
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
|
||||
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
|
||||
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
|
||||
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
|
||||
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
|
||||
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
|
||||
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
|
||||
%mul3 = shl i64 %stride, 1
|
||||
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
|
||||
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
|
||||
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
|
||||
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
|
||||
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
|
||||
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
|
||||
%mul5 = mul i64 %stride, 3
|
||||
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
|
||||
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
|
||||
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
|
||||
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
|
||||
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
|
||||
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
|
||||
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @sdot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
|
||||
; CHECK-LABEL: sdot_lane_za64_u16_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
|
||||
; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -793,8 +962,8 @@ define void @sdot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
; CHECK-LABEL: sdot_lane_za64_u16_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z27.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z26.d, z3.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z25.d, z2.d
|
||||
; CHECK-NEXT: mov z24.d, z1.d
|
||||
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
|
||||
@@ -814,8 +983,8 @@ define void @sudot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
; CHECK-LABEL: sudot_lane_za32_u8_vg1x2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z5.d, z2.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z4.d, z1.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
|
||||
; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -829,8 +998,8 @@ define void @sudot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
; CHECK-LABEL: sudot_lane_za32_u8_vg1x4:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov z27.d, z4.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z26.d, z3.d
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: mov z25.d, z2.d
|
||||
; CHECK-NEXT: mov z24.d, z1.d
|
||||
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
|
||||
@@ -844,11 +1013,84 @@ define void @sudot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @sudot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: sudot_form_2x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
|
||||
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
|
||||
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
|
||||
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
|
||||
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
|
||||
tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: sudot_form_4x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: lsl x9, x1, #1
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
|
||||
; CHECK-NEXT: add x10, x9, x1
|
||||
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
|
||||
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
|
||||
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
|
||||
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
|
||||
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
|
||||
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
|
||||
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
|
||||
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
|
||||
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
|
||||
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
|
||||
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
|
||||
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
|
||||
%mul3 = shl i64 %stride, 1
|
||||
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
|
||||
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
|
||||
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
|
||||
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
|
||||
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
|
||||
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
|
||||
%mul5 = mul i64 %stride, 3
|
||||
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
|
||||
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
|
||||
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
|
||||
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
|
||||
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
|
||||
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
|
||||
tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
attributes #0 = { nounwind "target-features"="+sme2" }
|
||||
attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
|
||||
|
||||
|
||||
; == Multi, multi (unsigned)
|
||||
|
||||
declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
|
||||
|
||||
@@ -1,15 +1,12 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -force-streaming -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
; == FVDOT ==
|
||||
|
||||
define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) {
|
||||
; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8f16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
|
||||
; CHECK-NEXT: fvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
|
||||
; CHECK-NEXT: fvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -25,9 +22,7 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half>
|
||||
define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) {
|
||||
; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8bf16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
|
||||
; CHECK-NEXT: bfvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
|
||||
; CHECK-NEXT: bfvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -43,9 +38,7 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, <vscale x 8 x bfloa
|
||||
define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
|
||||
; CHECK-LABEL: test_svdot_lane_za32_vg1x2_nxv8i16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
|
||||
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
|
||||
; CHECK-NEXT: svdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -58,11 +51,7 @@ define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %
|
||||
define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
|
||||
; CHECK-LABEL: test_svdot_lane_za32_vg1x4_nxv16i8:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
|
||||
; CHECK-NEXT: svdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -75,11 +64,7 @@ define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %
|
||||
define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm) {
|
||||
; CHECK-LABEL: test_svdot_lane_za64_vg1x4_nxv8i16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: svdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1]
|
||||
; CHECK-NEXT: svdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -89,15 +74,87 @@ define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @svdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: svdot_form_2x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: add x9, x0, x1
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
|
||||
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
|
||||
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
|
||||
%3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
|
||||
%6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
|
||||
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: svdot_form_4x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: lsl x9, x1, #1
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
|
||||
; CHECK-NEXT: add x10, x9, x1
|
||||
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
|
||||
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
|
||||
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
|
||||
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
|
||||
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
|
||||
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
|
||||
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
|
||||
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
|
||||
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
|
||||
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
|
||||
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
|
||||
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
|
||||
%mul3 = shl i64 %stride, 1
|
||||
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
|
||||
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
|
||||
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
|
||||
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
|
||||
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
|
||||
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
|
||||
%mul5 = mul i64 %stride, 3
|
||||
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
|
||||
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
|
||||
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
|
||||
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
|
||||
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
|
||||
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
|
||||
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; == UVDOT ==
|
||||
|
||||
define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
|
||||
; CHECK-LABEL: test_uvdot_lane_za32_vg1x2_nxv8i16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
|
||||
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
|
||||
; CHECK-NEXT: uvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -110,11 +167,7 @@ define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %
|
||||
define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
|
||||
; CHECK-LABEL: test_uvdot_lane_za32_vg1x4_nxv16i8:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
|
||||
; CHECK-NEXT: uvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -127,11 +180,7 @@ define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %
|
||||
define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm) {
|
||||
; CHECK-LABEL: test_uvdot_lane_za64_vg1x4_nxv8i16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: uvdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1]
|
||||
; CHECK-NEXT: uvdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -141,17 +190,87 @@ define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @uvdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: uvdot_form_2x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: add x9, x0, x1
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
|
||||
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
|
||||
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
|
||||
%3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
|
||||
%6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
|
||||
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: uvdot_form_4x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: lsl x9, x1, #1
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
|
||||
; CHECK-NEXT: add x10, x9, x1
|
||||
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
|
||||
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
|
||||
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
|
||||
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
|
||||
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
|
||||
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
|
||||
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
|
||||
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
|
||||
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
|
||||
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
|
||||
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
|
||||
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
|
||||
%mul3 = shl i64 %stride, 1
|
||||
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
|
||||
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
|
||||
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
|
||||
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
|
||||
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
|
||||
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
|
||||
%mul5 = mul i64 %stride, 3
|
||||
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
|
||||
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
|
||||
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
|
||||
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
|
||||
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
|
||||
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
|
||||
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; == SUVDOT ==
|
||||
|
||||
define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
|
||||
; CHECK-LABEL: test_suvdot_lane_za32_vg1x4_nxv16i8:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
|
||||
; CHECK-NEXT: suvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -161,17 +280,62 @@ define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8>
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @suvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: suvdot_form_4x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: lsl x9, x1, #1
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
|
||||
; CHECK-NEXT: add x10, x9, x1
|
||||
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
|
||||
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
|
||||
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
|
||||
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
|
||||
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
|
||||
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
|
||||
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
|
||||
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
|
||||
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
|
||||
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
|
||||
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
|
||||
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
|
||||
%mul3 = shl i64 %stride, 1
|
||||
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
|
||||
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
|
||||
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
|
||||
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
|
||||
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
|
||||
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
|
||||
%mul5 = mul i64 %stride, 3
|
||||
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
|
||||
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
|
||||
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
|
||||
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
|
||||
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
|
||||
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
|
||||
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; == USVDOT ==
|
||||
|
||||
define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
|
||||
; CHECK-LABEL: test_usvdot_lane_za32_vg1x4_nxv16i8:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: mov w8, w0
|
||||
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
|
||||
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
|
||||
; CHECK-NEXT: usvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
|
||||
; CHECK-NEXT: ret
|
||||
@@ -181,6 +345,58 @@ define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8>
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @usvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
|
||||
; CHECK-LABEL: usvdot_form_4x_tuple:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: lsl x9, x1, #1
|
||||
; CHECK-NEXT: ptrue pn8.b
|
||||
; CHECK-NEXT: mov w8, wzr
|
||||
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
|
||||
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
|
||||
; CHECK-NEXT: add x10, x9, x1
|
||||
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
|
||||
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
|
||||
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
|
||||
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
|
||||
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
|
||||
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
|
||||
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
|
||||
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
|
||||
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
|
||||
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
|
||||
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
|
||||
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
|
||||
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
|
||||
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
|
||||
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
|
||||
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
|
||||
%mul3 = shl i64 %stride, 1
|
||||
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
|
||||
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
|
||||
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
|
||||
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
|
||||
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
|
||||
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
|
||||
%mul5 = mul i64 %stride, 3
|
||||
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
|
||||
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
|
||||
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
|
||||
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
|
||||
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
|
||||
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
|
||||
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
|
||||
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind "target-features"="+sme2" }
|
||||
attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
|
||||
|
||||
; == FVDOT ==
|
||||
declare void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
|
||||
|
||||
Reference in New Issue
Block a user