[AArch64] Spare N2I roundtrip when splatting float comparison (#141806)
Transform `select_cc t1, t2, -1, 0` for floats into a vector comparison that generates a mask, which is later combined with any vectorized DUPs.
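To illustrate, here is a sketch adapted from the tests added in this commit (value names are mine, and the "before" sequence is the approximate previous codegen rather than a quoted test): splatting the sign-extended result of a scalar float comparison used to move the mask to a general-purpose register and back, and can now stay entirely in SIMD registers.

    define <4 x float> @splat_oge(float %a, float %b) {
    entry:
      %cmp = fcmp oge float %a, %b
      %mask = sext i1 %cmp to i32
      %vec = insertelement <4 x i32> poison, i32 %mask, i64 0
      %cast = bitcast <4 x i32> %vec to <4 x float>
      %splat = shufflevector <4 x float> %cast, <4 x float> poison, <4 x i32> zeroinitializer
      ret <4 x float> %splat
    }

    ; before (roughly): fcmp s0, s1 ; csetm w8, ge ; dup v0.4s, w8
    ; after:            fcmge s0, s0, s1 ; dup v0.4s, v0.s[0]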
@@ -11048,10 +11048,126 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
                      Cmp.getValue(1));
 }
 
-SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
-                                              SDValue RHS, SDValue TVal,
-                                              SDValue FVal, const SDLoc &dl,
-                                              SelectionDAG &DAG) const {
+/// Emit vector comparison for floating-point values, producing a mask.
+static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
+                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
+                                    const SDLoc &DL, SelectionDAG &DAG) {
+  EVT SrcVT = LHS.getValueType();
+  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
+         "function only supposed to emit natural comparisons");
+
+  switch (CC) {
+  default:
+    return SDValue();
+  case AArch64CC::NE: {
+    SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
+    // Use vector semantics for the inversion to potentially save a copy between
+    // SIMD and regular registers.
+    if (!LHS.getValueType().isVector()) {
+      EVT VecVT =
+          EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
+      SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+      SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
+                                    DAG.getUNDEF(VecVT), Fcmeq, Zero);
+      SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
+    }
+    return DAG.getNOT(DL, Fcmeq, VT);
+  }
+  case AArch64CC::EQ:
+    return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
+  case AArch64CC::GE:
+    return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
+  case AArch64CC::GT:
+    return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
+  case AArch64CC::LE:
+    if (!NoNans)
+      return SDValue();
+    // If we ignore NaNs then we can use the LS implementation.
+    [[fallthrough]];
+  case AArch64CC::LS:
+    return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
+  case AArch64CC::LT:
+    if (!NoNans)
+      return SDValue();
+    // If we ignore NaNs then we can use the MI implementation.
+    [[fallthrough]];
+  case AArch64CC::MI:
+    return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
+  }
+}
+
+/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
+/// values are scalars, try to emit a mask-generating vector instruction.
+static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
+                                    SDValue FVal, ISD::CondCode CC, bool NoNaNs,
+                                    const SDLoc &DL, SelectionDAG &DAG) {
+  assert(!LHS.getValueType().isVector());
+  assert(!RHS.getValueType().isVector());
+
+  auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
+  auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
+  if (!CTVal || !CFVal)
+    return {};
+  if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
+      !(CTVal->isZero() && CFVal->isAllOnes()))
+    return {};
+
+  if (CTVal->isZero())
+    CC = ISD::getSetCCInverse(CC, LHS.getValueType());
+
+  EVT VT = TVal.getValueType();
+  if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
+    return {};
+
+  if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
+    bool OneNaN = false;
+    if (LHS == RHS) {
+      OneNaN = true;
+    } else if (DAG.isKnownNeverNaN(RHS)) {
+      OneNaN = true;
+      RHS = LHS;
+    } else if (DAG.isKnownNeverNaN(LHS)) {
+      OneNaN = true;
+      LHS = RHS;
+    }
+    if (OneNaN)
+      CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
+  }
+
+  AArch64CC::CondCode CC1;
+  AArch64CC::CondCode CC2;
+  bool ShouldInvert = false;
+  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
+  SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
+  SDValue Cmp2;
+  if (CC2 != AArch64CC::AL) {
+    Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
+    if (!Cmp2)
+      return {};
+  }
+  if (!Cmp2 && !ShouldInvert)
+    return Cmp;
+
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
+                    Zero);
+  if (Cmp2) {
+    Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
+                       Cmp2, Zero);
+    Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
+  }
+  if (ShouldInvert)
+    Cmp = DAG.getNOT(DL, Cmp, VecVT);
+  Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
+  return Cmp;
+}
+
+SDValue AArch64TargetLowering::LowerSELECT_CC(
+    ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
+    iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs,
+    const SDLoc &dl, SelectionDAG &DAG) const {
   // Handle f128 first, because it will result in a comparison of some RTLIB
   // call result against zero.
   if (LHS.getValueType() == MVT::f128) {
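A note on the two-comparison path in emitFloatCompareMask above: conditions such as `one` have no single NEON compare, so changeVectorFPCCToAArch64CC returns two condition codes and the two masks are OR'ed together (and inverted, e.g. for `ueq`). A sketch of the expected lowering, mirroring the dup_v1i32_one test added in this commit:

    ; fcmp one %a, %b becomes, roughly:
    ;   fcmgt s2, s0, s1              // a > b
    ;   fcmgt s0, s1, s0              // a < b
    ;   orr   v0.16b, v0.16b, v2.16b  // combine the two masks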
@@ -11234,6 +11350,27 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
          LHS.getValueType() == MVT::f64);
   assert(LHS.getValueType() == RHS.getValueType());
   EVT VT = TVal.getValueType();
+
+  // If the purpose of the comparison is to select between all ones
+  // or all zeros, try to use a vector comparison because the operands are
+  // already stored in SIMD registers.
+  if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
+        switch (U->getOpcode()) {
+        default:
+          return false;
+        case ISD::INSERT_VECTOR_ELT:
+        case ISD::SCALAR_TO_VECTOR:
+        case AArch64ISD::DUP:
+          return true;
+        }
+      })) {
+    bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs;
+    SDValue VectorCmp =
+        emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, dl, DAG);
+    if (VectorCmp)
+      return VectorCmp;
+  }
+
   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
 
   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
@@ -11320,8 +11457,10 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
   SDValue RHS = Op.getOperand(1);
   SDValue TVal = Op.getOperand(2);
   SDValue FVal = Op.getOperand(3);
+  bool HasNoNans = Op->getFlags().hasNoNaNs();
   SDLoc DL(Op);
-  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
+  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL,
+                        DAG);
 }
 
 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
@@ -11329,6 +11468,7 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
   SDValue CCVal = Op->getOperand(0);
   SDValue TVal = Op->getOperand(1);
   SDValue FVal = Op->getOperand(2);
+  bool HasNoNans = Op->getFlags().hasNoNaNs();
   SDLoc DL(Op);
 
   EVT Ty = Op.getValueType();
@@ -11395,7 +11535,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                DAG.getUNDEF(MVT::f32), FVal);
   }
 
-  SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
+  SDValue Res =
+      LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG);
 
   if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
     return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
@@ -15648,47 +15789,6 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
   llvm_unreachable("unexpected shift opcode");
 }
 
-static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
-                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
-                                    const SDLoc &dl, SelectionDAG &DAG) {
-  EVT SrcVT = LHS.getValueType();
-  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
-         "function only supposed to emit natural comparisons");
-
-  if (SrcVT.getVectorElementType().isFloatingPoint()) {
-    switch (CC) {
-    default:
-      return SDValue();
-    case AArch64CC::NE: {
-      SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
-      return DAG.getNOT(dl, Fcmeq, VT);
-    }
-    case AArch64CC::EQ:
-      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
-    case AArch64CC::GE:
-      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
-    case AArch64CC::GT:
-      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
-    case AArch64CC::LE:
-      if (!NoNans)
-        return SDValue();
-      // If we ignore NaNs then we can use the LS implementation.
-      [[fallthrough]];
-    case AArch64CC::LS:
-      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
-    case AArch64CC::LT:
-      if (!NoNans)
-        return SDValue();
-      // If we ignore NaNs then we can use the MI implementation.
-      [[fallthrough]];
-    case AArch64CC::MI:
-      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
-    }
-  }
-
-  return SDValue();
-}
-
 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
                                            SelectionDAG &DAG) const {
   if (Op.getValueType().isScalableVector())
@@ -15737,15 +15837,14 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
   bool ShouldInvert;
   changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
 
-  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
-  SDValue Cmp =
-      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
+  bool NoNaNs =
+      getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
+  SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
   if (!Cmp.getNode())
     return SDValue();
 
   if (CC2 != AArch64CC::AL) {
-    SDValue Cmp2 =
-        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
+    SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
     if (!Cmp2.getNode())
       return SDValue();
 
@@ -25502,6 +25601,28 @@ static SDValue performDUPCombine(SDNode *N,
   }
 
   if (N->getOpcode() == AArch64ISD::DUP) {
+    // If the instruction is known to produce a scalar in SIMD registers, we can
+    // duplicate it across the vector lanes using DUPLANE instead of moving it
+    // to a GPR first. For example, this allows us to handle:
+    //   v4i32 = DUP (i32 (FCMGT (f32, f32)))
+    SDValue Op = N->getOperand(0);
+    // FIXME: Ideally, we should be able to handle all instructions that
+    // produce a scalar value in FPRs.
+    if (Op.getOpcode() == AArch64ISD::FCMEQ ||
+        Op.getOpcode() == AArch64ISD::FCMGE ||
+        Op.getOpcode() == AArch64ISD::FCMGT) {
+      EVT ElemVT = VT.getVectorElementType();
+      EVT ExpandedVT = VT;
+      // Insert into a 128-bit vector to match DUPLANE's pattern.
+      if (VT.getSizeInBits() != 128)
+        ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
+                                      128 / ElemVT.getSizeInBits());
+      SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
+      SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
+                                    DCI.DAG.getUNDEF(ExpandedVT), Op, Zero);
+      return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
+    }
+
     if (DCI.isAfterLegalizeDAG()) {
       // If scalar dup's operand is extract_vector_elt, try to combine them into
       // duplane. For example,

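The DUP combine above keeps a compare-generated mask in SIMD registers when it is splatted. A sketch of the intended selection, mirroring the dup_v2i64 test added in this commit:

    ; v2i64 = DUP (i64 (FCMGT d0, d1)) now selects to, roughly:
    ;   fcmgt d0, d0, d1      // mask stays in a SIMD register
    ;   dup   v0.2d, v0.d[0]  // splat via DUPLANE, no GPR round-trip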
@@ -647,7 +647,9 @@ private:
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
-                         SDValue TVal, SDValue FVal, const SDLoc &dl,
+                         SDValue TVal, SDValue FVal,
+                         iterator_range<SDNode::user_iterator> Users,
+                         bool HasNoNans, const SDLoc &dl,
                          SelectionDAG &DAG) const;
   SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;

@@ -174,9 +174,9 @@ define <1 x i16> @test_select_f16_i16(half %i105, half %in, <1 x i16> %x, <1 x i
 ; CHECK-LABEL: test_select_f16_i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fcmp s0, s0
-; CHECK-NEXT: csetm w8, vs
-; CHECK-NEXT: dup v0.4h, w8
+; CHECK-NEXT: fcmeq s0, s0, s0
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: dup v0.4h, v0.h[0]
 ; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT: ret
   %i179 = fcmp uno half %i105, zeroinitializer

llvm/test/CodeGen/AArch64/build-vector-dup-simd.ll (new file, 378 lines)
@@ -0,0 +1,378 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-NOFULLFP16
+; RUN: llc < %s -mtriple=aarch64 --enable-no-nans-fp-math | FileCheck %s --check-prefixes=CHECK,CHECK-NONANS
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FULLFP16
+
+define <1 x float> @dup_v1i32_oeq(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_oeq:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmeq s0, s0, s1
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp oeq float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ogt(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_ogt:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmgt s0, s0, s1
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp ogt float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_oge(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_oge:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmge s0, s0, s1
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp oge float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_olt(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_olt:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmgt s0, s1, s0
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp olt float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ole(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_ole:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmge s0, s1, s0
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp ole float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_one(float %a, float %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v1i32_one:
+; CHECK-NOFULLFP16: // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT: fcmgt s2, s0, s1
+; CHECK-NOFULLFP16-NEXT: fcmgt s0, s1, s0
+; CHECK-NOFULLFP16-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NOFULLFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NOFULLFP16-NEXT: ret
+;
+; CHECK-NONANS-LABEL: dup_v1i32_one:
+; CHECK-NONANS: // %bb.0: // %entry
+; CHECK-NONANS-NEXT: fcmeq s0, s0, s1
+; CHECK-NONANS-NEXT: mvn v0.8b, v0.8b
+; CHECK-NONANS-NEXT: ret
+;
+; CHECK-FULLFP16-LABEL: dup_v1i32_one:
+; CHECK-FULLFP16: // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT: fcmgt s2, s0, s1
+; CHECK-FULLFP16-NEXT: fcmgt s0, s1, s0
+; CHECK-FULLFP16-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-FULLFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-FULLFP16-NEXT: ret
+entry:
+  %0 = fcmp one float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ord(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_ord:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmge s2, s0, s1
+; CHECK-NEXT: fcmgt s0, s1, s0
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp ord float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ueq(float %a, float %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v1i32_ueq:
+; CHECK-NOFULLFP16: // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT: fcmgt s2, s0, s1
+; CHECK-NOFULLFP16-NEXT: fcmgt s0, s1, s0
+; CHECK-NOFULLFP16-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NOFULLFP16-NEXT: mvn v0.8b, v0.8b
+; CHECK-NOFULLFP16-NEXT: ret
+;
+; CHECK-NONANS-LABEL: dup_v1i32_ueq:
+; CHECK-NONANS: // %bb.0: // %entry
+; CHECK-NONANS-NEXT: fcmeq s0, s0, s1
+; CHECK-NONANS-NEXT: ret
+;
+; CHECK-FULLFP16-LABEL: dup_v1i32_ueq:
+; CHECK-FULLFP16: // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT: fcmgt s2, s0, s1
+; CHECK-FULLFP16-NEXT: fcmgt s0, s1, s0
+; CHECK-FULLFP16-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-FULLFP16-NEXT: mvn v0.8b, v0.8b
+; CHECK-FULLFP16-NEXT: ret
+entry:
+  %0 = fcmp ueq float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ugt(float %a, float %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v1i32_ugt:
+; CHECK-NOFULLFP16: // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT: fcmge s0, s1, s0
+; CHECK-NOFULLFP16-NEXT: mvn v0.8b, v0.8b
+; CHECK-NOFULLFP16-NEXT: ret
+;
+; CHECK-NONANS-LABEL: dup_v1i32_ugt:
+; CHECK-NONANS: // %bb.0: // %entry
+; CHECK-NONANS-NEXT: fcmgt s0, s0, s1
+; CHECK-NONANS-NEXT: ret
+;
+; CHECK-FULLFP16-LABEL: dup_v1i32_ugt:
+; CHECK-FULLFP16: // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT: fcmge s0, s1, s0
+; CHECK-FULLFP16-NEXT: mvn v0.8b, v0.8b
+; CHECK-FULLFP16-NEXT: ret
+entry:
+  %0 = fcmp ugt float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_uge(float %a, float %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v1i32_uge:
+; CHECK-NOFULLFP16: // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT: fcmgt s0, s1, s0
+; CHECK-NOFULLFP16-NEXT: mvn v0.8b, v0.8b
+; CHECK-NOFULLFP16-NEXT: ret
+;
+; CHECK-NONANS-LABEL: dup_v1i32_uge:
+; CHECK-NONANS: // %bb.0: // %entry
+; CHECK-NONANS-NEXT: fcmge s0, s0, s1
+; CHECK-NONANS-NEXT: ret
+;
+; CHECK-FULLFP16-LABEL: dup_v1i32_uge:
+; CHECK-FULLFP16: // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT: fcmgt s0, s1, s0
+; CHECK-FULLFP16-NEXT: mvn v0.8b, v0.8b
+; CHECK-FULLFP16-NEXT: ret
+entry:
+  %0 = fcmp uge float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ult(float %a, float %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v1i32_ult:
+; CHECK-NOFULLFP16: // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT: fcmge s0, s0, s1
+; CHECK-NOFULLFP16-NEXT: mvn v0.8b, v0.8b
+; CHECK-NOFULLFP16-NEXT: ret
+;
+; CHECK-NONANS-LABEL: dup_v1i32_ult:
+; CHECK-NONANS: // %bb.0: // %entry
+; CHECK-NONANS-NEXT: fcmgt s0, s1, s0
+; CHECK-NONANS-NEXT: ret
+;
+; CHECK-FULLFP16-LABEL: dup_v1i32_ult:
+; CHECK-FULLFP16: // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT: fcmge s0, s0, s1
+; CHECK-FULLFP16-NEXT: mvn v0.8b, v0.8b
+; CHECK-FULLFP16-NEXT: ret
+entry:
+  %0 = fcmp ult float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ule(float %a, float %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v1i32_ule:
+; CHECK-NOFULLFP16: // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT: fcmgt s0, s0, s1
+; CHECK-NOFULLFP16-NEXT: mvn v0.8b, v0.8b
+; CHECK-NOFULLFP16-NEXT: ret
+;
+; CHECK-NONANS-LABEL: dup_v1i32_ule:
+; CHECK-NONANS: // %bb.0: // %entry
+; CHECK-NONANS-NEXT: fcmge s0, s1, s0
+; CHECK-NONANS-NEXT: ret
+;
+; CHECK-FULLFP16-LABEL: dup_v1i32_ule:
+; CHECK-FULLFP16: // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT: fcmgt s0, s0, s1
+; CHECK-FULLFP16-NEXT: mvn v0.8b, v0.8b
+; CHECK-FULLFP16-NEXT: ret
+entry:
+  %0 = fcmp ule float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_une(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_une:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmeq s0, s0, s1
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp une float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_uno(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_uno:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmge s2, s0, s1
+; CHECK-NEXT: fcmgt s0, s1, s0
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp uno float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <4 x float> @dup_v4i32(float %a, float %b) {
+; CHECK-LABEL: dup_v4i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmge s0, s0, s1
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp oge float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <4 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <4 x i32> %vecinit.i to <4 x float>
+  %2 = shufflevector <4 x float> %1, <4 x float> poison, <4 x i32> zeroinitializer
+  ret <4 x float> %2
+}
+
+define <4 x float> @dup_v4i32_reversed(float %a, float %b) {
+; CHECK-LABEL: dup_v4i32_reversed:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmgt s0, s1, s0
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp ogt float %b, %a
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <4 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <4 x i32> %vecinit.i to <4 x float>
+  %2 = shufflevector <4 x float> %1, <4 x float> poison, <4 x i32> zeroinitializer
+  ret <4 x float> %2
+}
+
+define <2 x double> @dup_v2i64(double %a, double %b) {
+; CHECK-LABEL: dup_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmgt d0, d0, d1
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp ogt double %a, %b
+  %vcmpd.i = sext i1 %0 to i64
+  %vecinit.i = insertelement <2 x i64> poison, i64 %vcmpd.i, i64 0
+  %1 = bitcast <2 x i64> %vecinit.i to <2 x double>
+  %2 = shufflevector <2 x double> %1, <2 x double> poison, <2 x i32> zeroinitializer
+  ret <2 x double> %2
+}
+
+define <8 x half> @dup_v8i16(half %a, half %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v8i16:
+; CHECK-NOFULLFP16: // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT: fcvt s1, h1
+; CHECK-NOFULLFP16-NEXT: fcvt s0, h0
+; CHECK-NOFULLFP16-NEXT: fcmeq s0, s0, s1
+; CHECK-NOFULLFP16-NEXT: ret
+;
+; CHECK-NONANS-LABEL: dup_v8i16:
+; CHECK-NONANS: // %bb.0: // %entry
+; CHECK-NONANS-NEXT: fcvt s1, h1
+; CHECK-NONANS-NEXT: fcvt s0, h0
+; CHECK-NONANS-NEXT: fcmeq s0, s0, s1
+; CHECK-NONANS-NEXT: ret
+;
+; CHECK-FULLFP16-LABEL: dup_v8i16:
+; CHECK-FULLFP16: // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT: fcmp h0, h1
+; CHECK-FULLFP16-NEXT: csetm w8, eq
+; CHECK-FULLFP16-NEXT: fmov s0, w8
+; CHECK-FULLFP16-NEXT: ret
+; FIXME: Could be replaced with fcmeq + dup but the type of the former is
+; promoted to i32 during selection and then the optimization does not apply.
+
+entry:
+  %0 = fcmp oeq half %a, %b
+  %vcmpd.i = sext i1 %0 to i16
+  %vecinit.i = insertelement <8 x i16> poison, i16 %vcmpd.i, i64 0
+  %1 = bitcast <8 x i16> %vecinit.i to <8 x half>
+  ret <8 x half> %1
+}
+
+; Check that a mask is not generated for non-vectorized users.
+define i32 @mask_i32(float %a, float %b) {
+; CHECK-LABEL: mask_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: csetm w0, eq
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp oeq float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  ret i32 %vcmpd.i
+}
+
+; Verify that a mask is not emitted when (allOnes, allZeros) are not the
+; operands for the SELECT_CC.
+define i32 @bool_i32(float %a, float %b) {
+; CHECK-LABEL: bool_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+entry:
+  %0 = fcmp oeq float %a, %b
+  %vcmpd.i = zext i1 %0 to i32
+  ret i32 %vcmpd.i
+}