Improve WebAssembly vector bitmask, mask reduction, and extending
This is inspired by a recently filed Rust issue noting poor codegen for vector masks (https://github.com/rust-lang/portable-simd/issues/351).

Reviewed By: tlively

Differential Revision: https://reviews.llvm.org/D151782
Commit 8392bf6000 (parent 867ee3b8a7), committed by Thomas Lively
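To illustrate the patterns the new DAG combines target, here is a minimal, hypothetical IR example; it is not taken from the patch's test files, and the function name is made up. With this change, the bitcast of the <16 x i1> mask should select i8x16.bitmask, and the compare of that integer against zero should select v128.any_true rather than going through a generic expansion.

define i1 @any_lane_set(<16 x i8> %a, <16 x i8> %b) {
  %mask = icmp ne <16 x i8> %a, %b        ; <16 x i1> vector mask
  %bits = bitcast <16 x i1> %mask to i16  ; performBitcastCombine -> i8x16.bitmask
  %any = icmp ne i16 %bits, 0             ; performSETCCCombine -> v128.any_true
  ret i1 %any
}

Replacing the final compare with icmp eq i16 %bits, -1 would exercise the all_true path of the same combine.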
@@ -157,6 +157,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(

// SIMD-specific configuration
if (Subtarget->hasSIMD128()) {
// Combine vector mask reductions into alltrue/anytrue
setTargetDAGCombine(ISD::SETCC);

// Convert vector to integer bitcasts to bitmask
setTargetDAGCombine(ISD::BITCAST);

// Hoist bitcasts out of shuffles
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);

@@ -258,6 +264,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// But saturating fp_to_int converstions are
for (auto Op : {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT})
setOperationAction(Op, MVT::v4i32, Custom);

// Support vector extending
for (auto T : MVT::integer_fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Custom);
}
}

// As a special case, these operators use the type to mean the type to

@@ -1374,6 +1386,11 @@ void WebAssemblyTargetLowering::ReplaceNodeResults(
// SIGN_EXTEND_INREG, but for non-vector sign extends the result might be an
// illegal type.
break;
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
// Do not add any results, signifying that N should not be custom lowered.
// EXTEND_VECTOR_INREG is implemented for some vectors, but not all.
break;
default:
llvm_unreachable(
"ReplaceNodeResults not implemented for this op for WebAssembly!");

@@ -1424,6 +1441,9 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
return LowerIntrinsic(Op, DAG);
case ISD::SIGN_EXTEND_INREG:
return LowerSIGN_EXTEND_INREG(Op, DAG);
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, DAG);
case ISD::BUILD_VECTOR:
return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:

@@ -1877,6 +1897,48 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
Op.getOperand(1));
}

SDValue
WebAssemblyTargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();

if (SrcVT.getVectorElementType() == MVT::i1 ||
SrcVT.getVectorElementType() == MVT::i64)
return SDValue();

assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
"Unexpected extension factor.");
unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();

if (Scale != 2 && Scale != 4 && Scale != 8)
return SDValue();

unsigned Ext;
switch (Op.getOpcode()) {
case ISD::ZERO_EXTEND_VECTOR_INREG:
Ext = WebAssemblyISD::EXTEND_LOW_U;
break;
case ISD::SIGN_EXTEND_VECTOR_INREG:
Ext = WebAssemblyISD::EXTEND_LOW_S;
break;
}

SDValue Ret = Src;
while (Scale != 1) {
Ret = DAG.getNode(Ext, DL,
Ret.getValueType()
.widenIntegerVectorElementType(*DAG.getContext())
.getHalfNumVectorElementsVT(*DAG.getContext()),
Ret);
Scale /= 2;
}
assert(Ret.getValueType() == VT);
return Ret;
}

static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
if (Op.getValueType() != MVT::v2f64)

@@ -2692,12 +2754,90 @@ static SDValue performTruncateCombine(SDNode *N,
return truncateVectorWithNARROW(OutVT, In, DL, DAG);
}

static SDValue performBitcastCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
auto &DAG = DCI.DAG;
SDLoc DL(N);
SDValue Src = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();

// bitcast <N x i1> to iN
// ==> bitmask
if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
SrcVT.isFixedLengthVector() && SrcVT.getScalarType() == MVT::i1) {
unsigned NumElts = SrcVT.getVectorNumElements();
assert(NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16);
EVT Width = MVT::getIntegerVT(128 / NumElts);
return DAG.getZExtOrTrunc(
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
{DAG.getConstant(Intrinsic::wasm_bitmask, DL, MVT::i32),
DAG.getSExtOrTrunc(N->getOperand(0), DL,
SrcVT.changeVectorElementType(Width))}),
DL, VT);
}

return SDValue();
}

static SDValue performSETCCCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
auto &DAG = DCI.DAG;

SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
SDLoc DL(N);
EVT VT = N->getValueType(0);

// setcc (iN (bitcast (vNi1 X))), 0, ne
// ==> any_true (vNi1 X)
// setcc (iN (bitcast (vNi1 X))), 0, eq
// ==> xor (any_true (vNi1 X)), -1
// setcc (iN (bitcast (vNi1 X))), -1, eq
// ==> all_true (vNi1 X)
// setcc (iN (bitcast (vNi1 X))), -1, ne
// ==> xor (all_true (vNi1 X)), -1
if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
(Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
(isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
LHS->getOpcode() == ISD::BITCAST) {
EVT FromVT = LHS->getOperand(0).getValueType();
if (FromVT.isFixedLengthVector() &&
FromVT.getVectorElementType() == MVT::i1) {
int Intrin = isNullConstant(RHS) ? Intrinsic::wasm_anytrue
: Intrinsic::wasm_alltrue;
unsigned NumElts = FromVT.getVectorNumElements();
assert(NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16);
EVT Width = MVT::getIntegerVT(128 / NumElts);
SDValue Ret = DAG.getZExtOrTrunc(
DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
{DAG.getConstant(Intrin, DL, MVT::i32),
DAG.getSExtOrTrunc(LHS->getOperand(0), DL,
FromVT.changeVectorElementType(Width))}),
DL, MVT::i1);
if ((isNullConstant(RHS) && (Cond == ISD::SETEQ)) ||
(isAllOnesConstant(RHS) && (Cond == ISD::SETNE))) {
Ret = DAG.getNOT(DL, Ret, MVT::i1);
}
return DAG.getZExtOrTrunc(Ret, DL, VT);
}
}

return SDValue();
}

SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
default:
return SDValue();
case ISD::BITCAST:
return performBitcastCombine(N, DCI);
case ISD::SETCC:
return performSETCCCombine(N, DCI);
case ISD::VECTOR_SHUFFLE:
return performVECTOR_SHUFFLECombine(N, DCI);
case ISD::SIGN_EXTEND:

@@ -131,6 +131,7 @@ private:
SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerIntrinsic(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;

@@ -36,9 +36,9 @@ define <4 x float> @extend_to_float_low_i8x16_u(<8 x i8> %x) {
; CHECK-LABEL: extend_to_float_low_i8x16_u:
; CHECK: .functype extend_to_float_low_i8x16_u (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: f32x4.convert_i32x4_u
; CHECK-NEXT: # fallthrough-return
%low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

@@ -51,8 +51,10 @@ define <4 x float> @extend_to_float_high_i8x16_u(<8 x i8> %x) {
; CHECK: .functype extend_to_float_high_i8x16_u (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i8x16.shuffle 4, 17, 18, 19, 5, 21, 22, 23, 6, 25, 26, 27, 7, 29, 30, 31
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: f32x4.convert_i32x4_u
; CHECK-NEXT: # fallthrough-return
%high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>

@@ -91,12 +93,8 @@ define <4 x float> @extend_to_float_low_i8x16_s(<8 x i8> %x) {
; CHECK: .functype extend_to_float_low_i8x16_s (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: f32x4.convert_i32x4_s
; CHECK-NEXT: # fallthrough-return
%low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

@@ -110,11 +108,9 @@ define <4 x float> @extend_to_float_high_i8x16_s(<8 x i8> %x) {
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 4, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: f32x4.convert_i32x4_s
; CHECK-NEXT: # fallthrough-return
%high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>

@@ -138,9 +134,8 @@ define <2 x double> @extend_to_double_low_i16x4_u(<4 x i16> %x) {
; CHECK-LABEL: extend_to_double_low_i16x4_u:
; CHECK: .functype extend_to_double_low_i16x4_u (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 16, 17, 2, 3, 18, 19, 6, 7, 20, 21, 10, 11, 22, 23, 14, 15
; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: f64x2.convert_low_i32x4_u
; CHECK-NEXT: # fallthrough-return
%low = shufflevector <4 x i16> %x, <4 x i16> undef, <2 x i32> <i32 0, i32 1>

@@ -170,11 +170,8 @@ define <8 x i16> @extend_lowish_i8x16_s(<16 x i8> %v) {
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0
; CHECK-NEXT: i32.const 8
; CHECK-NEXT: i16x8.shl
; CHECK-NEXT: i32.const 8
; CHECK-NEXT: i16x8.shr_s
; CHECK-NEXT: i8x16.shuffle 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: # fallthrough-return
%lowish = shufflevector <16 x i8> %v, <16 x i8> undef,
<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>

@@ -188,14 +185,81 @@ define <4 x i32> @extend_lowish_i16x8_s(<8 x i16> %v) {
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 2, 3, 0, 1, 4, 5, 0, 1, 6, 7, 0, 1, 8, 9, 0, 1
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: i8x16.shuffle 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%lowish = shufflevector <8 x i16> %v, <8 x i16> undef,
<4 x i32> <i32 1, i32 2, i32 3, i32 4>
%extended = sext <4 x i16> %lowish to <4 x i32>
ret <4 x i32> %extended
}

;; Also test vectors that aren't full 128 bits, or might require
;; multiple extensions

define <16 x i8> @extend_i1x16_i8(<16 x i1> %v) {
; CHECK-LABEL: extend_i1x16_i8:
; CHECK: .functype extend_i1x16_i8 (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 7
; CHECK-NEXT: i8x16.shl
; CHECK-NEXT: i32.const 7
; CHECK-NEXT: i8x16.shr_s
; CHECK-NEXT: # fallthrough-return
%extended = sext <16 x i1> %v to <16 x i8>
ret <16 x i8> %extended
}

define <8 x i8> @extend_i1x8_i8(<8 x i1> %v) {
; CHECK-LABEL: extend_i1x8_i8:
; CHECK: .functype extend_i1x8_i8 (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i32.const 7
; CHECK-NEXT: i8x16.shl
; CHECK-NEXT: i32.const 7
; CHECK-NEXT: i8x16.shr_s
; CHECK-NEXT: # fallthrough-return
%extended = sext <8 x i1> %v to <8 x i8>
ret <8 x i8> %extended
}

define <8 x i16> @extend_i1x8_i16(<8 x i1> %v) {
; CHECK-LABEL: extend_i1x8_i16:
; CHECK: .functype extend_i1x8_i16 (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.const 1, 1, 1, 1, 1, 1, 1, 1
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
%extended = zext <8 x i1> %v to <8 x i16>
ret <8 x i16> %extended
}

define <4 x i32> @extend_i8x4_i32(<4 x i8> %v) {
; CHECK-LABEL: extend_i8x4_i32:
; CHECK: .functype extend_i8x4_i32 (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%extended = zext <4 x i8> %v to <4 x i32>
ret <4 x i32> %extended
}

define <2 x i64> @extend_i8x2_i64(<2 x i8> %v) {
; CHECK-LABEL: extend_i8x2_i64:
; CHECK: .functype extend_i8x2_i64 (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: i64x2.extend_low_i32x4_s
; CHECK-NEXT: # fallthrough-return
%extended = sext <2 x i8> %v to <2 x i64>
ret <2 x i64> %extended
}

@@ -1183,16 +1183,11 @@ define <4 x i32> @load_zext_v4i16_to_v4i32(ptr %p) {
define <4 x i32> @load_sext_v4i8_to_v4i32(ptr %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32:
; CHECK: .functype load_sext_v4i8_to_v4i32 (i32) -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%v = load <4 x i8>, ptr %p
%v2 = sext <4 x i8> %v to <4 x i32>

@@ -1203,10 +1198,10 @@ define <4 x i32> @load_zext_v4i8_to_v4i32(ptr %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32:
; CHECK: .functype load_zext_v4i8_to_v4i32 (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%v = load <4 x i8>, ptr %p
%v2 = zext <4 x i8> %v to <4 x i32>

@@ -1287,16 +1282,11 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_with_folded_offset(ptr %p) {
define <4 x i32> @load_sext_v4i8_to_v4i32_with_folded_offset(ptr %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_folded_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_folded_offset (i32) -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 16
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint ptr %p to i32
%r = add nuw i32 %q, 16

@@ -1310,10 +1300,10 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_with_folded_offset(ptr %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_folded_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_folded_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 16
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint ptr %p to i32
%r = add nuw i32 %q, 16

@@ -1392,16 +1382,11 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_with_folded_gep_offset(ptr %p) {
define <4 x i32> @load_sext_v4i8_to_v4i32_with_folded_gep_offset(ptr %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_folded_gep_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_folded_gep_offset (i32) -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 4
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i8>, ptr %p, i32 1
%v = load <4 x i8>, ptr %s

@@ -1413,10 +1398,10 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_with_folded_gep_offset(ptr %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_folded_gep_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_folded_gep_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 4
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i8>, ptr %p, i32 1
%v = load <4 x i8>, ptr %s

@@ -1499,18 +1484,13 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_with_unfolded_gep_negative_offset(ptr
define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset(ptr %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const -4
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i8>, ptr %p, i32 -1
%v = load <4 x i8>, ptr %s

@@ -1522,12 +1502,12 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset(ptr
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const -4
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i8>, ptr %p, i32 -1
%v = load <4 x i8>, ptr %s

@@ -1620,18 +1600,13 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_with_unfolded_offset(ptr %p) {
define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_offset(ptr %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_unfolded_offset (i32) -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint ptr %p to i32
%r = add nsw i32 %q, 16

@@ -1645,12 +1620,12 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_with_unfolded_offset(ptr %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_unfolded_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint ptr %p to i32
%r = add nsw i32 %q, 16

@@ -1739,18 +1714,13 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_with_unfolded_gep_offset(ptr %p) {
define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_gep_offset(ptr %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_gep_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_unfolded_gep_offset (i32) -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%s = getelementptr <4 x i8>, ptr %p, i32 1
%v = load <4 x i8>, ptr %s

@@ -1762,12 +1732,12 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_with_unfolded_gep_offset(ptr %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_gep_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_unfolded_gep_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%s = getelementptr <4 x i8>, ptr %p, i32 1
%v = load <4 x i8>, ptr %s

@@ -1844,16 +1814,11 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_from_numeric_address() {
define <4 x i32> @load_sext_v4i8_to_v4i32_from_numeric_address() {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_from_numeric_address:
; CHECK: .functype load_sext_v4i8_to_v4i32_from_numeric_address () -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: v128.load32_zero 32
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%s = inttoptr i32 32 to ptr
%v = load <4 x i8>, ptr %s

@@ -1865,10 +1830,10 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_from_numeric_address() {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_from_numeric_address:
; CHECK: .functype load_zext_v4i8_to_v4i32_from_numeric_address () -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: v128.load32_zero 32
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%s = inttoptr i32 32 to ptr
%v = load <4 x i8>, ptr %s

@@ -1943,16 +1908,11 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_from_global_address() {
define <4 x i32> @load_sext_v4i8_to_v4i32_from_global_address() {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_from_global_address:
; CHECK: .functype load_sext_v4i8_to_v4i32_from_global_address () -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: v128.load32_zero gv_v4i8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%v = load <4 x i8>, ptr @gv_v4i8
%v2 = sext <4 x i8> %v to <4 x i32>

@@ -1963,10 +1923,10 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_from_global_address() {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_from_global_address:
; CHECK: .functype load_zext_v4i8_to_v4i32_from_global_address () -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: v128.load32_zero gv_v4i8
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%v = load <4 x i8>, ptr @gv_v4i8
%v2 = zext <4 x i8> %v to <4 x i32>