[X86] combineConcatVectorOps - add concatenation handling for BITCAST nodes (#133913)

These nodes are effectively free, so we should only concatenate if the
inner nodes will concatenate together.

This also exposed a regression in canonicalizeShuffleWithOp, which failed
to realize it could potentially merge shuffles with a CONCAT_VECTORS node.
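
As a sketch of the intended rewrite (illustrative DAG shapes and vector
types, not part of the committed code), the fold turns a concatenation of
free bitcasts into a single bitcast of a wider concatenation, and is only
kept when that inner concatenation itself simplifies:

// Hypothetical DAG; t-numbers and types are illustrative.
// Before:
//   t0: v8i32  = bitcast t8        // t8: v4i64
//   t1: v8i32  = bitcast t9        // t9: v4i64
//   t2: v16i32 = concat_vectors t0, t1
// After, kept only if the inner concat itself folds further
// (e.g. into a wider load or a single shuffle):
//   t3: v8i64  = concat_vectors t8, t9
//   t4: v16i32 = bitcast t3
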
Simon Pilgrim
2025-04-10 17:44:39 +01:00
committed by GitHub
parent 2f41fa387d
commit 750d009bb2
13 changed files with 3403 additions and 3522 deletions


@@ -41704,6 +41704,7 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
           getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
           (Op.getOpcode() == Opc && Op->hasOneUse()) ||
           (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
           (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
           (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
           DAG.isSplatValue(Op, /*AllowUndefs*/ false);
  };
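
For context, the added predicate line marks a one-use CONCAT_VECTORS
operand as cheap to shuffle, so canonicalizeShuffleWithOp can now move a
shuffle above the operation. A minimal sketch of what this enables
(illustrative opcodes and types, not the committed code):

// Before: the shuffle of the binop blocks further combining.
//   t0: v8i32 = concat_vectors X, Y              // one use
//   t1: v8i32 = and t0, C
//   t2: v8i32 = vector_shuffle t1, undef, M
// After: the shuffle is pushed into both operands, since shuffling
// the one-use concat (and the constant C) is considered free:
//   t3: v8i32 = vector_shuffle t0, undef, M
//   t4: v8i32 = vector_shuffle C, undef, M
//   t5: v8i32 = and t3, t4
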
@@ -58134,6 +58135,30 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
  unsigned Opcode = Op0.getOpcode();
  switch (Opcode) {
  case ISD::BITCAST: {
    // TODO: Support AVX1/AVX2 bitcasts.
    SmallVector<SDValue, 4> SubOps;
    for (SDValue SubOp : Ops)
      SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
    EVT InnerVT = SubOps[0].getValueType();
    unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
    if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
        (Subtarget.hasBWI() ||
         (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
        ((VT.is256BitVector() && Subtarget.hasVLX()) ||
         (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
        llvm::all_of(SubOps, [InnerVT](SDValue Op) {
          return Op.getValueType() == InnerVT;
        })) {
      MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
      MVT ConcatVT = MVT::getVectorVT(
          ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
      if (SDValue ConcatSrc = combineConcatVectorOps(
              DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
        return DAG.getBitcast(VT, ConcatSrc);
    }
    break;
  }
  case ISD::VECTOR_SHUFFLE: {
    // TODO: Generalize NumOps support.
    if (!IsSplat && NumOps == 2 &&
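
A hedged reading of the guards in the BITCAST case above, with
illustrative type combinations:

// Which concatenations the new case accepts (examples, not exhaustive):
//   concat(v8i32 bitcast(v4i64), v8i32 bitcast(v4i64)) -> v16i32
//     512-bit result: needs Subtarget.useAVX512Regs(); both element
//     sizes are >= 32 bits, so hasBWI() is not required.
//   concat(v16i8 bitcast(v2i64), v16i8 bitcast(v2i64)) -> v32i8
//     256-bit result: needs Subtarget.hasVLX(), and because
//     EltSizeInBits < 32, Subtarget.hasBWI() as well.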


@@ -123,20 +123,18 @@ define <8 x i64> @lshr_i512_1(<8 x i64> %a) {
;
; AVX512VBMI-LABEL: lshr_i512_1:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm5, %xmm3, %xmm3
; AVX512VBMI-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm2, %xmm1, %xmm2
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX512VBMI-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
; AVX512VBMI-NEXT: vpsrlq $1, %xmm2, %xmm2
; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
; AVX512VBMI-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512VBMI-NEXT: vpshldq $63, %zmm0, %zmm2, %zmm0
; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
@@ -238,20 +236,18 @@ define <8 x i64> @ashr_i512_1(<8 x i64> %a) {
;
; AVX512VBMI-LABEL: ashr_i512_1:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm5, %xmm3, %xmm3
; AVX512VBMI-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm2, %xmm1, %xmm2
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX512VBMI-NEXT: vpsraq $1, %xmm1, %xmm1
; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
; AVX512VBMI-NEXT: vpsraq $1, %xmm2, %xmm2
; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
; AVX512VBMI-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512VBMI-NEXT: vpshldq $63, %zmm0, %zmm2, %zmm0
; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]


@@ -297,23 +297,21 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u,33,37,41,45,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,49,53,57,62,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
; AVX512BWVL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,12,0,5]
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BWVL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u,33,37,41,45,u,u,u,u,u,u,u,u,u,u,u,u,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;


@@ -1228,13 +1228,14 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94]
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpsrlw %xmm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512VBMI2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpermt2b %zmm4, %zmm3, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95]
; AVX512VBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; AVX512VBMI2-NEXT: vpsrlw %xmm0, %zmm3, %zmm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
@@ -1251,16 +1252,29 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512VLBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX10-LABEL: splatvar_funnnel_v32i8:
; AVX10: # %bb.0:
; AVX10-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX10-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX10-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
; AVX10-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX10-NEXT: vpsrlw %xmm2, %ymm0, %ymm1
; AVX10-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
; AVX10-NEXT: vpermi2b %ymm3, %ymm1, %ymm0
; AVX10-NEXT: retq
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95,0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87]
; AVX512VLVBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %zmm3, %zmm0
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [32,34,36,38,40,42,44,46,0,2,4,6,8,10,12,14,48,50,52,54,56,58,60,62,16,18,20,22,24,26,28,30]
; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VLVBMI2-NEXT: retq
;
; AVX10_256-LABEL: splatvar_funnnel_v32i8:
; AVX10_256: # %bb.0:
; AVX10_256-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX10_256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX10_256-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
; AVX10_256-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX10_256-NEXT: vpsrlw %xmm2, %ymm0, %ymm1
; AVX10_256-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
; AVX10_256-NEXT: vpermi2b %ymm3, %ymm1, %ymm0
; AVX10_256-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
; XOPAVX1: # %bb.0:


@@ -992,25 +992,26 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94]
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm3, %zmm0
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpermt2b %zmm3, %zmm2, %zmm0
; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [32,34,36,38,40,42,44,46,0,2,4,6,8,10,12,14,48,50,52,54,56,58,60,62,16,18,20,22,24,26,28,30]
; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm3, %zmm0
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm1
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
; AVX512VLVBMI2-NEXT: vpermi2b %ymm2, %ymm1, %ymm0
; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v32i8:


@@ -130,13 +130,14 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512BW-FCP-LABEL: store_i16_stride4_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2
; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm1
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,24,0,16,9,25,1,17]
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%r8)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride4_vf2:
@@ -152,13 +153,14 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride4_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,24,0,16,9,25,1,17]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%r8)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64
%in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64


@@ -259,14 +259,14 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0]
; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX512BW-NEXT: vpextrd $2, %xmm0, 24(%rax)
@@ -280,14 +280,14 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0]
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX512BW-FCP-NEXT: vpextrd $2, %xmm0, 24(%rax)
@@ -301,14 +301,14 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0]
; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX512DQ-BW-NEXT: vpextrd $2, %xmm0, 24(%rax)
@@ -322,14 +322,14 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm0, 24(%rax)


@@ -231,17 +231,17 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
; AVX512BW-NEXT: vmovdqa (%r11), %xmm3
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23]
; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3
; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
; AVX512BW-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,48,0,32,24,56,8,40,17,49,1,33,25,57,9,41]
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa %ymm2, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -252,17 +252,17 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23]
; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,48,0,32,24,56,8,40,17,49,1,33,25,57,9,41]
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -273,17 +273,17 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23]
; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,48,0,32,24,56,8,40,17,49,1,33,25,57,9,41]
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
@@ -294,17 +294,17 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,48,0,32,24,56,8,40,17,49,1,33,25,57,9,41]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq


@@ -4668,23 +4668,23 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
; AVX512-NEXT: vpshufb %ymm1, %ymm11, %ymm0
; AVX512-NEXT: vmovdqa64 %ymm1, %ymm20
; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX512-NEXT: vmovdqa 32(%rdi), %ymm8
; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
; AVX512-NEXT: vpshufb %ymm2, %ymm5, %ymm1
; AVX512-NEXT: vpshufb %ymm2, %ymm8, %ymm1
; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21
; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm17
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm12
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm10
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm0
; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm0
; AVX512-NEXT: vmovdqa64 %xmm1, %xmm28
; AVX512-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512-NEXT: vmovdqa 32(%rsi), %xmm9
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm1
; AVX512-NEXT: vmovdqa64 %xmm2, %xmm29
; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm19
; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm18
; AVX512-NEXT: vmovdqa 32(%rcx), %ymm15
; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
; AVX512-NEXT: vpshufb %ymm8, %ymm15, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
; AVX512-NEXT: vpshufb %ymm5, %ymm15, %ymm0
; AVX512-NEXT: vmovdqa 32(%rdx), %ymm13
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
; AVX512-NEXT: vpshufb %ymm3, %ymm13, %ymm1
@@ -4698,16 +4698,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm1
; AVX512-NEXT: vmovdqa64 %xmm2, %xmm31
; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm23
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
; AVX512-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm14, %ymm5, %ymm0
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
; AVX512-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm12, %ymm8, %ymm0
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm1
; AVX512-NEXT: vpshufb %ymm4, %ymm8, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
; AVX512-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm5, %ymm11, %ymm1
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
; AVX512-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm8, %ymm11, %ymm1
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm2, %ymm11, %ymm11
@@ -4717,121 +4717,123 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufb %ymm11, %ymm13, %ymm1
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
; AVX512-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm0, %ymm15, %ymm9
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm25
; AVX512-NEXT: vpshufb %ymm0, %ymm15, %ymm14
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm14, %zmm25
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm9
; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm14
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
; AVX512-NEXT: # ymm15 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm15, %ymm13, %ymm13
; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm27
; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm27
; AVX512-NEXT: vmovdqa (%rcx), %ymm13
; AVX512-NEXT: vpshufb %ymm8, %ymm13, %ymm8
; AVX512-NEXT: vmovdqa (%rdx), %ymm9
; AVX512-NEXT: vpshufb %ymm3, %ymm9, %ymm3
; AVX512-NEXT: vporq %ymm8, %ymm3, %ymm16
; AVX512-NEXT: vpshufb %ymm5, %ymm13, %ymm5
; AVX512-NEXT: vmovdqa (%rdx), %ymm14
; AVX512-NEXT: vpshufb %ymm3, %ymm14, %ymm3
; AVX512-NEXT: vporq %ymm5, %ymm3, %ymm16
; AVX512-NEXT: vpshufb %ymm0, %ymm13, %ymm0
; AVX512-NEXT: vpshufb %ymm15, %ymm9, %ymm3
; AVX512-NEXT: vporq %ymm0, %ymm3, %ymm18
; AVX512-NEXT: vpshufb %ymm15, %ymm14, %ymm3
; AVX512-NEXT: vporq %ymm0, %ymm3, %ymm19
; AVX512-NEXT: vmovdqa (%rsi), %ymm3
; AVX512-NEXT: vmovdqa64 %ymm20, %ymm0
; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512-NEXT: vmovdqa (%rdi), %ymm8
; AVX512-NEXT: vmovdqa64 %ymm21, %ymm15
; AVX512-NEXT: vpshufb %ymm15, %ymm8, %ymm15
; AVX512-NEXT: vpor %ymm0, %ymm15, %ymm15
; AVX512-NEXT: vpshufb %ymm4, %ymm8, %ymm0
; AVX512-NEXT: vmovdqa (%rdi), %ymm15
; AVX512-NEXT: vmovdqa64 %ymm21, %ymm5
; AVX512-NEXT: vpshufb %ymm5, %ymm15, %ymm5
; AVX512-NEXT: vpor %ymm0, %ymm5, %ymm5
; AVX512-NEXT: vpshufb %ymm4, %ymm15, %ymm0
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm2
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm2
; AVX512-NEXT: vpshufb %ymm11, %ymm14, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm1
; AVX512-NEXT: vporq %ymm2, %ymm1, %ymm20
; AVX512-NEXT: vpshufb %ymm14, %ymm8, %ymm1
; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm2
; AVX512-NEXT: vporq %ymm1, %ymm2, %ymm21
; AVX512-NEXT: vmovdqa (%rdi), %xmm5
; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm20
; AVX512-NEXT: vpshufb %ymm12, %ymm15, %ymm0
; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm1
; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm21
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa (%rsi), %xmm8
; AVX512-NEXT: vmovdqa64 %xmm28, %xmm1
; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX512-NEXT: vmovdqa64 %xmm29, %xmm2
; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm2
; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm3
; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vmovdqa64 %xmm29, %xmm3
; AVX512-NEXT: vpshufb %xmm3, %xmm8, %xmm3
; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm11
; AVX512-NEXT: vmovdqa (%rcx), %xmm1
; AVX512-NEXT: vmovdqa (%rdx), %xmm2
; AVX512-NEXT: vmovdqa (%rdx), %xmm3
; AVX512-NEXT: vmovdqa64 %xmm30, %xmm4
; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm4
; AVX512-NEXT: vmovdqa64 %xmm31, %xmm9
; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX512-NEXT: vpor %xmm4, %xmm9, %xmm9
; AVX512-NEXT: vmovdqa64 %xmm31, %xmm12
; AVX512-NEXT: vpshufb %xmm12, %xmm3, %xmm12
; AVX512-NEXT: vpor %xmm4, %xmm12, %xmm12
; AVX512-NEXT: vmovdqa 32(%r8), %ymm4
; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4
; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = mem[1,1,2,2]
; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,1]
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
; AVX512-NEXT: vpandn %ymm13, %ymm14, %ymm13
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4
; AVX512-NEXT: vmovdqa (%r8), %ymm13
; AVX512-NEXT: vpshufb %ymm11, %ymm13, %ymm11
; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,1,1,4,6,5,5]
; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,3,2]
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11
; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
; AVX512-NEXT: vpshufb %ymm13, %ymm4, %ymm4
; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = mem[1,1,2,2]
; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,1]
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
; AVX512-NEXT: vpandn %ymm14, %ymm15, %ymm14
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm4
; AVX512-NEXT: vmovdqa (%r8), %ymm14
; AVX512-NEXT: vpshufb %ymm13, %ymm14, %ymm13
; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,2,1,1,4,6,5,5]
; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,3,2]
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm14, %ymm14
; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6
; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
; AVX512-NEXT: vpshufb %xmm12, %xmm10, %xmm10
; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; AVX512-NEXT: vpshufb %xmm12, %xmm5, %xmm5
; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
; AVX512-NEXT: vpshufb %xmm10, %xmm9, %xmm9
; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0
; AVX512-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm0
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX512-NEXT: vinserti32x4 $2, %xmm9, %zmm1, %zmm1
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm19[0,0,1,1]
; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2
; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm23[0,0,1,1]
; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm5, %zmm5
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm7 & (zmm5 ^ zmm2))
; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & zmm14)
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm18[2,2,3,3]
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm16, %zmm2
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0
; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm7 & (zmm0 ^ zmm2))
; AVX512-NEXT: vporq %zmm24, %zmm26, %zmm2
; AVX512-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,3,3,6,6,7,7]
; AVX512-NEXT: vporq %zmm25, %zmm27, %zmm5
; AVX512-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm12, %zmm1, %zmm1
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm18[0,0,1,1]
; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3
; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm23[0,0,1,1]
; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm7, %zmm7
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm3 ^ (zmm8 & (zmm7 ^ zmm3))
; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm7 & zmm15)
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm19[2,2,3,3]
; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm16, %zmm3
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm8 & (zmm2 ^ zmm3))
; AVX512-NEXT: vpermq {{.*#+}} zmm3 = zmm24[2,2,3,3,6,6,7,7]
; AVX512-NEXT: vpermq {{.*#+}} zmm5 = zmm26[2,2,3,3,6,6,7,7]
; AVX512-NEXT: vporq %zmm3, %zmm5, %zmm3
; AVX512-NEXT: vpermq {{.*#+}} zmm5 = zmm25[2,2,3,3,6,6,7,7]
; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm27[2,2,3,3,6,6,7,7]
; AVX512-NEXT: vporq %zmm5, %zmm7, %zmm5
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm7 & (zmm5 ^ zmm2))
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm20[2,2,3,3]
; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2
; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm7 & (zmm5 ^ zmm3))
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm20[2,2,3,3]
; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3
; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm21[2,2,3,3]
; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6
; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm7 & (zmm6 ^ zmm2))
; AVX512-NEXT: vmovdqa64 (%r8), %zmm2
; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6
; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm3 ^ (zmm7 & (zmm6 ^ zmm3))
; AVX512-NEXT: vmovdqa64 (%r8), %zmm3
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15]
; AVX512-NEXT: vpermd %zmm2, %zmm7, %zmm7
; AVX512-NEXT: vpermd %zmm3, %zmm7, %zmm7
; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5))
; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm0 & mem)
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
; AVX512-NEXT: vpermd %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm6))
; AVX512-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5]
; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 | (zmm2 & mem)
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
; AVX512-NEXT: vpermd %zmm3, %zmm2, %zmm2
; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm6))
; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5]
; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,1,4,4,5,5]
; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3))
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
; AVX512-NEXT: vpermd %zmm2, %zmm3, %zmm2
; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1))
; AVX512-NEXT: vmovdqa64 %zmm11, 64(%r9)
; AVX512-NEXT: vmovdqa64 %zmm2, (%r9)
; AVX512-NEXT: vmovdqa64 %zmm0, 128(%r9)
; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0))
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
; AVX512-NEXT: vpermd %zmm3, %zmm0, %zmm0
; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512-NEXT: vmovdqa64 %zmm13, 64(%r9)
; AVX512-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512-NEXT: vmovdqa64 %zmm2, 128(%r9)
; AVX512-NEXT: vmovdqa64 %zmm7, 256(%r9)
; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r9)
; AVX512-NEXT: vzeroupper
@@ -5005,23 +5007,23 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm11, %ymm0
; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm20
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm8
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm1
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm8, %ymm1
; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21
; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm17
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm12
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm10
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm12, %xmm0
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm0
; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm28
; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm9
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm1
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm1
; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm29
; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm19
; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm18
; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm15
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
; AVX512DQ-NEXT: vpshufb %ymm8, %ymm15, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm15, %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm13
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm13, %ymm1
@@ -5035,16 +5037,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm1
; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm31
; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm23
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
; AVX512DQ-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm14, %ymm5, %ymm0
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
; AVX512DQ-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm12, %ymm8, %ymm0
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm1
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm8, %ymm1
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm11, %ymm1
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
; AVX512DQ-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm8, %ymm11, %ymm1
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm11, %ymm11
@@ -5054,121 +5056,123 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufb %ymm11, %ymm13, %ymm1
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm15, %ymm9
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm25
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm15, %ymm14
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm14, %zmm25
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm9
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm14
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
; AVX512DQ-NEXT: # ymm15 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm15, %ymm13, %ymm13
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm27
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm27
; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm13
; AVX512DQ-NEXT: vpshufb %ymm8, %ymm13, %ymm8
; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm9
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm9, %ymm3
; AVX512DQ-NEXT: vporq %ymm8, %ymm3, %ymm16
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm13, %ymm5
; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm14
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm14, %ymm3
; AVX512DQ-NEXT: vporq %ymm5, %ymm3, %ymm16
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm13, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm15, %ymm9, %ymm3
; AVX512DQ-NEXT: vporq %ymm0, %ymm3, %ymm18
; AVX512DQ-NEXT: vpshufb %ymm15, %ymm14, %ymm3
; AVX512DQ-NEXT: vporq %ymm0, %ymm3, %ymm19
; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3
; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8
; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm15
; AVX512DQ-NEXT: vpshufb %ymm15, %ymm8, %ymm15
; AVX512DQ-NEXT: vpor %ymm0, %ymm15, %ymm15
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm8, %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm15
; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm5
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm15, %ymm5
; AVX512DQ-NEXT: vpor %ymm0, %ymm5, %ymm5
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm15, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm2
; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm2
; AVX512DQ-NEXT: vpshufb %ymm11, %ymm14, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm1
; AVX512DQ-NEXT: vporq %ymm2, %ymm1, %ymm20
; AVX512DQ-NEXT: vpshufb %ymm14, %ymm8, %ymm1
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm2
; AVX512DQ-NEXT: vporq %ymm1, %ymm2, %ymm21
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5
; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm20
; AVX512DQ-NEXT: vpshufb %ymm12, %ymm15, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm1
; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm21
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm8
; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm1
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm2
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm2
; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm3
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm3
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm8, %xmm3
; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm11
; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3
; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm4
; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm4
; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm9
; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX512DQ-NEXT: vpor %xmm4, %xmm9, %xmm9
; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm12
; AVX512DQ-NEXT: vpshufb %xmm12, %xmm3, %xmm12
; AVX512DQ-NEXT: vpor %xmm4, %xmm12, %xmm12
; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm4
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = mem[1,1,2,2]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,1]
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
; AVX512DQ-NEXT: vpandn %ymm13, %ymm14, %ymm13
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4
; AVX512DQ-NEXT: vmovdqa (%r8), %ymm13
; AVX512DQ-NEXT: vpshufb %ymm11, %ymm13, %ymm11
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,1,1,4,6,5,5]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,3,2]
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
; AVX512DQ-NEXT: vpshufb %ymm13, %ymm4, %ymm4
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = mem[1,1,2,2]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,1]
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
; AVX512DQ-NEXT: vpandn %ymm14, %ymm15, %ymm14
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm4
; AVX512DQ-NEXT: vmovdqa (%r8), %ymm14
; AVX512DQ-NEXT: vpshufb %ymm13, %ymm14, %ymm13
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,2,1,1,4,6,5,5]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,3,2]
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm14, %ymm14
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm6
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
; AVX512DQ-NEXT: vpshufb %xmm12, %xmm10, %xmm10
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; AVX512DQ-NEXT: vpshufb %xmm12, %xmm5, %xmm5
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
; AVX512DQ-NEXT: vpshufb %xmm10, %xmm9, %xmm9
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; AVX512DQ-NEXT: vpshufb %xmm10, %xmm0, %xmm0
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm0
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm9, %zmm1, %zmm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm19[0,0,1,1]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm23[0,0,1,1]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm5, %zmm5
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm7 & (zmm5 ^ zmm2))
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & zmm14)
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm18[2,2,3,3]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm16, %zmm2
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm7 & (zmm0 ^ zmm2))
; AVX512DQ-NEXT: vporq %zmm24, %zmm26, %zmm2
; AVX512DQ-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,3,3,6,6,7,7]
; AVX512DQ-NEXT: vporq %zmm25, %zmm27, %zmm5
; AVX512DQ-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm12, %zmm1, %zmm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm18[0,0,1,1]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm23[0,0,1,1]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm7, %zmm7
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm3 ^ (zmm8 & (zmm7 ^ zmm3))
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm7 & zmm15)
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm19[2,2,3,3]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm16, %zmm3
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm8 & (zmm2 ^ zmm3))
; AVX512DQ-NEXT: vpermq {{.*#+}} zmm3 = zmm24[2,2,3,3,6,6,7,7]
; AVX512DQ-NEXT: vpermq {{.*#+}} zmm5 = zmm26[2,2,3,3,6,6,7,7]
; AVX512DQ-NEXT: vporq %zmm3, %zmm5, %zmm3
; AVX512DQ-NEXT: vpermq {{.*#+}} zmm5 = zmm25[2,2,3,3,6,6,7,7]
; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm27[2,2,3,3,6,6,7,7]
; AVX512DQ-NEXT: vporq %zmm5, %zmm7, %zmm5
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm7 & (zmm5 ^ zmm2))
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm20[2,2,3,3]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm7 & (zmm5 ^ zmm3))
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm20[2,2,3,3]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm21[2,2,3,3]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm7 & (zmm6 ^ zmm2))
; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm2
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm3 ^ (zmm7 & (zmm6 ^ zmm3))
; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm7
; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5))
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm0 & mem)
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
; AVX512DQ-NEXT: vpermd %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm6))
; AVX512DQ-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 | (zmm2 & mem)
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm6))
; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5]
; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,1,4,4,5,5]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3))
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1))
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%r9)
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0))
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
; AVX512DQ-NEXT: vpermd %zmm3, %zmm0, %zmm0
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%r9)
; AVX512DQ-NEXT: vzeroupper

@@ -6388,7 +6388,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: kmovd %r10d, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1}
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm7[4,5,6,7,4,5,6,7]
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
; AVX512BW-NEXT: vpshufb %zmm14, %zmm9, %zmm9
; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7]
; AVX512BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492
; AVX512BW-NEXT: kmovd %r10d, %k2
@@ -6420,7 +6421,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm5 {%k1}
; AVX512BW-NEXT: vmovdqa (%r8), %ymm13
; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
; AVX512BW-NEXT: vpshufb %zmm14, %zmm7, %zmm7
; AVX512BW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,2,3,6,6,6,7]
; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm5 {%k2}
; AVX512BW-NEXT: vmovdqa (%r9), %ymm12
@@ -6545,184 +6546,185 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512BW-FCP-LABEL: store_i8_stride6_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm9
; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm1
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm10
; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm2
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
; AVX512BW-FCP-NEXT: vpermw %ymm3, %ymm0, %ymm3
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3
; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm4
; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm5
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm6
; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm7
; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm18
; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm21
; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm21, %xmm8
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm30
; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm24
; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm24, %xmm11
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
; AVX512BW-FCP-NEXT: vpermt2w %zmm8, %zmm13, %zmm11
; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm25
; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm19
; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm19, %xmm8
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm26
; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm22
; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm22, %xmm12
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm16, %zmm8
; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %xmm29
; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm20
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm17 = [8,9,0,0,0,5,6,7]
; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm20, %xmm14
; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm20[0],zero,xmm20[1],zero,xmm20[2],zero,xmm20[3],zero,xmm20[4],zero,xmm20[5],zero,xmm20[6],zero,xmm20[7],zero
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm27 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm27, %zmm12
; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %xmm28
; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm23
; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm18, %xmm14
; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm30, %xmm31
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm31[0],xmm14[0],xmm31[1],xmm14[1],xmm31[2],xmm14[2],xmm31[3],xmm14[3],xmm31[4],xmm14[4],xmm31[5],xmm14[5],xmm31[6],xmm14[6],xmm31[7],xmm14[7]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm30[0],xmm18[0],xmm30[1],xmm18[1],xmm30[2],xmm18[2],xmm30[3],xmm18[3],xmm30[4],xmm18[4],xmm30[5],xmm18[5],xmm30[6],xmm18[6],xmm30[7],xmm18[7]
; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm13, %zmm14
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm31 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7]
; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm25, %xmm13
; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm26, %xmm15
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm16, %zmm13
; AVX512BW-FCP-NEXT: vpshufb %xmm31, %xmm23, %xmm16
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm23[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm27, %zmm15
; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm29, %xmm17
; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm16 = xmm29[0],zero,xmm29[1],zero,xmm29[2],zero,xmm29[3],zero,xmm29[4],zero,xmm29[5],zero,xmm29[6],zero,xmm29[7],zero
; AVX512BW-FCP-NEXT: vpermt2w %zmm17, %zmm27, %zmm16
; AVX512BW-FCP-NEXT: vpshufb %xmm31, %xmm28, %xmm31
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm28[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm27, %zmm17
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm27
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm31 = ymm27[0],ymm7[0],ymm27[1],ymm7[1],ymm27[2],ymm7[2],ymm27[3],ymm7[3],ymm27[4],ymm7[4],ymm27[5],ymm7[5],ymm27[6],ymm7[6],ymm27[7],ymm7[7],ymm27[16],ymm7[16],ymm27[17],ymm7[17],ymm27[18],ymm7[18],ymm27[19],ymm7[19],ymm27[20],ymm7[20],ymm27[21],ymm7[21],ymm27[22],ymm7[22],ymm27[23],ymm7[23]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm30[8],xmm18[8],xmm30[9],xmm18[9],xmm30[10],xmm18[10],xmm30[11],xmm18[11],xmm30[12],xmm18[12],xmm30[13],xmm18[13],xmm30[14],xmm18[14],xmm30[15],xmm18[15]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm0, %zmm18
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm30
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm24 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23]
; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm0, %zmm21
; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %ymm31
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm30[0],ymm6[0],ymm30[1],ymm6[1],ymm30[2],ymm6[2],ymm30[3],ymm6[3],ymm30[4],ymm6[4],ymm30[5],ymm6[5],ymm30[6],ymm6[6],ymm30[7],ymm6[7],ymm30[16],ymm6[16],ymm30[17],ymm6[17],ymm30[18],ymm6[18],ymm30[19],ymm6[19],ymm30[20],ymm6[20],ymm30[21],ymm6[21],ymm30[22],ymm6[22],ymm30[23],ymm6[23]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm26 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm26, %zmm24
; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm29[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm31, %ymm29
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
; AVX512BW-FCP-NEXT: vpermt2w %zmm29, %zmm0, %zmm25
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15]
; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %ymm22
; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm26, %zmm9
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
; AVX512BW-FCP-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm22, %ymm19
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm0, %zmm10
; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm19
; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm19, %ymm1
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm19
; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %ymm1
; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm1, %ymm1
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm23[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm20
; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm0
; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm1
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm6
; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm0
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm2
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15],ymm5[24],ymm4[24],ymm5[25],ymm4[25],ymm5[26],ymm4[26],ymm5[27],ymm4[27],ymm5[28],ymm4[28],ymm5[29],ymm4[29],ymm5[30],ymm4[30],ymm5[31],ymm4[31]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
; AVX512BW-FCP-NEXT: vpermw %ymm4, %ymm5, %ymm4
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
; AVX512BW-FCP-NEXT: vpermw %ymm3, %ymm1, %ymm3
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm5
; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm10
; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm0
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm11
; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm4
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
; AVX512BW-FCP-NEXT: vpermw %ymm8, %ymm4, %ymm8
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: movl $613566756, %eax # imm = 0x24924924
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1}
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[4,5,6,7,4,5,6,7]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7]
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm5
; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm23
; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm17
; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm17, %xmm8
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm25
; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm21
; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm21, %xmm9
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm21[0],xmm17[0],xmm21[1],xmm17[1],xmm21[2],xmm17[2],xmm21[3],xmm17[3],xmm21[4],xmm17[4],xmm21[5],xmm17[5],xmm21[6],xmm17[6],xmm21[7],xmm17[7]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
; AVX512BW-FCP-NEXT: vpermt2w %zmm8, %zmm13, %zmm9
; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm30
; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm18
; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm18, %xmm8
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm31
; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm24
; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm24, %xmm12
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm24[0],xmm18[0],xmm24[1],xmm18[1],xmm24[2],xmm18[2],xmm24[3],xmm18[3],xmm24[4],xmm18[4],xmm24[5],xmm18[5],xmm24[6],xmm18[6],xmm24[7],xmm18[7]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm16, %zmm8
; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %xmm26
; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm20
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm19 = [8,9,0,0,0,5,6,7]
; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm20, %xmm14
; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm20[0],zero,xmm20[1],zero,xmm20[2],zero,xmm20[3],zero,xmm20[4],zero,xmm20[5],zero,xmm20[6],zero,xmm20[7],zero
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm28, %zmm12
; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %xmm27
; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm22
; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm23, %xmm14
; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm25, %xmm29
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm29 = xmm29[0],xmm14[0],xmm29[1],xmm14[1],xmm29[2],xmm14[2],xmm29[3],xmm14[3],xmm29[4],xmm14[4],xmm29[5],xmm14[5],xmm29[6],xmm14[6],xmm29[7],xmm14[7]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm25[0],xmm23[0],xmm25[1],xmm23[1],xmm25[2],xmm23[2],xmm25[3],xmm23[3],xmm25[4],xmm23[4],xmm25[5],xmm23[5],xmm25[6],xmm23[6],xmm25[7],xmm23[7]
; AVX512BW-FCP-NEXT: vpermt2w %zmm29, %zmm13, %zmm14
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7]
; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm30, %xmm13
; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm31, %xmm15
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm31[0],xmm30[0],xmm31[1],xmm30[1],xmm31[2],xmm30[2],xmm31[3],xmm30[3],xmm31[4],xmm30[4],xmm31[5],xmm30[5],xmm31[6],xmm30[6],xmm31[7],xmm30[7]
; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm16, %zmm13
; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm22, %xmm16
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm22[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm28, %zmm15
; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm26, %xmm19
; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm16 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero,xmm26[4],zero,xmm26[5],zero,xmm26[6],zero,xmm26[7],zero
; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm28, %zmm16
; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm27, %xmm29
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm27[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BW-FCP-NEXT: vpermt2w %zmm29, %zmm28, %zmm19
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm28
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm29 = ymm28[0],ymm5[0],ymm28[1],ymm5[1],ymm28[2],ymm5[2],ymm28[3],ymm5[3],ymm28[4],ymm5[4],ymm28[5],ymm5[5],ymm28[6],ymm5[6],ymm28[7],ymm5[7],ymm28[16],ymm5[16],ymm28[17],ymm5[17],ymm28[18],ymm5[18],ymm28[19],ymm5[19],ymm28[20],ymm5[20],ymm28[21],ymm5[21],ymm28[22],ymm5[22],ymm28[23],ymm5[23]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm25[8],xmm23[8],xmm25[9],xmm23[9],xmm25[10],xmm23[10],xmm25[11],xmm23[11],xmm25[12],xmm23[12],xmm25[13],xmm23[13],xmm25[14],xmm23[14],xmm25[15],xmm23[15]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
; AVX512BW-FCP-NEXT: vpermt2w %zmm29, %zmm1, %zmm25
; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm29
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm31[8],xmm30[8],xmm31[9],xmm30[9],xmm31[10],xmm30[10],xmm31[11],xmm30[11],xmm31[12],xmm30[12],xmm31[13],xmm30[13],xmm31[14],xmm30[14],xmm31[15],xmm30[15]
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm30
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm31 = ymm30[0],ymm29[0],ymm30[1],ymm29[1],ymm30[2],ymm29[2],ymm30[3],ymm29[3],ymm30[4],ymm29[4],ymm30[5],ymm29[5],ymm30[6],ymm29[6],ymm30[7],ymm29[7],ymm30[16],ymm29[16],ymm30[17],ymm29[17],ymm30[18],ymm29[18],ymm30[19],ymm29[19],ymm30[20],ymm29[20],ymm30[21],ymm29[21],ymm30[22],ymm29[22],ymm30[23],ymm29[23]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm2, %zmm23
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm21[8],xmm17[8],xmm21[9],xmm17[9],xmm21[10],xmm17[10],xmm21[11],xmm17[11],xmm21[12],xmm17[12],xmm21[13],xmm17[13],xmm21[14],xmm17[14],xmm21[15],xmm17[15]
; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %ymm17
; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm1, %zmm10
; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm17, %ymm21
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm26[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm26 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
; AVX512BW-FCP-NEXT: vpermt2w %zmm21, %zmm26, %zmm11
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm24[8],xmm18[8],xmm24[9],xmm18[9],xmm24[10],xmm18[10],xmm24[11],xmm18[11],xmm24[12],xmm18[12],xmm24[13],xmm18[13],xmm24[14],xmm18[14],xmm24[15],xmm18[15]
; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %ymm21
; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm2, %zmm6
; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm21, %ymm18
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm27[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-FCP-NEXT: vpermt2w %zmm18, %zmm26, %zmm7
; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm18
; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm1
; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm24
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm20[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm26, %zmm18
; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %ymm1
; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm22[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm26, %zmm20
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm24[4,5,6,7,4,5,6,7]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
; AVX512BW-FCP-NEXT: vpshufb %zmm22, %zmm1, %zmm1
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
; AVX512BW-FCP-NEXT: movl $-1840700270, %eax # imm = 0x92492492
; AVX512BW-FCP-NEXT: kmovd %eax, %k2
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k2}
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[4,5,6,7,4,5,6,7]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7]
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[4,5,6,7,4,5,6,7]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
; AVX512BW-FCP-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208
; AVX512BW-FCP-NEXT: kmovq %rax, %k3
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm4 {%k3}
; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm23 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm6, %ymm3
; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm23
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm23[0],ymm3[0],ymm23[1],ymm3[1],ymm23[2],ymm3[2],ymm23[3],ymm3[3],ymm23[4],ymm3[4],ymm23[5],ymm3[5],ymm23[6],ymm3[6],ymm23[7],ymm3[7],ymm23[16],ymm3[16],ymm23[17],ymm3[17],ymm23[18],ymm3[18],ymm23[19],ymm3[19],ymm23[20],ymm3[20],ymm23[21],ymm3[21],ymm23[22],ymm3[22],ymm23[23],ymm3[23]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm30[8],ymm6[8],ymm30[9],ymm6[9],ymm30[10],ymm6[10],ymm30[11],ymm6[11],ymm30[12],ymm6[12],ymm30[13],ymm6[13],ymm30[14],ymm6[14],ymm30[15],ymm6[15],ymm30[24],ymm6[24],ymm30[25],ymm6[25],ymm30[26],ymm6[26],ymm30[27],ymm6[27],ymm30[28],ymm6[28],ymm30[29],ymm6[29],ymm30[30],ymm6[30],ymm30[31],ymm6[31]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm23 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm23, %ymm6
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3}
; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm26 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm29, %ymm1
; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm30, %ymm26
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm26[0],ymm1[0],ymm26[1],ymm1[1],ymm26[2],ymm1[2],ymm26[3],ymm1[3],ymm26[4],ymm1[4],ymm26[5],ymm1[5],ymm26[6],ymm1[6],ymm26[7],ymm1[7],ymm26[16],ymm1[16],ymm26[17],ymm1[17],ymm26[18],ymm1[18],ymm26[19],ymm1[19],ymm26[20],ymm1[20],ymm26[21],ymm1[21],ymm26[22],ymm1[22],ymm26[23],ymm1[23]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm26 = ymm30[8],ymm29[8],ymm30[9],ymm29[9],ymm30[10],ymm29[10],ymm30[11],ymm29[11],ymm30[12],ymm29[12],ymm30[13],ymm29[13],ymm30[14],ymm29[14],ymm30[15],ymm29[15],ymm30[24],ymm29[24],ymm30[25],ymm29[25],ymm30[26],ymm29[26],ymm30[27],ymm29[27],ymm30[28],ymm29[28],ymm30[29],ymm29[29],ymm30[30],ymm29[30],ymm30[31],ymm29[31]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm27 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
; AVX512BW-FCP-NEXT: vpermw %ymm26, %ymm27, %ymm26
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm1, %zmm1
; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm26
; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm28, %ymm3
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm26[0],ymm3[1],ymm26[1],ymm3[2],ymm26[2],ymm3[3],ymm26[3],ymm3[4],ymm26[4],ymm3[5],ymm26[5],ymm3[6],ymm26[6],ymm3[7],ymm26[7],ymm3[16],ymm26[16],ymm3[17],ymm26[17],ymm3[18],ymm26[18],ymm3[19],ymm26[19],ymm3[20],ymm26[20],ymm3[21],ymm26[21],ymm3[22],ymm26[22],ymm3[23],ymm26[23]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm28[8],ymm5[8],ymm28[9],ymm5[9],ymm28[10],ymm5[10],ymm28[11],ymm5[11],ymm28[12],ymm5[12],ymm28[13],ymm5[13],ymm28[14],ymm5[14],ymm28[15],ymm5[15],ymm28[24],ymm5[24],ymm28[25],ymm5[25],ymm28[26],ymm5[26],ymm28[27],ymm5[27],ymm28[28],ymm5[28],ymm28[29],ymm5[29],ymm28[30],ymm5[30],ymm28[31],ymm5[31]
; AVX512BW-FCP-NEXT: vpermw %ymm5, %ymm4, %ymm4
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3
; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm6
; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm27, %ymm2
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[16],ymm6[16],ymm2[17],ymm6[17],ymm2[18],ymm6[18],ymm2[19],ymm6[19],ymm2[20],ymm6[20],ymm2[21],ymm6[21],ymm2[22],ymm6[22],ymm2[23],ymm6[23]
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm27[8],ymm7[8],ymm27[9],ymm7[9],ymm27[10],ymm7[10],ymm27[11],ymm7[11],ymm27[12],ymm7[12],ymm27[13],ymm7[13],ymm27[14],ymm7[14],ymm27[15],ymm7[15],ymm27[24],ymm7[24],ymm27[25],ymm7[25],ymm27[26],ymm7[26],ymm27[27],ymm7[27],ymm27[28],ymm7[28],ymm27[29],ymm7[29],ymm27[30],ymm7[30],ymm27[31],ymm7[31]
; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1}
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm1, %zmm1
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k1}
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm24, %zmm1
; AVX512BW-FCP-NEXT: vpshufb %zmm22, %zmm1, %zmm1
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2}
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k3}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm8 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2}
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm2, %zmm1
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm3 {%k3}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm8 {%k1}
; AVX512BW-FCP-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820
; AVX512BW-FCP-NEXT: kmovq %rax, %k3
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm8 {%k3}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm13 {%k1}
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm13 {%k3}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm24 {%k1}
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm13 {%k3}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm23 {%k1}
; AVX512BW-FCP-NEXT: movl $1227133513, %eax # imm = 0x49249249
; AVX512BW-FCP-NEXT: kmovd %eax, %k2
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm24 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm23 {%k2}
; AVX512BW-FCP-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082
; AVX512BW-FCP-NEXT: kmovq %rax, %k3
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm24 {%k3}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm9 {%k1}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm9 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm9 {%k3}
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm23 {%k3}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm6 {%k1}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm6 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm6 {%k3}
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -6757,7 +6759,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: kmovd %r10d, %k1
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1}
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm7[4,5,6,7,4,5,6,7]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
; AVX512DQ-BW-NEXT: vpshufb %zmm14, %zmm9, %zmm9
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7]
; AVX512DQ-BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492
; AVX512DQ-BW-NEXT: kmovd %r10d, %k2
@@ -6789,7 +6792,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm5 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm13
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
; AVX512DQ-BW-NEXT: vpshufb %zmm14, %zmm7, %zmm7
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,2,3,6,6,6,7]
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm5 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm12
@@ -6914,184 +6917,185 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride6_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm9
; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm10
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm2
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
; AVX512DQ-BW-FCP-NEXT: vpermw %ymm3, %ymm0, %ymm3
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm4
; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm21
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm21, %xmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm30
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm24
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm24, %xmm11
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm8, %zmm13, %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm25
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm19
; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm19, %xmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm26
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm22
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm22, %xmm12
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm16, %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %xmm29
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm20
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm17 = [8,9,0,0,0,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm20, %xmm14
; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm20[0],zero,xmm20[1],zero,xmm20[2],zero,xmm20[3],zero,xmm20[4],zero,xmm20[5],zero,xmm20[6],zero,xmm20[7],zero
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm27 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm27, %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %xmm28
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm23
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm18, %xmm14
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm30, %xmm31
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm31[0],xmm14[0],xmm31[1],xmm14[1],xmm31[2],xmm14[2],xmm31[3],xmm14[3],xmm31[4],xmm14[4],xmm31[5],xmm14[5],xmm31[6],xmm14[6],xmm31[7],xmm14[7]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm30[0],xmm18[0],xmm30[1],xmm18[1],xmm30[2],xmm18[2],xmm30[3],xmm18[3],xmm30[4],xmm18[4],xmm30[5],xmm18[5],xmm30[6],xmm18[6],xmm30[7],xmm18[7]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm13, %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm31 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm25, %xmm13
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm26, %xmm15
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm16, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm31, %xmm23, %xmm16
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm23[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm27, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm29, %xmm17
; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm16 = xmm29[0],zero,xmm29[1],zero,xmm29[2],zero,xmm29[3],zero,xmm29[4],zero,xmm29[5],zero,xmm29[6],zero,xmm29[7],zero
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm17, %zmm27, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm31, %xmm28, %xmm31
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm28[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm27, %zmm17
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm27
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm31 = ymm27[0],ymm7[0],ymm27[1],ymm7[1],ymm27[2],ymm7[2],ymm27[3],ymm7[3],ymm27[4],ymm7[4],ymm27[5],ymm7[5],ymm27[6],ymm7[6],ymm27[7],ymm7[7],ymm27[16],ymm7[16],ymm27[17],ymm7[17],ymm27[18],ymm7[18],ymm27[19],ymm7[19],ymm27[20],ymm7[20],ymm27[21],ymm7[21],ymm27[22],ymm7[22],ymm27[23],ymm7[23]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm30[8],xmm18[8],xmm30[9],xmm18[9],xmm30[10],xmm18[10],xmm30[11],xmm18[11],xmm30[12],xmm18[12],xmm30[13],xmm18[13],xmm30[14],xmm18[14],xmm30[15],xmm18[15]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm0, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm30
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm24 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm0, %zmm21
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %ymm31
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm30[0],ymm6[0],ymm30[1],ymm6[1],ymm30[2],ymm6[2],ymm30[3],ymm6[3],ymm30[4],ymm6[4],ymm30[5],ymm6[5],ymm30[6],ymm6[6],ymm30[7],ymm6[7],ymm30[16],ymm6[16],ymm30[17],ymm6[17],ymm30[18],ymm6[18],ymm30[19],ymm6[19],ymm30[20],ymm6[20],ymm30[21],ymm6[21],ymm30[22],ymm6[22],ymm30[23],ymm6[23]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm26 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm26, %zmm24
; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm29[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm31, %ymm29
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm29, %zmm0, %zmm25
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %ymm22
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm26, %zmm9
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
; AVX512DQ-BW-FCP-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm22, %ymm19
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm0, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm19
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm19, %ymm1
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm19
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %ymm1
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm23[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm1
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm6
; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm2
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15],ymm5[24],ymm4[24],ymm5[25],ymm4[25],ymm5[26],ymm4[26],ymm5[27],ymm4[27],ymm5[28],ymm4[28],ymm5[29],ymm4[29],ymm5[30],ymm4[30],ymm5[31],ymm4[31]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
; AVX512DQ-BW-FCP-NEXT: vpermw %ymm4, %ymm5, %ymm4
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
; AVX512DQ-BW-FCP-NEXT: vpermw %ymm3, %ymm1, %ymm3
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm10
; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm11
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm4
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
; AVX512DQ-BW-FCP-NEXT: vpermw %ymm8, %ymm4, %ymm8
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: movl $613566756, %eax # imm = 0x24924924
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1}
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[4,5,6,7,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm23
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm17
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm17, %xmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm25
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm21
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm21, %xmm9
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm21[0],xmm17[0],xmm21[1],xmm17[1],xmm21[2],xmm17[2],xmm21[3],xmm17[3],xmm21[4],xmm17[4],xmm21[5],xmm17[5],xmm21[6],xmm17[6],xmm21[7],xmm17[7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm8, %zmm13, %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm30
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm18
; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm18, %xmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm31
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm24
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm24, %xmm12
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm24[0],xmm18[0],xmm24[1],xmm18[1],xmm24[2],xmm18[2],xmm24[3],xmm18[3],xmm24[4],xmm18[4],xmm24[5],xmm18[5],xmm24[6],xmm18[6],xmm24[7],xmm18[7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm16, %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %xmm26
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm20
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm19 = [8,9,0,0,0,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm20, %xmm14
; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm20[0],zero,xmm20[1],zero,xmm20[2],zero,xmm20[3],zero,xmm20[4],zero,xmm20[5],zero,xmm20[6],zero,xmm20[7],zero
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm28, %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %xmm27
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm22
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm23, %xmm14
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm25, %xmm29
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm29 = xmm29[0],xmm14[0],xmm29[1],xmm14[1],xmm29[2],xmm14[2],xmm29[3],xmm14[3],xmm29[4],xmm14[4],xmm29[5],xmm14[5],xmm29[6],xmm14[6],xmm29[7],xmm14[7]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm25[0],xmm23[0],xmm25[1],xmm23[1],xmm25[2],xmm23[2],xmm25[3],xmm23[3],xmm25[4],xmm23[4],xmm25[5],xmm23[5],xmm25[6],xmm23[6],xmm25[7],xmm23[7]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm29, %zmm13, %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm30, %xmm13
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm31, %xmm15
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm31[0],xmm30[0],xmm31[1],xmm30[1],xmm31[2],xmm30[2],xmm31[3],xmm30[3],xmm31[4],xmm30[4],xmm31[5],xmm30[5],xmm31[6],xmm30[6],xmm31[7],xmm30[7]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm16, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm22, %xmm16
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm22[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm28, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm26, %xmm19
; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm16 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero,xmm26[4],zero,xmm26[5],zero,xmm26[6],zero,xmm26[7],zero
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm28, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm27, %xmm29
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm27[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm29, %zmm28, %zmm19
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm28
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm29 = ymm28[0],ymm5[0],ymm28[1],ymm5[1],ymm28[2],ymm5[2],ymm28[3],ymm5[3],ymm28[4],ymm5[4],ymm28[5],ymm5[5],ymm28[6],ymm5[6],ymm28[7],ymm5[7],ymm28[16],ymm5[16],ymm28[17],ymm5[17],ymm28[18],ymm5[18],ymm28[19],ymm5[19],ymm28[20],ymm5[20],ymm28[21],ymm5[21],ymm28[22],ymm5[22],ymm28[23],ymm5[23]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm25[8],xmm23[8],xmm25[9],xmm23[9],xmm25[10],xmm23[10],xmm25[11],xmm23[11],xmm25[12],xmm23[12],xmm25[13],xmm23[13],xmm25[14],xmm23[14],xmm25[15],xmm23[15]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm29, %zmm1, %zmm25
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm29
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm31[8],xmm30[8],xmm31[9],xmm30[9],xmm31[10],xmm30[10],xmm31[11],xmm30[11],xmm31[12],xmm30[12],xmm31[13],xmm30[13],xmm31[14],xmm30[14],xmm31[15],xmm30[15]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm30
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm31 = ymm30[0],ymm29[0],ymm30[1],ymm29[1],ymm30[2],ymm29[2],ymm30[3],ymm29[3],ymm30[4],ymm29[4],ymm30[5],ymm29[5],ymm30[6],ymm29[6],ymm30[7],ymm29[7],ymm30[16],ymm29[16],ymm30[17],ymm29[17],ymm30[18],ymm29[18],ymm30[19],ymm29[19],ymm30[20],ymm29[20],ymm30[21],ymm29[21],ymm30[22],ymm29[22],ymm30[23],ymm29[23]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm2, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm21[8],xmm17[8],xmm21[9],xmm17[9],xmm21[10],xmm17[10],xmm21[11],xmm17[11],xmm21[12],xmm17[12],xmm21[13],xmm17[13],xmm21[14],xmm17[14],xmm21[15],xmm17[15]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %ymm17
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm1, %zmm10
; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm17, %ymm21
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm26[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm26 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm21, %zmm26, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm24[8],xmm18[8],xmm24[9],xmm18[9],xmm24[10],xmm18[10],xmm24[11],xmm18[11],xmm24[12],xmm18[12],xmm24[13],xmm18[13],xmm24[14],xmm18[14],xmm24[15],xmm18[15]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %ymm21
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm2, %zmm6
; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm21, %ymm18
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm27[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm18, %zmm26, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm18
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm24
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm20[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm26, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %ymm1
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm22[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm26, %zmm20
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm24[4,5,6,7,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm22, %zmm1, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
; AVX512DQ-BW-FCP-NEXT: movl $-1840700270, %eax # imm = 0x92492492
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k2}
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[4,5,6,7,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[4,5,6,7,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
; AVX512DQ-BW-FCP-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm4 {%k3}
; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm23 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm6, %ymm3
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm23
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm23[0],ymm3[0],ymm23[1],ymm3[1],ymm23[2],ymm3[2],ymm23[3],ymm3[3],ymm23[4],ymm3[4],ymm23[5],ymm3[5],ymm23[6],ymm3[6],ymm23[7],ymm3[7],ymm23[16],ymm3[16],ymm23[17],ymm3[17],ymm23[18],ymm3[18],ymm23[19],ymm3[19],ymm23[20],ymm3[20],ymm23[21],ymm3[21],ymm23[22],ymm3[22],ymm23[23],ymm3[23]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm30[8],ymm6[8],ymm30[9],ymm6[9],ymm30[10],ymm6[10],ymm30[11],ymm6[11],ymm30[12],ymm6[12],ymm30[13],ymm6[13],ymm30[14],ymm6[14],ymm30[15],ymm6[15],ymm30[24],ymm6[24],ymm30[25],ymm6[25],ymm30[26],ymm6[26],ymm30[27],ymm6[27],ymm30[28],ymm6[28],ymm30[29],ymm6[29],ymm30[30],ymm6[30],ymm30[31],ymm6[31]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm23 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm23, %ymm6
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3}
; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm26 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm29, %ymm1
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm30, %ymm26
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm26[0],ymm1[0],ymm26[1],ymm1[1],ymm26[2],ymm1[2],ymm26[3],ymm1[3],ymm26[4],ymm1[4],ymm26[5],ymm1[5],ymm26[6],ymm1[6],ymm26[7],ymm1[7],ymm26[16],ymm1[16],ymm26[17],ymm1[17],ymm26[18],ymm1[18],ymm26[19],ymm1[19],ymm26[20],ymm1[20],ymm26[21],ymm1[21],ymm26[22],ymm1[22],ymm26[23],ymm1[23]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm26 = ymm30[8],ymm29[8],ymm30[9],ymm29[9],ymm30[10],ymm29[10],ymm30[11],ymm29[11],ymm30[12],ymm29[12],ymm30[13],ymm29[13],ymm30[14],ymm29[14],ymm30[15],ymm29[15],ymm30[24],ymm29[24],ymm30[25],ymm29[25],ymm30[26],ymm29[26],ymm30[27],ymm29[27],ymm30[28],ymm29[28],ymm30[29],ymm29[29],ymm30[30],ymm29[30],ymm30[31],ymm29[31]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm27 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
; AVX512DQ-BW-FCP-NEXT: vpermw %ymm26, %ymm27, %ymm26
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm1, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm26
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm28, %ymm3
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm26[0],ymm3[1],ymm26[1],ymm3[2],ymm26[2],ymm3[3],ymm26[3],ymm3[4],ymm26[4],ymm3[5],ymm26[5],ymm3[6],ymm26[6],ymm3[7],ymm26[7],ymm3[16],ymm26[16],ymm3[17],ymm26[17],ymm3[18],ymm26[18],ymm3[19],ymm26[19],ymm3[20],ymm26[20],ymm3[21],ymm26[21],ymm3[22],ymm26[22],ymm3[23],ymm26[23]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm28[8],ymm5[8],ymm28[9],ymm5[9],ymm28[10],ymm5[10],ymm28[11],ymm5[11],ymm28[12],ymm5[12],ymm28[13],ymm5[13],ymm28[14],ymm5[14],ymm28[15],ymm5[15],ymm28[24],ymm5[24],ymm28[25],ymm5[25],ymm28[26],ymm5[26],ymm28[27],ymm5[27],ymm28[28],ymm5[28],ymm28[29],ymm5[29],ymm28[30],ymm5[30],ymm28[31],ymm5[31]
; AVX512DQ-BW-FCP-NEXT: vpermw %ymm5, %ymm4, %ymm4
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm6
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm27, %ymm2
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[16],ymm6[16],ymm2[17],ymm6[17],ymm2[18],ymm6[18],ymm2[19],ymm6[19],ymm2[20],ymm6[20],ymm2[21],ymm6[21],ymm2[22],ymm6[22],ymm2[23],ymm6[23]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm27[8],ymm7[8],ymm27[9],ymm7[9],ymm27[10],ymm7[10],ymm27[11],ymm7[11],ymm27[12],ymm7[12],ymm27[13],ymm7[13],ymm27[14],ymm7[14],ymm27[15],ymm7[15],ymm27[24],ymm7[24],ymm27[25],ymm7[25],ymm27[26],ymm7[26],ymm27[27],ymm7[27],ymm27[28],ymm7[28],ymm27[29],ymm7[29],ymm27[30],ymm7[30],ymm27[31],ymm7[31]
; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm1, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm24, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm22, %zmm1, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k3}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm8 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm2, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm3 {%k3}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm8 {%k1}
; AVX512DQ-BW-FCP-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm8 {%k3}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm13 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm13 {%k3}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm24 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm13 {%k3}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm23 {%k1}
; AVX512DQ-BW-FCP-NEXT: movl $1227133513, %eax # imm = 0x49249249
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm24 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm23 {%k2}
; AVX512DQ-BW-FCP-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm24 {%k3}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm9 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm9 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm9 {%k3}
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm23 {%k3}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm6 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm6 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm6 {%k3}
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64

File diff suppressed because it is too large

View File

@@ -1487,81 +1487,79 @@ define <8 x i32> @reduce_ctpop_v4i64_buildvector_v8i32(<4 x i64> %a0, <4 x i64>
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm0, %ymm8, %ymm0
; AVX512VL-NEXT: vpshufb %ymm0, %ymm10, %ymm0
; AVX512VL-NEXT: vpaddb %ymm0, %ymm9, %ymm9
; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpsadbw %ymm0, %ymm9, %ymm9
; AVX512VL-NEXT: vpaddb %ymm0, %ymm9, %ymm0
; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
; AVX512VL-NEXT: vpsadbw %ymm0, %ymm9, %ymm0
; AVX512VL-NEXT: vpand %ymm1, %ymm8, %ymm11
; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm1, %ymm8, %ymm1
; AVX512VL-NEXT: vpshufb %ymm1, %ymm10, %ymm1
; AVX512VL-NEXT: vpaddb %ymm1, %ymm11, %ymm1
; AVX512VL-NEXT: vpsadbw %ymm0, %ymm1, %ymm1
; AVX512VL-NEXT: vpsadbw %ymm1, %ymm9, %ymm1
; AVX512VL-NEXT: vpand %ymm2, %ymm8, %ymm11
; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11
; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512VL-NEXT: vpand %ymm2, %ymm8, %ymm2
; AVX512VL-NEXT: vpshufb %ymm2, %ymm10, %ymm2
; AVX512VL-NEXT: vpaddb %ymm2, %ymm11, %ymm2
; AVX512VL-NEXT: vpsadbw %ymm0, %ymm2, %ymm2
; AVX512VL-NEXT: vpsadbw %ymm2, %ymm9, %ymm2
; AVX512VL-NEXT: vpand %ymm3, %ymm8, %ymm11
; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11
; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512VL-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512VL-NEXT: vpshufb %ymm3, %ymm10, %ymm3
; AVX512VL-NEXT: vpaddb %ymm3, %ymm11, %ymm3
; AVX512VL-NEXT: vpsadbw %ymm0, %ymm3, %ymm3
; AVX512VL-NEXT: vpsadbw %ymm3, %ymm9, %ymm3
; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm11
; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11
; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm4
; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4
; AVX512VL-NEXT: vpshufb %ymm4, %ymm10, %ymm4
; AVX512VL-NEXT: vpaddb %ymm4, %ymm11, %ymm4
; AVX512VL-NEXT: vpsadbw %ymm0, %ymm4, %ymm4
; AVX512VL-NEXT: vpsadbw %ymm4, %ymm9, %ymm4
; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm11
; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11
; AVX512VL-NEXT: vpsrlw $4, %ymm5, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-NEXT: vpshufb %ymm5, %ymm10, %ymm5
; AVX512VL-NEXT: vpaddb %ymm5, %ymm11, %ymm5
; AVX512VL-NEXT: vpsadbw %ymm0, %ymm5, %ymm5
; AVX512VL-NEXT: vpsadbw %ymm5, %ymm9, %ymm5
; AVX512VL-NEXT: vpand %ymm6, %ymm8, %ymm11
; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11
; AVX512VL-NEXT: vpsrlw $4, %ymm6, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512VL-NEXT: vpshufb %ymm6, %ymm10, %ymm6
; AVX512VL-NEXT: vpaddb %ymm6, %ymm11, %ymm6
; AVX512VL-NEXT: vpsadbw %ymm0, %ymm6, %ymm6
; AVX512VL-NEXT: vpsadbw %ymm6, %ymm9, %ymm6
; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm11
; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11
; AVX512VL-NEXT: vpsrlw $4, %ymm7, %ymm7
; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm7
; AVX512VL-NEXT: vpshufb %ymm7, %ymm10, %ymm7
; AVX512VL-NEXT: vpaddb %ymm7, %ymm11, %ymm7
; AVX512VL-NEXT: vpsadbw %ymm0, %ymm7, %ymm7
; AVX512VL-NEXT: vpmovqb %ymm9, %xmm8
; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
; AVX512VL-NEXT: vpsadbw %xmm9, %xmm8, %xmm8
; AVX512VL-NEXT: vpsadbw %ymm7, %ymm9, %ymm7
; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VL-NEXT: vpmovqb %ymm1, %xmm1
; AVX512VL-NEXT: vpsadbw %xmm1, %xmm9, %xmm1
; AVX512VL-NEXT: vpmovqb %ymm2, %xmm2
; AVX512VL-NEXT: vpsadbw %xmm2, %xmm9, %xmm2
; AVX512VL-NEXT: vpmovqb %ymm3, %xmm3
; AVX512VL-NEXT: vpsadbw %xmm3, %xmm9, %xmm3
; AVX512VL-NEXT: vpmovqb %ymm4, %xmm4
; AVX512VL-NEXT: vpmovqb %ymm5, %xmm5
; AVX512VL-NEXT: vpmovqb %ymm6, %xmm6
; AVX512VL-NEXT: vpmovqb %ymm7, %xmm7
; AVX512VL-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
; AVX512VL-NEXT: vpsadbw %ymm0, %ymm6, %ymm6
; AVX512VL-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsadbw %ymm0, %ymm4, %ymm0
; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,4,8,12]
; AVX512VL-NEXT: vpermi2d %ymm6, %ymm0, %ymm4
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,8,0,8]
; AVX512VL-NEXT: vpermi2d %ymm3, %ymm2, %ymm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm5
; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512VL-NEXT: vpsadbw %zmm5, %zmm4, %zmm4
; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,12,8,12,8,12,0,4]
; AVX512VL-NEXT: vpermd %zmm4, %zmm6, %zmm4
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VL-NEXT: vpsadbw %zmm5, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,12,0,4]
; AVX512VL-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
; AVX512VL-NEXT: retq
;
@@ -1576,30 +1574,27 @@ define <8 x i32> @reduce_ctpop_v4i64_buildvector_v8i32(<4 x i64> %a0, <4 x i64>
; AVX512VPOPCNT-NEXT: vpopcntq %ymm6, %ymm6
; AVX512VPOPCNT-NEXT: vpopcntq %ymm7, %ymm7
; AVX512VPOPCNT-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VPOPCNT-NEXT: vpxor %xmm8, %xmm8, %xmm8
; AVX512VPOPCNT-NEXT: vpsadbw %xmm0, %xmm8, %xmm0
; AVX512VPOPCNT-NEXT: vpmovqb %ymm1, %xmm1
; AVX512VPOPCNT-NEXT: vpsadbw %xmm1, %xmm8, %xmm1
; AVX512VPOPCNT-NEXT: vpmovqb %ymm2, %xmm2
; AVX512VPOPCNT-NEXT: vpsadbw %xmm2, %xmm8, %xmm2
; AVX512VPOPCNT-NEXT: vpmovqb %ymm3, %xmm3
; AVX512VPOPCNT-NEXT: vpsadbw %xmm3, %xmm8, %xmm3
; AVX512VPOPCNT-NEXT: vpmovqb %ymm4, %xmm4
; AVX512VPOPCNT-NEXT: vpmovqb %ymm5, %xmm5
; AVX512VPOPCNT-NEXT: vpmovqb %ymm6, %xmm6
; AVX512VPOPCNT-NEXT: vpmovqb %ymm7, %xmm7
; AVX512VPOPCNT-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,8,0,8]
; AVX512VPOPCNT-NEXT: vpermt2d %ymm3, %ymm8, %ymm2
; AVX512VPOPCNT-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VPOPCNT-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm1
; AVX512VPOPCNT-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VPOPCNT-NEXT: vpsadbw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm3
; AVX512VPOPCNT-NEXT: vpsadbw %ymm2, %ymm3, %ymm2
; AVX512VPOPCNT-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,0,4,8,12]
; AVX512VPOPCNT-NEXT: vpermi2d %ymm1, %ymm2, %ymm3
; AVX512VPOPCNT-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm5
; AVX512VPOPCNT-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
; AVX512VPOPCNT-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512VPOPCNT-NEXT: vpsadbw %zmm5, %zmm4, %zmm4
; AVX512VPOPCNT-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,12,8,12,8,12,0,4]
; AVX512VPOPCNT-NEXT: vpermd %zmm4, %zmm6, %zmm4
; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
; AVX512VPOPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VPOPCNT-NEXT: vpsadbw %zmm5, %zmm0, %zmm0
; AVX512VPOPCNT-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,12,0,4]
; AVX512VPOPCNT-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512VPOPCNT-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
; AVX512VPOPCNT-NEXT: retq
%p0 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a0)
%p1 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a1)

View File

@@ -194,35 +194,21 @@ define void @widen_fdiv_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm9 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm9, %xmm8, %xmm8
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm9 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm10 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm10, %xmm9, %xmm9
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm10 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm11, %xmm10, %xmm10
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm12 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm12, %xmm11, %xmm11
; AVX512VL-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX512VL-NEXT: vdivps %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vdivps %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm11, %zmm10, %zmm0
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm9, %zmm8, %zmm2
; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
; AVX512VL-NEXT: vdivps %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,2,8,10,0,2,8,10]
; AVX512VL-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512VL-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1
; AVX512VL-NEXT: vmovups (%rdi), %ymm0
; AVX512VL-NEXT: vdivps (%rsi), %ymm0, %ymm0
; AVX512VL-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq