diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 244b0f9410b8..9ea45513cc01 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4311,6 +4311,25 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
     }
   }
 
+  if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+    EVT VT = N->getValueType(0);
+    SDValue Src = N->getOperand(0);
+    uint64_t Idx = N->getConstantOperandVal(1);
+
+    // Collect all the subvectors from the source vector and slice off the
+    // extraction.
+    SmallVector<SDValue> SrcOps;
+    if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
+        VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
+        (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
+        (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
+      unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
+      unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
+      Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
+      return true;
+    }
+  }
+
   return false;
 }
 
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index 782a81be4760..674bad2c7aa8 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -740,16 +740,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX512-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX512-NEXT:    vmovdqa %xmm2, 32(%rcx)
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i8_stride3_vf16:
@@ -763,16 +762,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa %xmm2, 32(%rcx)
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i8_stride3_vf16:
@@ -786,16 +784,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i8_stride3_vf16:
@@ -809,16 +806,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: store_i8_stride3_vf16:
@@ -832,16 +828,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX512BW-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX512BW-NEXT:    vmovdqa %xmm2, 32(%rcx)
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: store_i8_stride3_vf16:
@@ -855,16 +850,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, 32(%rcx)
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: store_i8_stride3_vf16:
@@ -878,16 +872,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride3_vf16:
@@ -901,16 +894,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
   %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 47a6022e428c..7cddebdca5cc 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -962,16 +962,15 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqu %xmm0, 16(%rdi)
+; AVX512-NEXT:    vmovdqu %xmm1, (%rdi)
 ; AVX512-NEXT:    vmovdqu %xmm2, 32(%rdi)
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32>
 %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32>