If we don't demand the same element from both single-source shuffles (permutes), then attempt to blend the sources together first and then perform a merged permute. For vXi16 blends we have to be careful, as these are much more likely to involve byte/word vector shuffles that will result in the creation of additional shuffle instructions. This fold might be worth it for VSELECT with constant masks on AVX512 targets; I haven't investigated this yet, but I've tried to write combineBlendOfPermutes so that it is prepared for this. The PR34592 -O0 regression is an unfortunate failure to clean up with a later pass that calls SimplifyDemandedElts, as the -O3 pipeline does - I'm not sure how worried we should be, to be honest.
83 lines
5.0 KiB
LLVM
83 lines
5.0 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 -O0 | FileCheck %s --check-prefixes=CHECK-O0
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 -O3 | FileCheck %s --check-prefixes=CHECK-O3
|
|
|
|
; @pluto: codegen regression test body.
;
; Builds a <16 x i64> result from three constant-mask vector selects over the
; arguments (one of them against zeroinitializer) followed by a single
; two-source shufflevector of the select results. %arg4 is accepted but never
; referenced in the body.
;
; The assertion lines between the define and the first basic block are
; autogenerated (see the autogeneration note at the top of the file); they
; should be regenerated with utils/update_llc_test_checks.py rather than edited
; by hand. There is one group per optimization level exercised by the run
; lines: first the -O0 expectations, then the -O3 expectations.
define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <16 x i64> %arg3, <16 x i64> %arg4) nounwind {
|
|
; CHECK-O0-LABEL: pluto:
|
|
; CHECK-O0: # %bb.0: # %bb
|
|
; CHECK-O0-NEXT: pushq %rbp
|
|
; CHECK-O0-NEXT: movq %rsp, %rbp
|
|
; CHECK-O0-NEXT: andq $-32, %rsp
|
|
; CHECK-O0-NEXT: subq $64, %rsp
|
|
; CHECK-O0-NEXT: vmovaps %ymm4, %ymm10
|
|
; CHECK-O0-NEXT: vmovaps %ymm3, %ymm9
|
|
; CHECK-O0-NEXT: vmovaps %ymm2, (%rsp) # 32-byte Spill
|
|
; CHECK-O0-NEXT: vmovaps %ymm1, %ymm8
|
|
; CHECK-O0-NEXT: vmovaps %ymm0, %ymm3
|
|
; CHECK-O0-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload
|
|
; CHECK-O0-NEXT: vmovaps 240(%rbp), %ymm4
|
|
; CHECK-O0-NEXT: vmovaps 208(%rbp), %ymm1
|
|
; CHECK-O0-NEXT: vmovaps 176(%rbp), %ymm2
|
|
; CHECK-O0-NEXT: vmovaps 144(%rbp), %ymm2
|
|
; CHECK-O0-NEXT: vmovaps 112(%rbp), %ymm11
|
|
; CHECK-O0-NEXT: vmovaps 80(%rbp), %ymm11
|
|
; CHECK-O0-NEXT: vmovaps 48(%rbp), %ymm11
|
|
; CHECK-O0-NEXT: vmovaps 16(%rbp), %ymm11
|
|
; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
|
|
; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
|
|
; CHECK-O0-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
|
|
; CHECK-O0-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,1]
|
|
; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7]
|
|
; CHECK-O0-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm6[0,1]
|
|
; CHECK-O0-NEXT: vxorps %xmm3, %xmm3, %xmm3
|
|
; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
|
|
; CHECK-O0-NEXT: vmovaps %xmm1, %xmm3
|
|
; CHECK-O0-NEXT: vmovaps %xmm7, %xmm1
|
|
; CHECK-O0-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
|
|
; CHECK-O0-NEXT: # implicit-def: $ymm1
|
|
; CHECK-O0-NEXT: vmovaps %xmm3, %xmm1
|
|
; CHECK-O0-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,3]
|
|
; CHECK-O0-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
|
|
; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3,4,5],ymm1[6,7]
|
|
; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
|
|
; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
|
|
; CHECK-O0-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
|
|
; CHECK-O0-NEXT: movq %rbp, %rsp
|
|
; CHECK-O0-NEXT: popq %rbp
|
|
; CHECK-O0-NEXT: retq
|
|
;
|
|
; CHECK-O3-LABEL: pluto:
|
|
; CHECK-O3: # %bb.0: # %bb
|
|
; CHECK-O3-NEXT: pushq %rbp
|
|
; CHECK-O3-NEXT: movq %rsp, %rbp
|
|
; CHECK-O3-NEXT: andq $-32, %rsp
|
|
; CHECK-O3-NEXT: subq $32, %rsp
|
|
; CHECK-O3-NEXT: vmovdqa 208(%rbp), %ymm3
|
|
; CHECK-O3-NEXT: vmovdqa 144(%rbp), %ymm0
|
|
; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6,7]
|
|
; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
|
|
; CHECK-O3-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
|
|
; CHECK-O3-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,2,1]
|
|
; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
|
|
; CHECK-O3-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1]
|
|
; CHECK-O3-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
|
; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
|
|
; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
|
|
; CHECK-O3-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
|
|
; CHECK-O3-NEXT: vpbroadcastq 248(%rbp), %ymm4
|
|
; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5],ymm1[6,7]
|
|
; CHECK-O3-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
|
|
; CHECK-O3-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
|
|
; CHECK-O3-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
|
|
; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7]
|
|
; CHECK-O3-NEXT: movq %rbp, %rsp
|
|
; CHECK-O3-NEXT: popq %rbp
|
|
; CHECK-O3-NEXT: retq
|
|
bb:
|
|
; %tmp: lanes 10 and 11 come from %arg; every other lane comes from %arg1.
%tmp = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <16 x i64> %arg, <16 x i64> %arg1
|
|
; %tmp5: lanes 0, 3, 4, 7 and 9 come from %arg2; every other lane is zero.
%tmp5 = select <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x i64> %arg2, <16 x i64> zeroinitializer
|
|
; %tmp6: lanes 1, 2, 3, 8, 9, 13, 14 and 15 come from %arg3; the remaining
; lanes fall through to %tmp5 (i.e. %arg2 or zero).
%tmp6 = select <16 x i1> <i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true>, <16 x i64> %arg3, <16 x i64> %tmp5
|
|
; %tmp7: two-source shuffle. Mask indices 0-15 pick lanes of %tmp; indices
; 16-31 pick lanes of %tmp6 (index minus 16), so 18, 24, 29, 28, 22 and 25
; read %tmp6 lanes 2, 8, 13, 12, 6 and 9 respectively.
%tmp7 = shufflevector <16 x i64> %tmp, <16 x i64> %tmp6, <16 x i32> <i32 11, i32 18, i32 24, i32 9, i32 14, i32 29, i32 29, i32 6, i32 14, i32 28, i32 8, i32 9, i32 22, i32 12, i32 25, i32 6>
|
|
ret <16 x i64> %tmp7
|
|
}
|