clang-p2996/llvm/test/CodeGen/X86/pr29112.ll
Simon Pilgrim 01ae462fef [X86][SSE] Combine (some) target shuffles with multiple uses
As discussed on D41794, there are many cases where we fail to combine shuffles because the input operands have other uses.

This patch permits these shuffles to be combined as long as they don't introduce additional variable shuffle masks, which should reduce instruction dependencies and allow the total number of shuffles to still drop without increasing the constant pool.

However, this may mean that some memory folds no longer occur, and on pre-AVX targets it can require the occasional extra register move.

This also exposes some poor PMULDQ/PMULUDQ codegen that performs unnecessary upper/lower calculations which will in fact fold to zero/undef; the fix will be added in a followup commit.
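For illustration only, a minimal IR-level sketch of the kind of pattern involved (the function and value names are hypothetical, not taken from the patch or the test below; the actual combine runs on target shuffle nodes in the x86 backend, but the IR form shows the multi-use constraint): %t feeds both the fadd and the second shuffle, so it has multiple uses, yet %s can still be folded into a single fixed-mask shuffle of the original operand because no variable shuffle mask is introduced.

define <4 x float> @multi_use_shuffle(<4 x float> %a, <4 x float> %b) {
  ; %t has two uses: the second shuffle and the fadd.
  %t = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ; %s only reads lanes that originate from %a, so it can be rewritten as a
  ; single constant-mask shuffle of %a despite %t having another use.
  %s = shufflevector <4 x float> %t, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %r = fadd <4 x float> %t, %s
  ret <4 x float> %r
}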

Differential Revision: https://reviews.llvm.org/D50328

llvm-svn: 339335
2018-08-09 12:30:02 +00:00

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s
declare <4 x float> @foo(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>)
; Due to a bug in X86RegisterInfo::getLargestLegalSuperClass, this test case was
; trying to use XMM16 and spill it without VLX support for the necessary store
; instruction. We briefly implemented the spill using VEXTRACTF32X4, but the bug
; in getLargestLegalSuperClass has now been fixed, so we no longer use XMM16.
define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) {
; CHECK-LABEL: bar:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 80
; CHECK-NEXT: vmovaps %xmm1, %xmm8
; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm5
; CHECK-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[0,1],xmm2[1],xmm10[3]
; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0],xmm1[1],xmm5[2,3]
; CHECK-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
; CHECK-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm6[0,1],xmm2[1],xmm6[3]
; CHECK-NEXT: vextractf32x4 $3, %zmm3, %xmm7
; CHECK-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1],xmm4[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm11[0,1,2],xmm3[1]
; CHECK-NEXT: vaddps %xmm4, %xmm6, %xmm12
; CHECK-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm5[0],xmm7[2],zero,zero
; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm4[0]
; CHECK-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[3,1,2,3]
; CHECK-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm5[0],xmm1[2],zero,zero
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1,2],xmm3[1]
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm2[1],xmm9[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm4[0,1],xmm2[1],xmm4[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0,1],xmm2[3],xmm10[3]
; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3]
; CHECK-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm3[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0,1,2],xmm3[1]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm2
; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm13, %xmm1, %xmm9
; CHECK-NEXT: vaddps %xmm12, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm3
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovaps %xmm9, (%rsp)
; CHECK-NEXT: vmovaps %xmm8, %xmm3
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: addq $72, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%a2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
%a5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 27>
%a6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 3, i32 20, i32 1, i32 17>
%a7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
%a8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 19>
%a9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%a10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ax2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
%ax5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ax6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 18>
%ax7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 1, i32 20, i32 1, i32 17>
%ax8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
%ax9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ax10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ay2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ay5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 28, i32 1, i32 17>
%ay6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 17>
%ay7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 30, i32 1, i32 22>
%ay8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ay9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 17>
%ay10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 3, i32 18>
%r1 = fadd <4 x float> %ay10, %ay9
%r2 = fadd <4 x float> %ay8, %ay7
%r3 = fadd <4 x float> %ay6, %ay5
%r4 = fadd <4 x float> %ay2, %ax10
%r5 = fadd <4 x float> %ay9, %ax8
%r6 = fadd <4 x float> %r5, %r3
%r7 = fadd <4 x float> %a9, %r6
%a11 = call <4 x float> @foo(<4 x float> %r7, <4 x float> %a10, <4 x float> %r1, <4 x float> %a4, <4 x float> %a5, <4 x float> %a6, <4 x float> %a7, <4 x float> %a8, <4 x float> %r2, <4 x float> %r4)
%a12 = fadd <4 x float> %a2, %a1
%a13 = fadd <4 x float> %a12, %a11
ret <4 x float> %a13
}