clang-p2996/llvm/test/CodeGen/X86/pr29112.ll
Wang, Xin10 88eae6ef9f [DAGCombine] Expand usage of CreateBuildVecShuffle to make full use of vector ops
When llc encounters a DAG in which many extract_vector_elt nodes feed a
BUILD_VECTOR, it replaces the pattern with a vector_shuffle to shrink the
generated code; this is done in createBuildVecShuffle in DAGCombiner.cpp.
Previously, that code could not handle the case where the source vector
register is more than twice the size of the destination.

Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D139685
2023-01-23 11:45:38 +08:00
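
For context, createBuildVecShuffle targets the pattern in which a BUILD_VECTOR
is fed entirely by extract_vector_elt nodes taken from one or two source
vectors, and rewrites it as a single vector_shuffle. The sketch below (a
hypothetical function, not part of pr29112.ll) shows the IR-level equivalent:
a <4 x float> assembled lane by lane from a <16 x float> source, i.e. a source
four times the destination width, which is the more-than-twice-the-destination
case this commit enables.

; Hypothetical illustration only. Each destination lane comes from an
; individual extract; during instruction selection this becomes a
; BUILD_VECTOR of extract_vector_elt nodes.
define <4 x float> @build_from_extracts(<16 x float> %v) {
  %e0 = extractelement <16 x float> %v, i32 4
  %e1 = extractelement <16 x float> %v, i32 1
  %e2 = extractelement <16 x float> %v, i32 5
  %e3 = extractelement <16 x float> %v, i32 3
  %v0 = insertelement <4 x float> poison, float %e0, i32 0
  %v1 = insertelement <4 x float> %v0, float %e1, i32 1
  %v2 = insertelement <4 x float> %v1, float %e2, i32 2
  %v3 = insertelement <4 x float> %v2, float %e3, i32 3
  ret <4 x float> %v3
}

; After the combine, the whole chain collapses to the equivalent of one
; shuffle, even though the <16 x float> source is wider than twice the
; <4 x float> result:
define <4 x float> @as_shuffle(<16 x float> %v) {
  %s = shufflevector <16 x float> %v, <16 x float> poison, <4 x i32> <i32 4, i32 1, i32 5, i32 3>
  ret <4 x float> %s
}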


; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s
declare <4 x float> @foo(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>)
; Due to a bug in X86RegisterInfo::getLargestLegalSuperClass, this test case was
; trying to use XMM16 and spill it without VLX support for the necessary store
; instruction. We briefly implemented the spill using VEXTRACTF32X4, but the bug
; in getLargestLegalSuperClass has now been fixed, so we no longer use XMM16.
define <4 x float> @bar(ptr %a1p, ptr %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float> %c1, <16 x float> %c2) {
; CHECK-LABEL: bar:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 144
; CHECK-NEXT: vmovaps %xmm1, %xmm13
; CHECK-NEXT: vmovaps {{.*#+}} xmm5 = [3,20,1,17]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm5
; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,1,2,3]
; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [4,21,1,17,4,21,5,21]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0
; CHECK-NEXT: vmovaps %zmm0, %zmm6
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,20,1,27]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4
; CHECK-NEXT: vmovaps {{.*#+}} ymm7 = [5,20,1,19,5,20,5,23]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7
; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [4,20,1,19,4,20,5,23]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0
; CHECK-NEXT: vmovaps {{.*#+}} xmm12 = [4,28,1,17]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm12
; CHECK-NEXT: vmovaps {{.*#+}} ymm8 = [5,20,1,17,5,20,5,21]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8
; CHECK-NEXT: vmovaps {{.*#+}} xmm9 = [4,30,1,22]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm9
; CHECK-NEXT: vmovaps {{.*#+}} ymm10 = [4,22,1,17,4,22,5,21]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10
; CHECK-NEXT: vmovaps {{.*#+}} ymm11 = [4,20,3,18,4,20,7,22]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm11
; CHECK-NEXT: vaddps %xmm10, %xmm11, %xmm2
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vaddps %xmm1, %xmm9, %xmm3
; CHECK-NEXT: vaddps %xmm12, %xmm8, %xmm9
; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm8
; CHECK-NEXT: vaddps %xmm0, %xmm10, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm9, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovaps %xmm3, (%rsp)
; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 killed $ymm1
; CHECK-NEXT: vmovaps %xmm13, %xmm3
; CHECK-NEXT: # kill: def $xmm4 killed $xmm4 killed $zmm4
; CHECK-NEXT: # kill: def $xmm5 killed $xmm5 killed $zmm5
; CHECK-NEXT: # kill: def $xmm6 killed $xmm6 killed $zmm6
; CHECK-NEXT: # kill: def $xmm7 killed $xmm7 killed $zmm7
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo@PLT
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%a1 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%a2 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
%a5 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 20, i32 1, i32 27>
%a6 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 3, i32 20, i32 1, i32 17>
%a7 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
%a8 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 5, i32 20, i32 1, i32 19>
%a9 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%a10 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ax2 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
%ax5 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ax6 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 22, i32 1, i32 18>
%ax7 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 1, i32 20, i32 1, i32 17>
%ax8 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
%ax9 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ax10 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ay2 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ay5 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 28, i32 1, i32 17>
%ay6 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 5, i32 20, i32 1, i32 17>
%ay7 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 30, i32 1, i32 22>
%ay8 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ay9 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 22, i32 1, i32 17>
%ay10 = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 20, i32 3, i32 18>
%r1 = fadd <4 x float> %ay10, %ay9
%r2 = fadd <4 x float> %ay8, %ay7
%r3 = fadd <4 x float> %ay6, %ay5
%r4 = fadd <4 x float> %ay2, %ax10
%r5 = fadd <4 x float> %ay9, %ax8
%r6 = fadd <4 x float> %r5, %r3
%r7 = fadd <4 x float> %a9, %r6
%a11 = call <4 x float> @foo(<4 x float> %r7, <4 x float> %a10, <4 x float> %r1, <4 x float> %a4, <4 x float> %a5, <4 x float> %a6, <4 x float> %a7, <4 x float> %a8, <4 x float> %r2, <4 x float> %r4)
%a12 = fadd <4 x float> %a2, %a1
%a13 = fadd <4 x float> %a12, %a11
ret <4 x float> %a13
}
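
For reference, a note on the masks above (standard shufflevector semantics, not
part of the test file): with two <16 x float> operands the mask indexes their
concatenation, so indices 0-15 select lanes of %c1 and indices 16-31 select
lanes of %c2. For example:

; %a1's mask <i32 4, i32 20, i32 1, i32 17> selects <%c1[4], %c2[4], %c1[1], %c2[1]>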