This stops reporting CostPerUse 1 for `R8`-`R15` and `XMM8`-`XMM31`.
This was previously done because instruction encoding requires a REX
prefix when using these registers, resulting in longer encodings. I
found that this regresses the quality of the register allocation, as the
costs impose an ordering on eviction candidates. I also feel there is a
bit of an impedance mismatch: the actual costs occur when encoding
instructions that use those registers, but the order in which VRegs are
assigned is not primarily determined by their number of Defs+Uses.
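To make the eviction-ordering point concrete, here is a minimal standalone
sketch (not LLVM's actual RAGreedy/CostPerUse code; the register names,
weights, and the 0.5 scale factor are invented for illustration). Once a
per-use encoding cost is folded into each candidate's score, the choice of
which interference to evict is no longer driven by spill weights alone:

// Toy illustration only: a per-use encoding cost folded into eviction
// scoring can make the allocator evict heavier interference just to stay
// out of a REX-prefixed register.
#include <algorithm>
#include <cstdio>
#include <vector>

struct EvictionCandidate {
  const char *PhysReg;      // register the new value would be assigned to
  float InterferenceWeight; // spill weight of the ranges evicted from it
  unsigned CostPerUse;      // hypothetical: 1 for REX-prefixed regs, else 0
};

// Lower score wins; the 0.5f scale is arbitrary and only shows how an
// encoding cost can outweigh a genuine spill-weight difference.
static float score(const EvictionCandidate &C, bool UseCostPerUse) {
  return C.InterferenceWeight + (UseCostPerUse ? 0.5f * C.CostPerUse : 0.0f);
}

int main() {
  std::vector<EvictionCandidate> Cands = {
      {"rax", 2.0f, 0}, // heavier interference, but no REX prefix needed
      {"r8", 1.8f, 1},  // lighter interference, but R8 needs a REX prefix
  };
  for (bool UseCostPerUse : {false, true}) {
    const EvictionCandidate &Best = *std::min_element(
        Cands.begin(), Cands.end(),
        [&](const EvictionCandidate &A, const EvictionCandidate &B) {
          return score(A, UseCostPerUse) < score(B, UseCostPerUse);
        });
    std::printf("CostPerUse %s -> evict from %%%s\n",
                UseCostPerUse ? "on " : "off", Best.PhysReg);
  }
  // Prints: off -> %r8 (spill-weight driven), on -> %rax (encoding cost wins).
}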
I did extensive measurements with the llvm-test-suite (with SPEC2006 +
SPEC2017 included); internal services showed similar patterns. Generally
there are a lot of improvements but also a lot of regressions. On
average, however, the allocation quality seems to improve at the cost of
a small code-size regression.
Results for measuring static and dynamic instruction counts:

Dynamic Counts (scaled by execution frequency) / Optimization Remarks:
    Spills+FoldedSpills   -5.6%
    Reloads+FoldedReloads -4.2%
    Copies                -0.1%

Static / LLVM Statistics:
    regalloc.NumSpills    mean -1.6%, geomean -2.8%
    regalloc.NumReloads   mean -1.7%, geomean -3.1%
    size..text            mean +0.4%, geomean +0.4%

Static / LLVM Statistics:
    regalloc.NumSpills    mean -2.2%, geomean -3.1%
    regalloc.NumReloads   mean -2.6%, geomean -3.9%
    size..text            mean +0.6%, geomean +0.6%

Static / LLVM Statistics:
    regalloc.NumSpills    mean -3.0%
    regalloc.NumReloads   mean -3.3%
    size..text            mean +0.3%, geomean +0.3%
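For reference, the mean and geomean columns above are the usual aggregations
over per-benchmark new/old ratios; the exact aggregation script is not part of
this change, so the sketch below (with made-up ratios, not data from the runs
above) only illustrates the arithmetic:

// Mean vs. geomean of per-benchmark changes; the ratios are placeholders.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical new/old ratios of regalloc.NumSpills for three benchmarks.
  std::vector<double> Ratios = {0.95, 1.02, 0.97};
  double Sum = 0.0, LogSum = 0.0;
  for (double R : Ratios) {
    Sum += R;
    LogSum += std::log(R);
  }
  double Mean = Sum / Ratios.size();                 // arithmetic mean of ratios
  double Geomean = std::exp(LogSum / Ratios.size()); // geometric mean of ratios
  std::printf("mean %+.1f%%, geomean %+.1f%%\n", (Mean - 1.0) * 100.0,
              (Geomean - 1.0) * 100.0);
}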
Differential Revision: https://reviews.llvm.org/D133902
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s
declare <4 x float> @foo(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>)
; Due to a bug in X86RegisterInfo::getLargestLegalSuperClass this test case was trying to use XMM16 and spill it without VLX support for the necessary store instruction. We briefly implemented the spill using VEXTRACTF32X4, but the bug in getLargestLegalSuperClass has now been fixed so we no longer use XMM16.
define <4 x float> @bar(ptr %a1p, ptr %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) {
; CHECK-LABEL: bar:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 80
; CHECK-NEXT: vmovaps %xmm1, %xmm13
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4,22,1,17]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0
; CHECK-NEXT: vmovaps {{.*#+}} xmm12 = [4,30,1,22]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm12
; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,29]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8
; CHECK-NEXT: vmovaps {{.*#+}} xmm7 = <5,20,u,u>
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7
; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,7]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4
; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm5
; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm9
; CHECK-NEXT: vunpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm9[0,1],xmm2[1],xmm9[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3]
; CHECK-NEXT: vpermilps {{.*#+}} xmm11 = xmm2[3,3,3,3]
; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
; CHECK-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3]
; CHECK-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm3[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm11[0,1,2],xmm3[1]
; CHECK-NEXT: vaddps %xmm8, %xmm11, %xmm8
; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,1],xmm2[3,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm2
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vaddps %xmm1, %xmm12, %xmm9
; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm3
; CHECK-NEXT: vaddps %xmm0, %xmm10, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm8, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovaps %xmm9, (%rsp)
; CHECK-NEXT: vmovaps %xmm13, %xmm3
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo@PLT
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: addq $72, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%a2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
%a5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 27>
%a6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 3, i32 20, i32 1, i32 17>
%a7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
%a8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 19>
%a9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%a10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ax2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
%ax5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ax6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 18>
%ax7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 1, i32 20, i32 1, i32 17>
%ax8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
%ax9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ax10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ay2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ay5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 28, i32 1, i32 17>
%ay6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 17>
%ay7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 30, i32 1, i32 22>
%ay8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
%ay9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 17>
%ay10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 3, i32 18>
%r1 = fadd <4 x float> %ay10, %ay9
%r2 = fadd <4 x float> %ay8, %ay7
%r3 = fadd <4 x float> %ay6, %ay5
%r4 = fadd <4 x float> %ay2, %ax10
%r5 = fadd <4 x float> %ay9, %ax8
%r6 = fadd <4 x float> %r5, %r3
%r7 = fadd <4 x float> %a9, %r6
%a11 = call <4 x float> @foo(<4 x float> %r7, <4 x float> %a10, <4 x float> %r1, <4 x float> %a4, <4 x float> %a5, <4 x float> %a6, <4 x float> %a7, <4 x float> %a8, <4 x float> %r2, <4 x float> %r4)
%a12 = fadd <4 x float> %a2, %a1
%a13 = fadd <4 x float> %a12, %a11
ret <4 x float> %a13
}