clang-p2996/llvm/test/CodeGen/X86/fma-commute-loop.ll
Matthias Braun 189900eb14 X86: Stop assigning register costs for longer encodings.
This stops reporting CostPerUse 1 for `R8`-`R15` and `XMM8`-`XMM31`.
This was previously done because using these registers requires a REX
prefix, resulting in longer instruction encodings. I found that this
regresses the quality of the register allocation, as the costs impose
an ordering on eviction candidates. I also feel that there is a bit of
an impedance mismatch: the actual costs occur when encoding
instructions that use those registers, but the order of VReg
assignments is not primarily determined by the number of Defs+Uses.
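
As a concrete illustration of the encoding cost (a hand-written sketch
following the standard x86-64 encoding rules, not taken from the patch
or the measurements below): the same ADD grows by one byte when it
touches the extended registers, because a REX prefix is required:

    addl %ecx, %edx    # 01 ca       (2 bytes, no REX prefix)
    addl %r8d, %r9d    # 45 01 c1    (3 bytes, REX.RB prefix)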

I did extensive measurements with the llvm-test-suite with SPEC2006 +
SPEC2017 included; internal services showed similar patterns. Generally
there are a lot of improvements but also a lot of regressions. On
average, however, the allocation quality seems to improve at the cost
of a small code size regression.

Results for measuring static and dynamic instruction counts:

Dynamic Counts (scaled by execution frequency) / Optimization Remarks:
    Spills+FoldedSpills   -5.6%
    Reloads+FoldedReloads -4.2%
    Copies                -0.1%

Static / LLVM Statistics:
    regalloc.NumSpills    mean -1.6%, geomean -2.8%
    regalloc.NumReloads   mean -1.7%, geomean -3.1%
    size..text            mean +0.4%, geomean +0.4%

Static / LLVM Statistics:
    regalloc.NumSpills    mean -2.2%, geomean -3.1%
    regalloc.NumReloads   mean -2.6%, geomean -3.9%
    size..text            mean +0.6%, geomean +0.6%

Static / LLVM Statistics:
    regalloc.NumSpills   mean -3.0%
    regalloc.NumReloads  mean -3.3%
    size..text           mean +0.3%, geomean +0.3%

Differential Revision: https://reviews.llvm.org/D133902
2022-09-30 16:01:33 -07:00


; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s
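; The loop below keeps six <8 x double> accumulators live, each updated via
; llvm.fmuladd from three wide loads and two broadcast scalars per iteration.
; The CHECK lines expect all six accumulators to remain in zmm registers, with
; no spills or reloads inside the loop body.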
define void @eggs(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, ptr %arg13, ptr %arg14) nounwind {
; CHECK-LABEL: eggs:
; CHECK: ## %bb.0: ## %bb
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12
; CHECK-NEXT: leaq (%r12,%r14,8), %r14
; CHECK-NEXT: leaq (%r12,%r15,8), %r15
; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: xorl %r12d, %r12d
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r13
; CHECK-NEXT: addq %rbx, %r13
; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rbx
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vxorpd %xmm5, %xmm5, %xmm5
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_1: ## %bb15
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd (%rax,%r11,8), %zmm6
; CHECK-NEXT: vmovupd (%rax,%r13,8), %zmm7
; CHECK-NEXT: vmovupd (%rax,%rbx,8), %zmm8
; CHECK-NEXT: vbroadcastsd (%r15,%r12,8), %zmm9
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm6 * zmm9) + zmm0
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm7 * zmm9) + zmm1
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm8 * zmm9) + zmm2
; CHECK-NEXT: vbroadcastsd (%r14,%r12,8), %zmm9
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm3 = (zmm9 * zmm6) + zmm3
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm4 = (zmm9 * zmm7) + zmm4
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm5 = (zmm8 * zmm9) + zmm5
; CHECK-NEXT: incq %r12
; CHECK-NEXT: cmpq %r12, %r10
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: ## %bb.2: ## %bb51
; CHECK-NEXT: vmovapd %zmm0, (%rdi)
; CHECK-NEXT: vmovapd %zmm1, (%rsi)
; CHECK-NEXT: vmovapd %zmm2, (%rdx)
; CHECK-NEXT: vmovapd %zmm3, (%rcx)
; CHECK-NEXT: vmovapd %zmm4, (%r8)
; CHECK-NEXT: vmovapd %zmm5, (%r9)
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
; CHECK-NEXT: popq %r13
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb15

bb15:                                             ; preds = %bb15, %bb
  %tmp = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp38, %bb15 ]
  %tmp16 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp39, %bb15 ]
  %tmp17 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp40, %bb15 ]
  %tmp18 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp46, %bb15 ]
  %tmp19 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp47, %bb15 ]
  %tmp20 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp48, %bb15 ]
  %tmp21 = phi i64 [ 0, %bb ], [ %tmp49, %bb15 ]
  %tmp22 = getelementptr inbounds double, ptr %arg14, i64 %arg11
  %tmp24 = load <8 x double>, ptr %tmp22, align 8
  %tmp25 = add i64 %arg10, %arg6
  %tmp26 = getelementptr inbounds double, ptr %arg14, i64 %tmp25
  %tmp28 = load <8 x double>, ptr %tmp26, align 8
  %tmp29 = add i64 %arg10, %arg7
  %tmp30 = getelementptr inbounds double, ptr %arg14, i64 %tmp29
  %tmp32 = load <8 x double>, ptr %tmp30, align 8
  %tmp33 = add i64 %tmp21, %arg8
  %tmp34 = getelementptr inbounds double, ptr %arg13, i64 %tmp33
  %tmp35 = load double, ptr %tmp34, align 8
  %tmp36 = insertelement <8 x double> undef, double %tmp35, i32 0
  %tmp37 = shufflevector <8 x double> %tmp36, <8 x double> undef, <8 x i32> zeroinitializer
  %tmp38 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp24, <8 x double> %tmp37, <8 x double> %tmp)
  %tmp39 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp28, <8 x double> %tmp37, <8 x double> %tmp16)
  %tmp40 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp32, <8 x double> %tmp37, <8 x double> %tmp17)
  %tmp41 = add i64 %tmp21, %arg9
  %tmp42 = getelementptr inbounds double, ptr %arg13, i64 %tmp41
  %tmp43 = load double, ptr %tmp42, align 8
  %tmp44 = insertelement <8 x double> undef, double %tmp43, i32 0
  %tmp45 = shufflevector <8 x double> %tmp44, <8 x double> undef, <8 x i32> zeroinitializer
  %tmp46 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp24, <8 x double> %tmp45, <8 x double> %tmp18)
  %tmp47 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp28, <8 x double> %tmp45, <8 x double> %tmp19)
  %tmp48 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp32, <8 x double> %tmp45, <8 x double> %tmp20)
  %tmp49 = add nuw nsw i64 %tmp21, 1
  %tmp50 = icmp eq i64 %tmp49, %arg12
  br i1 %tmp50, label %bb51, label %bb15

bb51:                                             ; preds = %bb15
  store <8 x double> %tmp38, ptr %arg
  store <8 x double> %tmp39, ptr %arg1
  store <8 x double> %tmp40, ptr %arg2
  store <8 x double> %tmp46, ptr %arg3
  store <8 x double> %tmp47, ptr %arg4
  store <8 x double> %tmp48, ptr %arg5
  ret void
}
declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)