Sometimes loads appear in a loop after the LICM pass has run for the final time. For example, the ExpandMemCmp pass creates loads in a loop, and one of their operands may be an invariant address. This patch extends the pre-regalloc MachineLICM stage so that it can hoist invariant loads out of loops that contain no stores or calls and in which load reordering is permitted.
110 lines
5.2 KiB
LLVM
110 lines
5.2 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s

; @eggs is a single-block loop (%bb15) that keeps six <8 x double> accumulators
; and updates each of them with llvm.fmuladd on every iteration.
;
; Three of the vector operands (%tmp24, %tmp28, %tmp32) are loaded from
; addresses that are computed only from function arguments, so they do not
; change between iterations. The other two loads (%tmp35, %tmp43) are indexed
; by the induction variable %tmp21 and are splatted across all eight lanes.
;
; The CHECK lines pin the expected code shape: the three vmovupd loads (into
; zmm1, zmm2 and zmm8) are emitted before the loop label LBB0_1, and the loop
; body contains only the two vbroadcastsd loads, the six vfmadd231pd
; operations and the counter update/compare/branch.
define void @eggs(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, ptr %arg13, ptr %arg14) nounwind {
; CHECK-LABEL: eggs:
; CHECK:       ## %bb.0: ## %bb
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r14
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r15
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r12
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r11
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
; CHECK-NEXT:    leaq (%rbx,%r10,8), %r10
; CHECK-NEXT:    leaq (%rbx,%r11,8), %r11
; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    xorl %ebx, %ebx
; CHECK-NEXT:    vmovupd (%r14,%r15,8), %zmm1
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r15
; CHECK-NEXT:    addq %r12, %r15
; CHECK-NEXT:    vmovupd (%r14,%r15,8), %zmm2
; CHECK-NEXT:    addq {{[0-9]+}}(%rsp), %r12
; CHECK-NEXT:    vmovupd (%r14,%r12,8), %zmm8
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vxorpd %xmm5, %xmm5, %xmm5
; CHECK-NEXT:    vxorpd %xmm6, %xmm6, %xmm6
; CHECK-NEXT:    vxorpd %xmm7, %xmm7, %xmm7
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  LBB0_1: ## %bb15
; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vbroadcastsd (%r11,%rbx,8), %zmm9
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm9) + zmm0
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm3 = (zmm2 * zmm9) + zmm3
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm4 = (zmm8 * zmm9) + zmm4
; CHECK-NEXT:    vbroadcastsd (%r10,%rbx,8), %zmm9
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm5 = (zmm1 * zmm9) + zmm5
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm6 = (zmm2 * zmm9) + zmm6
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm7 = (zmm8 * zmm9) + zmm7
; CHECK-NEXT:    incq %rbx
; CHECK-NEXT:    cmpq %rbx, %rax
; CHECK-NEXT:    jne LBB0_1
; CHECK-NEXT:  ## %bb.2: ## %bb51
; CHECK-NEXT:    vmovapd %zmm0, (%rdi)
; CHECK-NEXT:    vmovapd %zmm3, (%rsi)
; CHECK-NEXT:    vmovapd %zmm4, (%rdx)
; CHECK-NEXT:    vmovapd %zmm5, (%rcx)
; CHECK-NEXT:    vmovapd %zmm6, (%r8)
; CHECK-NEXT:    vmovapd %zmm7, (%r9)
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb15

bb15:                                             ; preds = %bb15, %bb
  ; Six vector accumulators, all starting at zero, plus the loop counter.
  %tmp = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp38, %bb15 ]
  %tmp16 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp39, %bb15 ]
  %tmp17 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp40, %bb15 ]
  %tmp18 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp46, %bb15 ]
  %tmp19 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp47, %bb15 ]
  %tmp20 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp48, %bb15 ]
  %tmp21 = phi i64 [ 0, %bb ], [ %tmp49, %bb15 ]
  ; Loop-invariant loads: every address below is built from arguments only.
  %tmp22 = getelementptr inbounds double, ptr %arg14, i64 %arg11
  %tmp24 = load <8 x double>, ptr %tmp22, align 8
  %tmp25 = add i64 %arg10, %arg6
  %tmp26 = getelementptr inbounds double, ptr %arg14, i64 %tmp25
  %tmp28 = load <8 x double>, ptr %tmp26, align 8
  %tmp29 = add i64 %arg10, %arg7
  %tmp30 = getelementptr inbounds double, ptr %arg14, i64 %tmp29
  %tmp32 = load <8 x double>, ptr %tmp30, align 8
  ; First per-iteration scalar (index %tmp21 + %arg8), splatted to 8 lanes.
  %tmp33 = add i64 %tmp21, %arg8
  %tmp34 = getelementptr inbounds double, ptr %arg13, i64 %tmp33
  %tmp35 = load double, ptr %tmp34, align 8
  %tmp36 = insertelement <8 x double> undef, double %tmp35, i32 0
  %tmp37 = shufflevector <8 x double> %tmp36, <8 x double> undef, <8 x i32> zeroinitializer
  %tmp38 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp24, <8 x double> %tmp37, <8 x double> %tmp)
  %tmp39 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp28, <8 x double> %tmp37, <8 x double> %tmp16)
  %tmp40 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp32, <8 x double> %tmp37, <8 x double> %tmp17)
  ; Second per-iteration scalar (index %tmp21 + %arg9), splatted to 8 lanes.
  %tmp41 = add i64 %tmp21, %arg9
  %tmp42 = getelementptr inbounds double, ptr %arg13, i64 %tmp41
  %tmp43 = load double, ptr %tmp42, align 8
  %tmp44 = insertelement <8 x double> undef, double %tmp43, i32 0
  %tmp45 = shufflevector <8 x double> %tmp44, <8 x double> undef, <8 x i32> zeroinitializer
  %tmp46 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp24, <8 x double> %tmp45, <8 x double> %tmp18)
  %tmp47 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp28, <8 x double> %tmp45, <8 x double> %tmp19)
  %tmp48 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp32, <8 x double> %tmp45, <8 x double> %tmp20)
  ; Advance the counter and exit once it equals %arg12.
  %tmp49 = add nuw nsw i64 %tmp21, 1
  %tmp50 = icmp eq i64 %tmp49, %arg12
  br i1 %tmp50, label %bb51, label %bb15

bb51:                                             ; preds = %bb15
  ; The only stores in the function: write each accumulator out after the loop.
  store <8 x double> %tmp38, ptr %arg
  store <8 x double> %tmp39, ptr %arg1
  store <8 x double> %tmp40, ptr %arg2
  store <8 x double> %tmp46, ptr %arg3
  store <8 x double> %tmp47, ptr %arg4
  store <8 x double> %tmp48, ptr %arg5
  ret void
}

declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)