; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
; TODO: ldp does not support the r+r addressing mode; we need to stop LSR
; from optimizing the address computation into that pattern.
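;
; An illustrative sketch of the problem (not checked by this test): two
; adjacent loads with an immediate base offset can fuse into a single pair
; load,
;   ldp q0, q1, [x0]          // base + scaled immediate: pairable
; but once LSR rewrites the addresses into base + index-register form,
;   ldr q0, [x0, x8]          // base + register: ldp has no such form
;   ldr q1, [x0, x9]
; the loads can no longer combine, so LSR should prefer immediate offsets
; here.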
define void @convolution(ptr %src0, ptr %src1, i64 %stride_xm, i64 %stride_xp, ptr %dst, i32 %w) {
; CHECK-LABEL: convolution:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    add x9, x1, x3
; CHECK-NEXT:    add x10, x1, x2
; CHECK-NEXT:    add x11, x0, x2
; CHECK-NEXT:    add x12, x0, x3
; CHECK-NEXT:  .LBB0_1: // %do.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x13, x1, x8
; CHECK-NEXT:    add x14, x0, x8
; CHECK-NEXT:    ldr q0, [x11, x8]
; CHECK-NEXT:    ldp q2, q3, [x14]
; CHECK-NEXT:    ldr q1, [x12, x8]
; CHECK-NEXT:    ldp q6, q7, [x13]
; CHECK-NEXT:    subs w5, w5, #1
; CHECK-NEXT:    ldr q4, [x10, x8]
; CHECK-NEXT:    ldr q5, [x9, x8]
; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    fadd v1.4s, v2.4s, v3.4s
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    fadd v2.4s, v4.4s, v5.4s
; CHECK-NEXT:    fadd v3.4s, v6.4s, v7.4s
; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    fadd v1.4s, v2.4s, v3.4s
; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    str q0, [x4], #16
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %do.end
; CHECK-NEXT:    ret
entry:
  br label %do.body

do.body:
  %dst.addr.0 = phi ptr [ %dst, %entry ], [ %incdec.ptr, %do.body ]
  %src1.addr.0 = phi ptr [ %src1, %entry ], [ %incdec.ptr2.i7, %do.body ]
  %src0.addr.0 = phi ptr [ %src0, %entry ], [ %incdec.ptr2.i, %do.body ]
  %w.addr.0 = phi i32 [ %w, %entry ], [ %dec, %do.body ]
  %add.ptr.i = getelementptr inbounds i8, ptr %src0.addr.0, i64 %stride_xm
  %0 = load <4 x float>, ptr %add.ptr.i, align 16
  %add.ptr1.i = getelementptr inbounds i8, ptr %src0.addr.0, i64 %stride_xp
  %1 = load <4 x float>, ptr %add.ptr1.i, align 16
  %incdec.ptr.i = getelementptr inbounds <4 x float>, ptr %src0.addr.0, i64 1
  %2 = load <4 x float>, ptr %src0.addr.0, align 16
  %incdec.ptr2.i = getelementptr inbounds <4 x float>, ptr %src0.addr.0, i64 2
  %3 = load <4 x float>, ptr %incdec.ptr.i, align 16
  %add.i = fadd <4 x float> %0, %1
  %add3.i = fadd <4 x float> %2, %3
  %add4.i = fadd <4 x float> %add.i, %add3.i
  %add.ptr.i4 = getelementptr inbounds i8, ptr %src1.addr.0, i64 %stride_xm
  %4 = load <4 x float>, ptr %add.ptr.i4, align 16
  %add.ptr1.i5 = getelementptr inbounds i8, ptr %src1.addr.0, i64 %stride_xp
  %5 = load <4 x float>, ptr %add.ptr1.i5, align 16
  %incdec.ptr.i6 = getelementptr inbounds <4 x float>, ptr %src1.addr.0, i64 1
  %6 = load <4 x float>, ptr %src1.addr.0, align 16
  %incdec.ptr2.i7 = getelementptr inbounds <4 x float>, ptr %src1.addr.0, i64 2
  %7 = load <4 x float>, ptr %incdec.ptr.i6, align 16
  %add.i8 = fadd <4 x float> %4, %5
  %add3.i9 = fadd <4 x float> %6, %7
  %add4.i10 = fadd <4 x float> %add.i8, %add3.i9
  %add = fadd <4 x float> %add4.i, %add4.i10
  %incdec.ptr = getelementptr inbounds <4 x float>, ptr %dst.addr.0, i64 1
  store <4 x float> %add, ptr %dst.addr.0, align 16
  %dec = add nsw i32 %w.addr.0, -1
  %tobool.not = icmp eq i32 %dec, 0
  br i1 %tobool.not, label %do.end, label %do.body

do.end:
  ret void
}
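
; Note: the CHECK lines above were autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of the file).
; After a codegen change, rerun that script on this file to refresh the
; assertions rather than editing them by hand.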