Refresh the generic scheduling model to use A510 instead of A55. The main benefits are for the little cores and the introduction of SVE scheduling information. The changes were tested on various out-of-order (OoO) cores, and no performance degradation was seen. Differential Revision: https://reviews.llvm.org/D156799
127 lines
6.1 KiB
LLVM
127 lines
6.1 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
|
|
; RUN: llc -mtriple=aarch64 -mattr=+fp16fml -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
|
|
|
|
; This tests that the by-element (lane-indexed) forms of fmlal/fmlal2 only
; accept a low vector register (v0-v15) for the index operand, using inline
; asm clobbers to restrict which registers are available.
|
|
|
|
; Lane-indexed fmlal/fmlal2 while the lowest vector registers are unavailable.
;
; Both panels are loaded as <8 x half>. Lane 0 of the %lhs_panel vector is
; splatted and used as the by-element (indexed) operand of fmlal and fmlal2.
; %a is passed through a "nop" inline asm that clobbers q0-q7, so the two
; loaded vectors, which are live across the asm, cannot sit in v0-v7.
; The expected output keeps the indexed operand in v8 (which is why d8 is
; spilled and reloaded) and the non-indexed operand in v16.
define <4 x float> @test(ptr %lhs_panel, ptr %rhs_panel, <4 x float> %a) {
; CHECK-LABEL: test:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset b8, -16
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: ldr q8, [x0]
; CHECK-NEXT: ldr q16, [x1]
; CHECK-NEXT: lsr x9, x8, #32
; CHECK-NEXT: //APP
; CHECK-NEXT: nop
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: mov w8, w8
; CHECK-NEXT: orr x8, x8, x9, lsl #32
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: fmlal v0.4s, v16.4h, v8.h[0]
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: fmlal2 v1.4s, v16.4h, v8.h[0]
; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
  %0 = load <8 x half>, ptr %lhs_panel, align 2
  %1 = load <8 x half>, ptr %rhs_panel, align 2
  ; Splat lane 0 of the lhs vector. The all-zero mask never selects from the
  ; second operand, so it is only a placeholder.
  %vecinit91 = shufflevector <8 x half> %0, <8 x half> poison, <8 x i32> zeroinitializer
  ; The asm itself does nothing; its clobber list (q0-q7) is what matters.
  %b = call <4 x float> asm sideeffect "nop", "=r,r,~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7}"(<4 x float> %a) nounwind
  ; fmlal2 accumulates on top of the fmlal result; both are then summed.
  %vfmlal_low3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %b, <8 x half> %1, <8 x half> %vecinit91)
  %vfmlal_high3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %vfmlal_low3.i, <8 x half> %1, <8 x half> %vecinit91)
  %z = fadd <4 x float> %vfmlal_low3.i, %vfmlal_high3.i
  ret <4 x float> %z
}
|
|
|
|
; Loop with two accumulators updated by lane-indexed fmlal/fmlal2.
;
; The trip count is %K zero-extended to i64. The exit test sits at the end of
; %for.body, so the body runs before the count is first compared. Each
; iteration loads <8 x half> from %lhs_panel and %rhs_panel at an offset of
; %indvars.iv half elements (the expected output advances both pointers by 2
; bytes), splats lane 0 of the lhs vector, and feeds it as the indexed operand
; to fmlal (first accumulator) and fmlal2 (second accumulator). After the loop
; the two accumulators are stored to %out_tile and %out_tile + 4 floats.
; %flags is not used.
define void @loop(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K, i32 noundef %flags) {
; CHECK-LABEL: loop:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: mov w8, w3
; CHECK-NEXT: .LBB1_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q2, [x1], #2
; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: ldr q3, [x2], #2
; CHECK-NEXT: fmlal v0.4s, v3.4h, v2.h[0]
; CHECK-NEXT: fmlal2 v1.4s, v3.4h, v2.h[0]
; CHECK-NEXT: b.ne .LBB1_1
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
entry:
  %wide.trip.count = zext i32 %K to i64
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ; Write both accumulators back to back: [out_tile, out_tile + 4 floats).
  store <4 x float> %vfmlal_low3.i, ptr %out_tile, align 4
  %add.ptr1399 = getelementptr inbounds float, ptr %out_tile, i64 4
  store <4 x float> %vfmlal_high3.i, ptr %add.ptr1399, align 4
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %acc0.01714 = phi <4 x float> [ zeroinitializer, %entry ], [ %vfmlal_low3.i, %for.body ]
  %acc1.01713 = phi <4 x float> [ zeroinitializer, %entry ], [ %vfmlal_high3.i, %for.body ]
  %add.ptr = getelementptr inbounds half, ptr %lhs_panel, i64 %indvars.iv
  %0 = load <8 x half>, ptr %add.ptr, align 2
  %add.ptr19 = getelementptr inbounds half, ptr %rhs_panel, i64 %indvars.iv
  %1 = load <8 x half>, ptr %add.ptr19, align 2
  ; Splat lane 0 of the lhs vector. The all-zero mask never selects from the
  ; second operand, so it is only a placeholder.
  %vecinit93 = shufflevector <8 x half> %0, <8 x half> poison, <8 x i32> zeroinitializer
  %vfmlal_low3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %acc0.01714, <8 x half> %1, <8 x half> %vecinit93)
  %vfmlal_high3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %acc1.01713, <8 x half> %1, <8 x half> %vecinit93)
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
|
|
|
|
; Variant of the accumulation loop where the splat is loop-invariant.
;
; Lane 0 of the %lhs argument is splatted once in %entry, outside the loop.
; Each iteration loads <8 x half> from %rhs_panel at an offset of %indvars.iv
; half elements and accumulates with fmlal / fmlal2 using the splat as the
; indexed operand. The expected output has no separate splat instruction:
; both instructions index lane 0 of the incoming argument register directly
; (v0.h[0]). After the loop the two accumulators are stored to %out_tile and
; %out_tile + 4 floats. %lhs_panel and %flags are not used.
; NOTE(review): the name suggests this covers sinking the splat into the loop
; so that it folds into the lane-indexed form; confirm against the pass that
; is expected to do it.
define void @sink(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K, i32 noundef %flags, <8 x half> noundef %lhs) {
; CHECK-LABEL: sink:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: movi v2.2d, #0000000000000000
; CHECK-NEXT: mov w8, w3
; CHECK-NEXT: .LBB2_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q3, [x2], #2
; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fmlal v1.4s, v3.4h, v0.h[0]
; CHECK-NEXT: fmlal2 v2.4s, v3.4h, v0.h[0]
; CHECK-NEXT: b.ne .LBB2_1
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret
entry:
  ; Loop-invariant splat of lane 0. The all-zero mask never selects from the
  ; second operand, so it is only a placeholder.
  %vecinit89 = shufflevector <8 x half> %lhs, <8 x half> poison, <8 x i32> zeroinitializer
  %wide.trip.count = zext i32 %K to i64
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ; Write both accumulators back to back: [out_tile, out_tile + 4 floats).
  store <4 x float> %vfmlal_low3.i, ptr %out_tile, align 4
  %add.ptr1395 = getelementptr inbounds float, ptr %out_tile, i64 4
  store <4 x float> %vfmlal_high3.i, ptr %add.ptr1395, align 4
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %acc0.01702 = phi <4 x float> [ zeroinitializer, %entry ], [ %vfmlal_low3.i, %for.body ]
  %acc1.01701 = phi <4 x float> [ zeroinitializer, %entry ], [ %vfmlal_high3.i, %for.body ]
  %add.ptr = getelementptr inbounds half, ptr %rhs_panel, i64 %indvars.iv
  %0 = load <8 x half>, ptr %add.ptr, align 2
  %vfmlal_low3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %acc0.01702, <8 x half> %0, <8 x half> %vecinit89)
  %vfmlal_high3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %acc1.01701, <8 x half> %0, <8 x half> %vecinit89)
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
|
|
|
|
|
|
; AArch64 NEON intrinsics used by the functions in this file. Each takes a
; <4 x float> accumulator and two <8 x half> operands and returns <4 x float>.
declare <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>) #2
declare <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>) #2
|
|
|