Files
clang-p2996/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
Harvin Iriawan db158c7c83 [AArch64] Update generic sched model to A510
Refresh of the generic scheduling model to use A510 instead of A55.
  The main benefits are to the little core, along with the introduction of SVE scheduling information.
  Changes tested on various OoO cores, no performance degradation is seen.

  Differential Revision: https://reviews.llvm.org/D156799
2023-08-21 12:25:15 +01:00

127 lines
6.1 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=aarch64 -mattr=+fp16fml -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
; This tests that the fmlal/fmlal2 instructions only accept lo registers for
; the index operand, using inline asm to force the available registers.
define <4 x float> @test(ptr %lhs_panel, ptr %rhs_panel, <4 x float> %a) {
; CHECK-LABEL: test:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset b8, -16
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: ldr q8, [x0]
; CHECK-NEXT: ldr q16, [x1]
; CHECK-NEXT: lsr x9, x8, #32
; CHECK-NEXT: //APP
; CHECK-NEXT: nop
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: mov w8, w8
; CHECK-NEXT: orr x8, x8, x9, lsl #32
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: fmlal v0.4s, v16.4h, v8.h[0]
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: fmlal2 v1.4s, v16.4h, v8.h[0]
; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
; Load the two half-precision input vectors.
%0 = load <8 x half>, ptr %lhs_panel, align 2
%1 = load <8 x half>, ptr %rhs_panel, align 2
; Splat lane 0 of the lhs vector; this feeds the by-element (indexed)
; operand of the fmlal/fmlal2 intrinsics below.
%vecinit91 = shufflevector <8 x half> %0, <8 x half> undef, <8 x i32> zeroinitializer
; Inline asm clobbers q0-q7 to constrain register allocation, so the
; indexed operand must be assigned from the surviving registers. Per the
; file header, the indexed fmlal forms only accept "lo" registers
; (v0-v15) for that operand; the expected codegen uses v8.
%b = call <4 x float> asm sideeffect "nop", "=r,r,~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7}"(<4 x float> %a) nounwind
%vfmlal_low3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %b, <8 x half> %1, <8 x half> %vecinit91)
%vfmlal_high3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %vfmlal_low3.i, <8 x half> %1, <8 x half> %vecinit91)
; Combine low/high results so both intrinsic values are live at the return.
%z = fadd <4 x float> %vfmlal_low3.i, %vfmlal_high3.i
ret <4 x float> %z
}
define void @loop(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K, i32 noundef %flags) {
; CHECK-LABEL: loop:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: mov w8, w3
; CHECK-NEXT: .LBB1_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q2, [x1], #2
; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: ldr q3, [x2], #2
; CHECK-NEXT: fmlal v0.4s, v3.4h, v2.h[0]
; CHECK-NEXT: fmlal2 v1.4s, v3.4h, v2.h[0]
; CHECK-NEXT: b.ne .LBB1_1
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
entry:
; Loop runs K times (zero-extended to i64 for the induction variable).
; NOTE(review): no guard for K == 0, so the body presumably assumes K >= 1
; for well-defined trip-count behavior in the generated loop.
%wide.trip.count = zext i32 %K to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body
; Store the two fmlal accumulators to consecutive 16-byte slots of out_tile.
store <4 x float> %vfmlal_low3.i, ptr %out_tile, align 4
%add.ptr1399 = getelementptr inbounds float, ptr %out_tile, i64 4
store <4 x float> %vfmlal_high3.i, ptr %add.ptr1399, align 4
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; Two zero-initialized accumulators, one per fmlal variant (low/high halves).
%acc0.01714 = phi <4 x float> [ zeroinitializer, %entry ], [ %vfmlal_low3.i, %for.body ]
%acc1.01713 = phi <4 x float> [ zeroinitializer, %entry ], [ %vfmlal_high3.i, %for.body ]
; Both panel pointers advance by one half element per iteration.
%add.ptr = getelementptr inbounds half, ptr %lhs_panel, i64 %indvars.iv
%0 = load <8 x half>, ptr %add.ptr, align 2
%add.ptr19 = getelementptr inbounds half, ptr %rhs_panel, i64 %indvars.iv
%1 = load <8 x half>, ptr %add.ptr19, align 2
; Splat lane 0 of the lhs load; expected codegen keeps it as a by-element
; operand (v2.h[0]) instead of materializing the splat.
%vecinit93 = shufflevector <8 x half> %0, <8 x half> undef, <8 x i32> zeroinitializer
%vfmlal_low3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %acc0.01714, <8 x half> %1, <8 x half> %vecinit93)
%vfmlal_high3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %acc1.01713, <8 x half> %1, <8 x half> %vecinit93)
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
define void @sink(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K, i32 noundef %flags, <8 x half> noundef %lhs) {
; CHECK-LABEL: sink:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: movi v2.2d, #0000000000000000
; CHECK-NEXT: mov w8, w3
; CHECK-NEXT: .LBB2_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q3, [x2], #2
; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fmlal v1.4s, v3.4h, v0.h[0]
; CHECK-NEXT: fmlal2 v2.4s, v3.4h, v0.h[0]
; CHECK-NEXT: b.ne .LBB2_1
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret
entry:
; The splat of %lhs is loop-invariant (computed here in the entry block).
; NOTE(review): given the function name, this presumably verifies that the
; splatted operand is folded into the in-loop by-element fmlal (v0.h[0])
; rather than being rematerialized per iteration — confirm against the
; expected assembly above.
%vecinit89 = shufflevector <8 x half> %lhs, <8 x half> undef, <8 x i32> zeroinitializer
%wide.trip.count = zext i32 %K to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body
; Store the two accumulators to consecutive 16-byte slots of out_tile.
store <4 x float> %vfmlal_low3.i, ptr %out_tile, align 4
%add.ptr1395 = getelementptr inbounds float, ptr %out_tile, i64 4
store <4 x float> %vfmlal_high3.i, ptr %add.ptr1395, align 4
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; Zero-initialized accumulators for the fmlal (low) and fmlal2 (high) chains.
%acc0.01702 = phi <4 x float> [ zeroinitializer, %entry ], [ %vfmlal_low3.i, %for.body ]
%acc1.01701 = phi <4 x float> [ zeroinitializer, %entry ], [ %vfmlal_high3.i, %for.body ]
; Only the rhs panel is re-loaded each iteration; the lhs splat is reused.
%add.ptr = getelementptr inbounds half, ptr %rhs_panel, i64 %indvars.iv
%0 = load <8 x half>, ptr %add.ptr, align 2
%vfmlal_low3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %acc0.01702, <8 x half> %0, <8 x half> %vecinit89)
%vfmlal_high3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %acc1.01701, <8 x half> %0, <8 x half> %vecinit89)
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; AArch64 NEON widening half-to-single fused multiply-add intrinsics:
; fmlal consumes the low four halves of the operands, fmlal2 the high four.
declare <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>) #2
declare <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>) #2