Refresh of the generic scheduling model to use A510 instead of A55. The main benefits are for the little core and the introduction of SVE scheduling information. The changes were tested on various OoO cores; no performance degradation was seen. Differential Revision: https://reviews.llvm.org/D156799
54 lines
2.0 KiB
LLVM
54 lines
2.0 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc %s --mtriple aarch64 -verify-machineinstrs -o - | FileCheck %s
|
|
|
|
; Codegen test function: loads two <8 x i16> vectors from %coef_block and two
; from %dct_table, multiplies them pairwise, then stores the sum and the
; difference of the two products through the pointers held in slots 6 and 7 of
; %output_buf, each offset by %output_col bytes.
;
; The autogenerated assertions that follow pin the exact instruction order and
; register assignment of the emitted AArch64 code; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
;
; NOTE(review): attribute group #0 is referenced here but its definition is
; not visible in this portion of the file -- confirm it is intentional.
define dso_local void @jsimd_idct_ifast_neon_intrinsic(ptr nocapture readonly %dct_table, ptr nocapture readonly %coef_block, ptr nocapture readonly %output_buf, i32 %output_col) local_unnamed_addr #0 {
|
|
; CHECK-LABEL: jsimd_idct_ifast_neon_intrinsic:
|
|
; CHECK: // %bb.0: // %entry
|
|
; CHECK-NEXT: ldr q0, [x1, #32]
|
|
; CHECK-NEXT: ldr q1, [x1, #96]
|
|
; CHECK-NEXT: mov w9, w3
|
|
; CHECK-NEXT: ldr q2, [x0, #32]
|
|
; CHECK-NEXT: ldr q3, [x0, #96]
|
|
; CHECK-NEXT: ldr x8, [x2, #48]
|
|
; CHECK-NEXT: mul v0.8h, v2.8h, v0.8h
|
|
; CHECK-NEXT: mul v1.8h, v3.8h, v1.8h
|
|
; CHECK-NEXT: add v2.8h, v0.8h, v1.8h
|
|
; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h
|
|
; CHECK-NEXT: str q2, [x8, x9]
|
|
; CHECK-NEXT: ldr x8, [x2, #56]
|
|
; CHECK-NEXT: str q0, [x8, x9]
|
|
; CHECK-NEXT: ret
|
|
entry:
|
|
; First coefficient vector: element index 16 of i16, i.e. byte offset 32
; from %coef_block.
%add.ptr5 = getelementptr inbounds i16, ptr %coef_block, i64 16
|
|
%0 = load <8 x i16>, ptr %add.ptr5, align 16
|
|
|
|
; Second coefficient vector: element index 48 of i16, i.e. byte offset 96
; from %coef_block.
%add.ptr17 = getelementptr inbounds i16, ptr %coef_block, i64 48
|
|
%1 = load <8 x i16>, ptr %add.ptr17, align 16
|
|
|
|
; First table vector: this GEP is i8-typed, so 32 is already a byte offset
; from %dct_table.
%add.ptr29 = getelementptr inbounds i8, ptr %dct_table, i64 32
|
|
%2 = load <8 x i16>, ptr %add.ptr29, align 16
|
|
|
|
; Second table vector at byte offset 96 from %dct_table.
%add.ptr41 = getelementptr inbounds i8, ptr %dct_table, i64 96
|
|
%3 = load <8 x i16>, ptr %add.ptr41, align 16
|
|
|
|
; Lane-wise products of each table vector with the coefficient vector loaded
; from the same byte offset (32 with 32, 96 with 96).
%mul.i966 = mul <8 x i16> %2, %0
|
|
%mul.i964 = mul <8 x i16> %3, %1
|
|
|
|
; Sum and difference of the two products; both values are stored below.
%add.i961 = add <8 x i16> %mul.i966, %mul.i964
|
|
%sub.i960 = sub <8 x i16> %mul.i966, %mul.i964
|
|
|
|
; Zero-extend the 32-bit column argument so it can be used as a 64-bit byte
; offset in the i8-typed GEPs below.
%idx.ext = zext i32 %output_col to i64
|
|
|
|
; Slot 6 of the %output_buf pointer array: load the pointer, advance it by
; %idx.ext bytes, and store the sum there. The store is only 8-byte aligned.
%arrayidx404 = getelementptr inbounds ptr, ptr %output_buf, i64 6
|
|
%4 = load ptr, ptr %arrayidx404, align 8
|
|
%add.ptr406 = getelementptr inbounds i8, ptr %4, i64 %idx.ext
|
|
store <8 x i16> %add.i961, ptr %add.ptr406, align 8
|
|
|
|
; Slot 7 of the %output_buf pointer array: same addressing as above, storing
; the difference. This pointer is loaded after the store above.
%arrayidx408 = getelementptr inbounds ptr, ptr %output_buf, i64 7
|
|
%5 = load ptr, ptr %arrayidx408, align 8
|
|
%add.ptr410 = getelementptr inbounds i8, ptr %5, i64 %idx.ext
|
|
store <8 x i16> %sub.i960, ptr %add.ptr410, align 8
|
|
|
|
ret void
|
|
}
|