We already do this for most cases, with the exception of instructions that get expanded to function calls (e.g. for lowering operations on fp128 values), in which case we temporarily allocate a lazy-save buffer. The code that is generated in this case is, however, incorrect, as it seems to pass an incorrect address for the TPIDR2 object to the ZA restore function. By always allocating the lazy-save buffer once, we avoid this issue entirely. The cost is that we also allocate such a buffer when it is not needed; we could fix that in a follow-up patch that removes the lazy-save buffer when it isn't used. Reviewed By: paulwalker-arm Differential Revision: https://reviews.llvm.org/D138208
69 lines
2.4 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
declare void @private_za_callee()
|
|
|
|
; Ensure that we don't use tail call optimization when a lazy-save is required.
|
|
define void @disable_tailcallopt() "aarch64_pstate_za_shared" nounwind {
|
|
; CHECK-LABEL: disable_tailcallopt:
|
|
; CHECK: // %bb.0:
|
|
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
|
|
; CHECK-NEXT: mov x29, sp
|
|
; CHECK-NEXT: sub sp, sp, #16
|
|
; CHECK-NEXT: rdsvl x8, #1
|
|
; CHECK-NEXT: mov x9, sp
|
|
; CHECK-NEXT: mul x8, x8, x8
|
|
; CHECK-NEXT: sub x9, x9, x8
|
|
; CHECK-NEXT: mov sp, x9
|
|
; CHECK-NEXT: sub x10, x29, #16
|
|
; CHECK-NEXT: stur x9, [x29, #-16]
|
|
; CHECK-NEXT: sturh w8, [x29, #-8]
|
|
; CHECK-NEXT: msr TPIDR2_EL0, x10
|
|
; CHECK-NEXT: bl private_za_callee
|
|
; CHECK-NEXT: smstart za
|
|
; CHECK-NEXT: sub x0, x29, #16
|
|
; CHECK-NEXT: mrs x8, TPIDR2_EL0
|
|
; CHECK-NEXT: cbnz x8, .LBB0_2
|
|
; CHECK-NEXT: // %bb.1:
|
|
; CHECK-NEXT: bl __arm_tpidr2_restore
|
|
; CHECK-NEXT: .LBB0_2:
|
|
; CHECK-NEXT: msr TPIDR2_EL0, xzr
|
|
; CHECK-NEXT: mov sp, x29
|
|
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
|
|
; CHECK-NEXT: ret
|
|
tail call void @private_za_callee()
|
|
ret void
|
|
}
; Ensure we set up and restore the lazy save correctly for instructions which are lowered to lib calls
|
|
define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwind {
|
|
; CHECK-LABEL: f128_call_za:
|
|
; CHECK: // %bb.0:
|
|
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
|
|
; CHECK-NEXT: mov x29, sp
|
|
; CHECK-NEXT: sub sp, sp, #16
|
|
; CHECK-NEXT: rdsvl x8, #1
|
|
; CHECK-NEXT: mov x9, sp
|
|
; CHECK-NEXT: mul x8, x8, x8
|
|
; CHECK-NEXT: sub x9, x9, x8
|
|
; CHECK-NEXT: mov sp, x9
|
|
; CHECK-NEXT: sub x10, x29, #16
|
|
; CHECK-NEXT: stur x9, [x29, #-16]
|
|
; CHECK-NEXT: sturh w8, [x29, #-8]
|
|
; CHECK-NEXT: msr TPIDR2_EL0, x10
|
|
; CHECK-NEXT: bl __addtf3
|
|
; CHECK-NEXT: smstart za
|
|
; CHECK-NEXT: sub x0, x29, #16
|
|
; CHECK-NEXT: mrs x8, TPIDR2_EL0
|
|
; CHECK-NEXT: cbnz x8, .LBB1_2
|
|
; CHECK-NEXT: // %bb.1:
|
|
; CHECK-NEXT: bl __arm_tpidr2_restore
|
|
; CHECK-NEXT: .LBB1_2:
|
|
; CHECK-NEXT: msr TPIDR2_EL0, xzr
|
|
; CHECK-NEXT: mov sp, x29
|
|
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
|
|
; CHECK-NEXT: ret
|
|
%res = fadd fp128 %a, %b
|
|
ret fp128 %res
|
|
}
|