This is a tiny change that can save up to 16 bytes of stack allocation, which is more beneficial on RV32 than RV64. `cm.push` allocates stack in multiples of 16 bytes, but only uses a subset of those bytes for pushing callee-saved registers. Up to 12 (RV32) or 8 (RV64) bytes are left unused, depending on how many registers are pushed. Before this change, we told LLVM that the entire allocation was used, by creating a fixed stack object which covered the whole allocation. This change instead gives the fixed stack object an accurate extent, so that it only covers the registers that have been pushed. This allows the PrologEpilogInserter to use any unused bytes for spills. Potentially this saves an extra move of the stack pointer after the push, because the push can allocate up to 48 more bytes than it needs for registers. We cannot make the same change for save/restore, because the restore routines restore in batches of `stackalign/(xlen/8)` registers, and we don't want to clobber the saved values of registers that we didn't tell the compiler we were saving/restoring - for instance `__riscv_restore_0` is used by the compiler when it only wants to save `ra`, but will end up restoring `ra` and `s0`.
54 lines
2.0 KiB
LLVM
54 lines
2.0 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
|
|
; RUN: llc -mtriple=riscv32 -mattr=+zcmp,+e -target-abi ilp32e -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32
|
|
; @func: reduced test case for frame layout when the prologue uses cm.push.
;
; In the expected output below, cm.push adjusts sp by 16 bytes but saves only
; ra, s0 and s1, which occupy CFA-12 through CFA-1, so the word at CFA-16 is
; not covered by any saved register. Two 4-byte values (a2 and a4) are spilled
; before the loop and reloaded after the call to __mulsi3. The frame is grown
; by only 4 further bytes (CFA offset 20): the a4 slot at 4(sp) is CFA-16,
; inside the push allocation, and the a2 slot at 0(sp) is the newly allocated
; word at CFA-20.
;
; The assertion lines inside the function are autogenerated; regenerate them
; with utils/update_llc_test_checks.py rather than editing them by hand.
;
; NOTE(review): attribute group #0 is referenced on the define line, but no
; definition of it is visible in this excerpt - confirm whether that is
; intentional.
define ptr @func(ptr %s, i32 %_c, ptr %incdec.ptr, i1 %0, i8 %conv14) #0 {
|
|
; RV32-LABEL: func:
|
|
; RV32: # %bb.0: # %entry
|
|
; RV32-NEXT: cm.push {ra, s0-s1}, -16
|
|
; RV32-NEXT: .cfi_def_cfa_offset 16
|
|
; RV32-NEXT: .cfi_offset ra, -12
|
|
; RV32-NEXT: .cfi_offset s0, -8
|
|
; RV32-NEXT: .cfi_offset s1, -4
|
|
; RV32-NEXT: addi sp, sp, -4
|
|
; RV32-NEXT: .cfi_def_cfa_offset 20
|
|
; RV32-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
|
|
; RV32-NEXT: sw a2, 0(sp) # 4-byte Folded Spill
|
|
; RV32-NEXT: mv a2, a1
|
|
; RV32-NEXT: mv s1, a0
|
|
; RV32-NEXT: li a0, 1
|
|
; RV32-NEXT: andi a3, a3, 1
|
|
; RV32-NEXT: .LBB0_1: # %while.body
|
|
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; RV32-NEXT: mv s0, a0
|
|
; RV32-NEXT: li a0, 0
|
|
; RV32-NEXT: bnez a3, .LBB0_1
|
|
; RV32-NEXT: # %bb.2: # %while.end
|
|
; RV32-NEXT: lui a0, 4112
|
|
; RV32-NEXT: addi a1, a0, 257
|
|
; RV32-NEXT: mv a0, a2
|
|
; RV32-NEXT: call __mulsi3
|
|
; RV32-NEXT: sw a0, 0(zero)
|
|
; RV32-NEXT: andi s0, s0, 1
|
|
; RV32-NEXT: lw a0, 0(sp) # 4-byte Folded Reload
|
|
; RV32-NEXT: add s0, s0, a0
|
|
; RV32-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
|
|
; RV32-NEXT: sb a0, 0(s0)
|
|
; RV32-NEXT: mv a0, s1
|
|
; RV32-NEXT: addi sp, sp, 4
|
|
; RV32-NEXT: .cfi_def_cfa_offset 16
|
|
; RV32-NEXT: cm.popret {ra, s0-s1}, 16
|
|
entry:
|
|
br label %while.body
|
|
|
|
while.body: ; preds = %while.body, %entry
|
|
; The phi is 1 when the block is entered from %entry and 0 on every later
; iteration.
%n.addr.042 = phi i32 [ 1, %entry ], [ 0, %while.body ]
|
|
; %0 is a function argument, so this condition does not change between
; iterations.
br i1 %0, label %while.body, label %while.end
|
|
|
|
while.end: ; preds = %while.body
|
|
; 16843009 is 0x01010101. The expected output builds it with lui/addi and
; performs the multiply through a call to __mulsi3.
%or5 = mul i32 %_c, 16843009
|
|
; The product is stored to address 0 (sw a0, 0(zero) in the expected output).
store i32 %or5, ptr null, align 4
|
|
; %1 is the low bit of the value the phi held on the final loop iteration.
%1 = and i32 %n.addr.042, 1
|
|
%scevgep = getelementptr i8, ptr %incdec.ptr, i32 %1
|
|
; Byte store of %conv14 to %incdec.ptr + %1.
store i8 %conv14, ptr %scevgep, align 1
|
|
ret ptr %s
|
|
}
|