The patch attempts to optimize a sequence of SIMD loads from the same
base pointer:
%0 = getelementptr float, float* %base, i32 4
%1 = bitcast float* %0 to <4 x float>*
%2 = load <4 x float>, <4 x float>* %1
...
%n1 = getelementptr float, float* %base, i32 N
%n2 = bitcast float* %n1 to <4 x float>*
%n3 = load <4 x float>, <4 x float>* %n2
For AArch64 the compiler generates a sequence of LDR Qt, [Xn, #imm]
instructions, folding each offset into the load. However, 32-bit NEON
VLD1/VST1 have no [Rn, #imm] addressing mode, so the address has to be
computed before every load/store instruction:
add r2, r0, #32
add r0, r0, #16
vld1.32 {d18, d19}, [r2]
vld1.32 {d22, d23}, [r0]
This can be improved by computing the address only for the first load
and then using the post-indexed form of VLD1/VST1, which advances the
base register by the number of bytes transferred, to reach the rest:
add r0, r0, #16
vld1.32 {d18, d19}, [r0]!
vld1.32 {d22, d23}, [r0]
In order to do that, the patch adds more patterns to DAGCombine:
- (load (add ptr inc1)) and (add ptr inc2) are now folded if inc1
and inc2 are constants.
- (or ptr inc) is now recognized as a pointer increment if ptr is
sufficiently aligned.
In addition to that, we now search for all possible base updates and
then pick the best one.
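The alignment condition in the second fold comes down to a simple
bit-level fact: if every bit set in the increment is a bit that is known
to be zero in the pointer, the OR cannot produce a carry and computes
the same address as an ADD, so it can safely be treated as a base
update. The snippet below is a stand-alone illustration of that
equivalence; it is not code from the patch, just a sketch of the
reasoning, with made-up values:

#include <cassert>
#include <cstdint>

int main() {
  // A 64-byte aligned base has its low 6 bits clear, so OR-ing in any
  // offset smaller than 64 cannot carry into the upper bits and yields
  // the same result as adding it.
  const uintptr_t base = 0x1000;            // 64-byte aligned address
  for (uintptr_t inc = 0; inc < 64; ++inc)
    assert((base | inc) == (base + inc));   // (or ptr inc) == (add ptr inc)
  return 0;
}

The test2 function below exercises exactly this case: its alloca is
64-byte aligned, so the +32 offset from the stack pointer copy is
materialized with an orr rather than an add, and the combine still has
to recognize it as a pointer increment.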
Differential Revision: https://reviews.llvm.org/D108988
; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s

; rdar://12713765
; When realign-stack is set to false, make sure we are not creating stack
; objects that are assumed to be 64-byte aligned.

define void @test1(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp "no-realign-stack" {
; CHECK-LABEL: test1:
; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
; CHECK: mov r[[NOTALIGNED:[0-9]+]], sp
; CHECK: add r[[NOTALIGNED]], r[[NOTALIGNED]], #32
; CHECK: add r[[PTR]], r[[PTR]], #32
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
entry:
  %retval = alloca <16 x float>, align 64
  %a1 = bitcast <16 x float>* %retval to float*
  %a2 = getelementptr inbounds float, float* %a1, i64 8
  %a3 = bitcast float* %a2 to <4 x float>*

  %b1 = bitcast <16 x float>* %agg.result to float*
  %b2 = getelementptr inbounds float, float* %b1, i64 8
  %b3 = bitcast float* %b2 to <4 x float>*

  %0 = load <4 x float>, <4 x float>* %a3, align 16
  %1 = load <4 x float>, <4 x float>* %b3, align 16
  store <4 x float> %0, <4 x float>* %b3, align 16
  store <4 x float> %1, <4 x float>* %a3, align 16
  ret void
}

define void @test2(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp {
; CHECK-LABEL: test2:
; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
; CHECK: mov r[[ALIGNED:[0-9]+]], sp
; CHECK: orr r[[ALIGNED]], r[[ALIGNED]], #32
; CHECK: add r[[PTR]], r[[PTR]], #32
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
entry:
  %retval = alloca <16 x float>, align 64
  %a1 = bitcast <16 x float>* %retval to float*
  %a2 = getelementptr inbounds float, float* %a1, i64 8
  %a3 = bitcast float* %a2 to <4 x float>*

  %b1 = bitcast <16 x float>* %agg.result to float*
  %b2 = getelementptr inbounds float, float* %b1, i64 8
  %b3 = bitcast float* %b2 to <4 x float>*

  %0 = load <4 x float>, <4 x float>* %a3, align 16
  %1 = load <4 x float>, <4 x float>* %b3, align 16
  store <4 x float> %0, <4 x float>* %b3, align 16
  store <4 x float> %1, <4 x float>* %a3, align 16
  ret void
}