Files
clang-p2996/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll
Sam Parker 1c3ca61294 [ARM][ParallelDSP] Change smlad insertion order
Instead of inserting everything after the 'root' of the reduction,
insert all instructions as close to their operands as possible. This
can help reduce register pressure.

Differential Revision: https://reviews.llvm.org/D67392

llvm-svn: 374981
2019-10-16 09:37:03 +00:00

77 lines
3.2 KiB
LLVM

; REQUIRES: asserts
; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S -stats 2>&1 | FileCheck %s
;
; A more complicated chain: 4 mul operations, so we expect 2 smlad calls.
;
; CHECK: %mac1{{\.}}054 = phi i32 [ [[V17:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
; CHECK: [[V10:%[0-9]+]] = bitcast i16* %arrayidx to i32*
; CHECK: [[V11:%[0-9]+]] = load i32, i32* [[V10]], align 2
; CHECK: [[V15:%[0-9]+]] = bitcast i16* %arrayidx4 to i32*
; CHECK: [[V16:%[0-9]+]] = load i32, i32* [[V15]], align 2
; CHECK: [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32*
; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2
; CHECK: [[ACC:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054)
; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*
; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2
; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[ACC]])
;
; And we don't want to see a 3rd smlad:
; CHECK-NOT: call i32 @llvm.arm.smlad
;
; CHECK: 2 arm-parallel-dsp - Number of smlad instructions generated
;
define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
entry:
%cmp52 = icmp sgt i32 %arg, 0
br i1 %cmp52, label %for.body.preheader, label %for.cond.cleanup
for.cond.cleanup:
%mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add28, %for.body ]
ret i32 %mac1.0.lcssa
for.body.preheader:
br label %for.body
for.body:
%mac1.054 = phi i32 [ %add28, %for.body ], [ 0, %for.body.preheader ]
%i.053 = phi i32 [ %add29, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.053
%0 = load i16, i16* %arrayidx, align 2
%add1 = or i32 %i.053, 1
%arrayidx2 = getelementptr inbounds i16, i16* %arg3, i32 %add1
%1 = load i16, i16* %arrayidx2, align 2
%add3 = or i32 %i.053, 2
%arrayidx4 = getelementptr inbounds i16, i16* %arg3, i32 %add3
%2 = load i16, i16* %arrayidx4, align 2
%add5 = or i32 %i.053, 3
%arrayidx6 = getelementptr inbounds i16, i16* %arg3, i32 %add5
%3 = load i16, i16* %arrayidx6, align 2
%arrayidx8 = getelementptr inbounds i16, i16* %arg2, i32 %i.053
%4 = load i16, i16* %arrayidx8, align 2
%conv = sext i16 %4 to i32
%conv9 = sext i16 %0 to i32
%mul = mul nsw i32 %conv, %conv9
%arrayidx11 = getelementptr inbounds i16, i16* %arg2, i32 %add1
%5 = load i16, i16* %arrayidx11, align 2
%conv12 = sext i16 %5 to i32
%conv13 = sext i16 %1 to i32
%mul14 = mul nsw i32 %conv12, %conv13
%arrayidx17 = getelementptr inbounds i16, i16* %arg2, i32 %add3
%6 = load i16, i16* %arrayidx17, align 2
%conv18 = sext i16 %6 to i32
%conv19 = sext i16 %2 to i32
%mul20 = mul nsw i32 %conv18, %conv19
%arrayidx23 = getelementptr inbounds i16, i16* %arg2, i32 %add5
%7 = load i16, i16* %arrayidx23, align 2
%conv24 = sext i16 %7 to i32
%conv25 = sext i16 %3 to i32
%mul26 = mul nsw i32 %conv24, %conv25
%add15 = add i32 %mul, %mac1.054
%add21 = add i32 %add15, %mul14
%add27 = add i32 %add21, %mul20
%add28 = add i32 %add27, %mul26
%add29 = add nuw nsw i32 %i.053, 4
%cmp = icmp slt i32 %add29, %arg
br i1 %cmp, label %for.body, label %for.cond.cleanup
}