Files
clang-p2996/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll
Sam Parker a761ba0f2d [ARM][ParallelDSP] Change search for muls
rL369567 reverted a couple of recent changes made to ARMParallelDSP
because of a miscompilation error: PR43073.

The issue stemmed from an underlying bug that was caused by adding
muls into a reduction before it was proved that they could be executed
in parallel with another mul.

Most of the changes here are from the previously reverted commits.
The additional changes have been made area:
1) The Search function now doesn't insert any muls into the Reduction
   object. That now happens once the search has successfully finished.
2) For any muls added into the reduction but that weren't paired, we
   accumulate their values as an input into the smlad.

Differential Revision: https://reviews.llvm.org/D66660

llvm-svn: 370171
2019-08-28 08:51:13 +00:00

77 lines
3.2 KiB
LLVM

; REQUIRES: asserts
; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S -stats 2>&1 | FileCheck %s
;
; A more complicated chain: 4 mul operations, so we expect 2 smlad calls.
;
; CHECK: %mac1{{\.}}054 = phi i32 [ [[V17:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
; CHECK: [[V10:%[0-9]+]] = bitcast i16* %arrayidx to i32*
; CHECK: [[V11:%[0-9]+]] = load i32, i32* [[V10]], align 2
; CHECK: [[V15:%[0-9]+]] = bitcast i16* %arrayidx4 to i32*
; CHECK: [[V16:%[0-9]+]] = load i32, i32* [[V15]], align 2
; CHECK: [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32*
; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2
; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*
; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2
; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 %mac1{{\.}}054)
; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 [[V12]])
;
; And we don't want to see a 3rd smlad:
; CHECK-NOT: call i32 @llvm.arm.smlad
;
; CHECK: 2 arm-parallel-dsp - Number of smlad instructions generated
;
define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
entry:
%cmp52 = icmp sgt i32 %arg, 0
br i1 %cmp52, label %for.body.preheader, label %for.cond.cleanup
for.cond.cleanup:
%mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add28, %for.body ]
ret i32 %mac1.0.lcssa
for.body.preheader:
br label %for.body
for.body:
%mac1.054 = phi i32 [ %add28, %for.body ], [ 0, %for.body.preheader ]
%i.053 = phi i32 [ %add29, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.053
%0 = load i16, i16* %arrayidx, align 2
%add1 = or i32 %i.053, 1
%arrayidx2 = getelementptr inbounds i16, i16* %arg3, i32 %add1
%1 = load i16, i16* %arrayidx2, align 2
%add3 = or i32 %i.053, 2
%arrayidx4 = getelementptr inbounds i16, i16* %arg3, i32 %add3
%2 = load i16, i16* %arrayidx4, align 2
%add5 = or i32 %i.053, 3
%arrayidx6 = getelementptr inbounds i16, i16* %arg3, i32 %add5
%3 = load i16, i16* %arrayidx6, align 2
%arrayidx8 = getelementptr inbounds i16, i16* %arg2, i32 %i.053
%4 = load i16, i16* %arrayidx8, align 2
%conv = sext i16 %4 to i32
%conv9 = sext i16 %0 to i32
%mul = mul nsw i32 %conv, %conv9
%arrayidx11 = getelementptr inbounds i16, i16* %arg2, i32 %add1
%5 = load i16, i16* %arrayidx11, align 2
%conv12 = sext i16 %5 to i32
%conv13 = sext i16 %1 to i32
%mul14 = mul nsw i32 %conv12, %conv13
%arrayidx17 = getelementptr inbounds i16, i16* %arg2, i32 %add3
%6 = load i16, i16* %arrayidx17, align 2
%conv18 = sext i16 %6 to i32
%conv19 = sext i16 %2 to i32
%mul20 = mul nsw i32 %conv18, %conv19
%arrayidx23 = getelementptr inbounds i16, i16* %arg2, i32 %add5
%7 = load i16, i16* %arrayidx23, align 2
%conv24 = sext i16 %7 to i32
%conv25 = sext i16 %3 to i32
%mul26 = mul nsw i32 %conv24, %conv25
%add15 = add i32 %mul, %mac1.054
%add21 = add i32 %add15, %mul14
%add27 = add i32 %add21, %mul20
%add28 = add i32 %add27, %mul26
%add29 = add nuw nsw i32 %i.053, 4
%cmp = icmp slt i32 %add29, %arg
br i1 %cmp, label %for.body, label %for.cond.cleanup
}