Files
clang-p2996/llvm/test/Transforms/LoopVectorize/ARM/mve-reduce.ll
David Green b535aa405a [ARM] Use reduction intrinsics for larger than legal reductions
The codegen for splitting an llvm.vector.reduction intrinsic into parts
will be better than the codegen for the generic reductions. This will
only directly affect cases where vectorization factors are specified by
the user.

Also added tests to make sure the codegen for larger reductions is OK.

Differential Revision: https://reviews.llvm.org/D72257
2020-01-24 17:07:24 +00:00

63 lines
2.4 KiB
LLVM

; Lit driver line: feed this file through the loop vectorizer and verify the
; printed textual IR with FileCheck.
; RUN: opt -loop-vectorize < %s -S -o - | FileCheck %s
; Layout string: little-endian (e), 32-bit pointers (p:32:32), 32-bit native
; integer width (n32).
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
; Thumb, Armv8.1-M Mainline, bare-metal EABI target.
target triple = "thumbv8.1m.main-arm-none-eabi"
; @check4: for i in [0, n), accumulates sext(A[i]) * sext(B[i]) into an i32
; whose starting value is undef. The loop carries no vectorization hint, so
; the vectorizer picks the width itself; the directives below require the
; output to contain a 4 x i32 add-reduction intrinsic call.
; CHECK-LABEL: check4
; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32
define i32 @check4(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i32 %n) #0 {
entry:
  ; Skip the loop entirely when n <= 0.
  %has.iters = icmp sgt i32 %n, 0
  br i1 %has.iters, label %loop, label %exit

exit:                                             ; preds = %loop, %entry
  ; undef when the loop never ran, otherwise the final accumulator.
  %result = phi i32 [ undef, %entry ], [ %acc.next, %loop ]
  ret i32 %result

loop:                                             ; preds = %entry, %loop
  %idx = phi i32 [ %idx.next, %loop ], [ 0, %entry ]
  %acc = phi i32 [ %acc.next, %loop ], [ undef, %entry ]

  ; a = (i32) A[idx]
  %a.addr = getelementptr inbounds i8, i8* %A, i32 %idx
  %a.byte = load i8, i8* %a.addr, align 1
  %a.wide = sext i8 %a.byte to i32

  ; b = (i32) B[idx]
  %b.addr = getelementptr inbounds i8, i8* %B, i32 %idx
  %b.byte = load i8, i8* %b.addr, align 1
  %b.wide = sext i8 %b.byte to i32

  ; acc += b * a
  %prod = mul nsw i32 %b.wide, %a.wide
  %acc.next = add nsw i32 %prod, %acc

  ; Advance the index and leave once it reaches n.
  %idx.next = add nuw nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %n
  br i1 %done, label %exit, label %loop
}
; @check16: same multiply-accumulate loop as @check4 (sext(A[i]) * sext(B[i])
; summed into an i32 starting from undef), but the back-edge branch carries
; !llvm.loop !6, which sets llvm.loop.vectorize.width to 16. The directives
; below require the output to contain a 16 x i32 add-reduction intrinsic call.
; CHECK-LABEL: check16
; CHECK: call i32 @llvm.experimental.vector.reduce.add.v16i32
define i32 @check16(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i32 %n) #0 {
entry:
  ; Skip the loop entirely when n <= 0.
  %has.iters = icmp sgt i32 %n, 0
  br i1 %has.iters, label %loop, label %exit

exit:                                             ; preds = %loop, %entry
  ; undef when the loop never ran, otherwise the final accumulator.
  %result = phi i32 [ undef, %entry ], [ %acc.next, %loop ]
  ret i32 %result

loop:                                             ; preds = %entry, %loop
  %idx = phi i32 [ %idx.next, %loop ], [ 0, %entry ]
  %acc = phi i32 [ %acc.next, %loop ], [ undef, %entry ]

  ; a = (i32) A[idx]
  %a.addr = getelementptr inbounds i8, i8* %A, i32 %idx
  %a.byte = load i8, i8* %a.addr, align 1
  %a.wide = sext i8 %a.byte to i32

  ; b = (i32) B[idx]
  %b.addr = getelementptr inbounds i8, i8* %B, i32 %idx
  %b.byte = load i8, i8* %b.addr, align 1
  %b.wide = sext i8 %b.byte to i32

  ; acc += b * a
  %prod = mul nsw i32 %b.wide, %a.wide
  %acc.next = add nsw i32 %prod, %acc

  ; Advance the index and leave once it reaches n; the loop metadata that
  ; forces the vectorization width is attached to this branch.
  %idx.next = add nuw nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %n
  br i1 %done, label %exit, label %loop, !llvm.loop !6
}
; Attribute group #0, referenced by both @check4 and @check16: turns on the
; "+mve" target feature.
attributes #0 = { "target-features"="+mve" }
; Loop metadata attached to the back-edge branch of @check16. !6 is the
; self-referential loop ID node; !7 requests a vectorization width of 16.
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.vectorize.width", i32 16}