This adds cost modelling for the in-loop vectorization added in
745bf6cf44. Up until now these reductions have been modelled as the
original underlying instruction, usually an add. That happens to work
OK for MVE with instructions that are reducing into the same type as
they are working on. But MVE's instructions can perform the equivalent
of an extended MLA as a single instruction:
  %sa = sext <16 x i8> A to <16 x i32>
  %sb = sext <16 x i8> B to <16 x i32>
  %m = mul <16 x i32> %sa, %sb
  %r = vecreduce.add(%m)
->
  R = VMLADAV A, B

There are other instructions for performing add reductions of
v4i32/v8i16/v16i8 into i32 (VADDV), for doing the same with v4i32->i64
(VADDLV), and for performing a v4i32/v8i16 MLA into an i64 (VMLALDAV).
The i64 cases are particularly interesting as there are no native i64
vector add/mul instructions, so the i64 add and mul naturally get very
high costs.
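
As a sketch of the i64 case (assuming the shape of the add_i32_i64
test further down), an extended add reduction such as
  %e = sext <4 x i32> A to <4 x i64>
  %r = vecreduce.add(%e)
should be costed as a single VADDLV rather than as an illegal <4 x i64>
extend followed by i64 adds.
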
Also worth mentioning, under NEON there is the concept of an sdot/udot
instruction which performs a partial reduction from a v16i8 to a v4i32.
They extend and mul/sum the first four elements from the inputs into
the first element of the output, repeating for each of the four output
lanes. They could possibly be represented in the same way as above in
LLVM, so long as a vecreduce.add could perform a partial reduction. The
vectorizer would then produce a combination of in-loop and outer-loop
reductions to efficiently use the sdot and udot instructions. Although
this patch does not do that yet, it does suggest that separating the
input reduction type from the produced result type is a useful concept
to model. It also shows that an MLA reduction as a single instruction
is fairly common.
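
A purely hypothetical sketch, assuming a vecreduce.add that could
produce a partial (vector) result rather than a scalar, might look
like:
  %sa = sext <16 x i8> A to <16 x i32>
  %sb = sext <16 x i8> B to <16 x i32>
  %m = mul <16 x i32> %sa, %sb
  %r = vecreduce.add(%m) to <4 x i32>
->
  R = SDOT A, B
The "to <4 x i32>" form is not valid IR today; it is only meant to
illustrate the separation of input and result reduction types described
above.
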
This patch attempts to improve the cost modelling of in-loop reductions
by:
- Adding some pattern matching in the loop vectorizer cost model to
match reduction patterns that are optionally extended and/or MLA
patterns. This marks the cost of the reduction instruction correctly
and the sext/zext/mul leading up to it as free, which would otherwise
be difficult to model and could be given a very high cost; an example
of the matched form is shown after this list. (In the long run this
can hopefully be replaced by vplan producing a single node and costing
it correctly, but that is not yet something that vplan can do.)
- getExtendedAddReductionCost is added to query the cost of these
extended reduction patterns.
- Expanded the ARM costs to account for these expanded sizes, which is a
fairly simple change in itself.
- Some minor alterations to allow in-loop reductions larger than the
highest vector width, and i64 MVE reductions.
- An extra InLoopReductionImmediateChains map was added to the
vectorizer so that the cost model can efficiently detect which
instructions are reductions.
- The tests have some updates to show what I believe is optimal
vectorization and where we are now.
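
Taken from the vector bodies in the tests below (with names
simplified), the in-loop reduction form that the first point
pattern-matches looks roughly like:
  %wide.load = load <8 x i16>, <8 x i16>* %ptr
  %ext = sext <8 x i16> %wide.load to <8 x i32>
  %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %ext)
  %acc = add i32 %red, %vec.phi
Under MVE the whole extend + reduce + accumulate chain should be costed
as roughly a single VADDVA-style operation rather than as its
individual parts.
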
Put together, this can greatly improve the performance of reduction
loops under MVE.

Differential Revision: https://reviews.llvm.org/D93476
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -instcombine -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -tail-predication=enabled < %s -S -o - | FileCheck %s

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main-arm-none-eabi"

; Should not be vectorized
define i64 @add_i64_i64(i64* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: @add_i64_i64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[X:%.*]], i32 [[I_08]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ADD]] = add nsw i64 [[TMP0]], [[R_07]]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
;
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i64, i64* %x, i32 %i.08
%0 = load i64, i64* %arrayidx, align 8
%add = add nsw i64 %0, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
ret i64 %r.0.lcssa
}

; 4x to use VADDLV
|
|
; FIXME: TailPredicate
|
|
define i64 @add_i32_i64(i32* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i32_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
|
|
; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
|
|
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP6]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
|
|
%0 = load i32, i32* %arrayidx, align 4
|
|
%conv = sext i32 %0 to i64
|
|
%add = add nsw i64 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 4x to use VADDLV
|
|
; FIXME: TailPredicate
|
|
define i64 @add_i16_i64(i16* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i16_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
|
|
; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_08]]
|
|
; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP6]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.08
|
|
%0 = load i16, i16* %arrayidx, align 2
|
|
%conv = sext i16 %0 to i64
|
|
%add = add nsw i64 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 4x to use VADDLV
|
|
; FIXME: TailPredicate
|
|
define i64 @add_i8_i64(i8* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i8_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
|
|
; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_08]]
|
|
; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
|
|
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP6]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_07]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.08
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%conv = zext i8 %0 to i64
|
|
%add = add nuw nsw i64 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 4x to use VADDV.u32
|
|
define i32 @add_i32_i32(i32* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i32_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
|
|
%0 = load i32, i32* %arrayidx, align 4
|
|
%add = add nsw i32 %0, %r.07
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; 8x to use VADDV.u16
|
|
define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i16_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP9:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.08
|
|
%0 = load i16, i16* %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%add = add nsw i32 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; 16x to use VADDV.u16
|
|
define i32 @add_i8_i32(i8* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i8_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.08
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%conv = zext i8 %0 to i32
|
|
%add = add nuw nsw i32 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; 8x to use VADDV.u16
|
|
define signext i16 @add_i16_i16(i16* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i16_i16(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[WIDE_MASKED_LOAD]], <8 x i16> zeroinitializer
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i16 [[TMP3]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP11:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i16 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.010
|
|
%0 = load i16, i16* %arrayidx, align 2
|
|
%add = add i16 %0, %r.09
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i16 %r.0.lcssa
|
|
}
|
|
|
|
; 16x to use VADDV.u8
|
|
define signext i16 @add_i8_i16(i8* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i8_i16(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP2]], <16 x i16> zeroinitializer
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5]] = add i16 [[TMP4]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i16 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.010
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%conv = zext i8 %0 to i16
|
|
%add = add i16 %r.09, %conv
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i16 %r.0.lcssa
|
|
}
|
|
|
|
; 16x to use VADDV.u8
|
|
define zeroext i8 @add_i8_i8(i8* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i8_i8(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[WIDE_MASKED_LOAD]], <16 x i8> zeroinitializer
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i8 [[TMP3]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i8 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp7 = icmp sgt i32 %n, 0
|
|
br i1 %cmp7, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.08 = phi i8 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.09
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%add = add i8 %0, %r.08
|
|
%inc = add nuw nsw i32 %i.09, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i8 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i8 %r.0.lcssa
|
|
}
|
|
|
|
; Not vectorized
|
|
define i64 @mla_i64_i64(i64* nocapture readonly %x, i64* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i64_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[X:%.*]], i32 [[I_010]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[Y:%.*]], i32 [[I_010]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP1]], [[TMP0]]
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[R_09]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i64, i64* %x, i32 %i.010
|
|
%0 = load i64, i64* %arrayidx, align 8
|
|
%arrayidx1 = getelementptr inbounds i64, i64* %y, i32 %i.010
|
|
%1 = load i64, i64* %arrayidx1, align 8
|
|
%mul = mul nsw i64 %1, %0
|
|
%add = add nsw i64 %mul, %r.09
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 4x to use VMLAL.u32
|
|
; FIXME: TailPredicate
|
|
define i64 @mla_i32_i64(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i32_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
|
|
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
|
|
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP4]] to <4 x i64>
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_010]]
|
|
; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[Y]], i32 [[I_010]]
|
|
; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], [[TMP9]]
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_09]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP15:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.010
|
|
%0 = load i32, i32* %arrayidx, align 4
|
|
%arrayidx1 = getelementptr inbounds i32, i32* %y, i32 %i.010
|
|
%1 = load i32, i32* %arrayidx1, align 4
|
|
%mul = mul nsw i32 %1, %0
|
|
%conv = sext i32 %mul to i64
|
|
%add = add nsw i64 %r.09, %conv
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 8x to use VMLAL.u16
|
|
; FIXME: TailPredicate
|
|
define i64 @mla_i16_i64(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i16_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
|
|
; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
|
|
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 2
|
|
; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i32>
|
|
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]]
|
|
; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64>
|
|
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]])
|
|
; CHECK-NEXT: [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_012]]
|
|
; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP11]] to i32
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[Y]], i32 [[I_012]]
|
|
; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2
|
|
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP12]] to i32
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
|
|
; CHECK-NEXT: [[CONV3:%.*]] = sext i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_011]], [[CONV3]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP17:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp10 = icmp sgt i32 %n, 0
|
|
br i1 %cmp10, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.011 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.012
|
|
%0 = load i16, i16* %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.012
|
|
%1 = load i16, i16* %arrayidx1, align 2
|
|
%conv2 = sext i16 %1 to i32
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
%conv3 = sext i32 %mul to i64
|
|
%add = add nsw i64 %r.011, %conv3
|
|
%inc = add nuw nsw i32 %i.012, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 8x to use VMLAL.u16
|
|
; FIXME: 8x, TailPredicate, double-extended
|
|
define i64 @mla_i8_i64(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i8_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
|
|
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
|
|
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
|
|
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
|
|
; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP2]]
|
|
; CHECK-NEXT: [[TMP7:%.*]] = zext <16 x i32> [[TMP6]] to <16 x i64>
|
|
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP7]])
|
|
; CHECK-NEXT: [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_012]]
|
|
; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
|
|
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[Y]], i32 [[I_012]]
|
|
; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
|
|
; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP12]] to i32
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV2]], [[CONV]]
|
|
; CHECK-NEXT: [[CONV3:%.*]] = zext i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_011]], [[CONV3]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP19:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp10 = icmp sgt i32 %n, 0
|
|
br i1 %cmp10, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.011 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.012
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%conv = zext i8 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.012
|
|
%1 = load i8, i8* %arrayidx1, align 1
|
|
%conv2 = zext i8 %1 to i32
|
|
%mul = mul nuw nsw i32 %conv2, %conv
|
|
%conv3 = zext i32 %mul to i64
|
|
%add = add nuw nsw i64 %r.011, %conv3
|
|
%inc = add nuw nsw i32 %i.012, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 4x to use VMLA.u32
|
|
define i32 @mla_i32_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i32_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
|
|
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.010
|
|
%0 = load i32, i32* %arrayidx, align 4
|
|
%arrayidx1 = getelementptr inbounds i32, i32* %y, i32 %i.010
|
|
%1 = load i32, i32* %arrayidx1, align 4
|
|
%mul = mul nsw i32 %1, %0
|
|
%add = add nsw i32 %mul, %r.09
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; 8x to use VMLA.u16
|
|
define i32 @mla_i16_i32(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i16_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP4]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
|
|
; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
|
|
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]]
|
|
; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP6]], <8 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
|
|
; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP21:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp9 = icmp sgt i32 %n, 0
|
|
br i1 %cmp9, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.010 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.011
|
|
%0 = load i16, i16* %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.011
|
|
%1 = load i16, i16* %arrayidx1, align 2
|
|
%conv2 = sext i16 %1 to i32
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
%add = add nsw i32 %mul, %r.010
|
|
%inc = add nuw nsw i32 %i.011, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; 16x to use VMLA.u8
define i32 @mla_i8_i32(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i8_i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP6]], <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP7]])
; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
;
entry:
%cmp9 = icmp sgt i32 %n, 0
br i1 %cmp9, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%r.010 = phi i32 [ %add, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.011
%0 = load i8, i8* %arrayidx, align 1
%conv = zext i8 %0 to i32
%arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.011
%1 = load i8, i8* %arrayidx1, align 1
%conv2 = zext i8 %1 to i32
%mul = mul nuw nsw i32 %conv2, %conv
%add = add nuw nsw i32 %mul, %r.010
%inc = add nuw nsw i32 %i.011, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}

; 8x to use VMLA.u16
define signext i16 @mla_i16_i16(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i16_i16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP3]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP4]], <8 x i16> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP5]])
; CHECK-NEXT: [[TMP7]] = add i16 [[TMP6]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP23:!llvm.loop !.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
;
entry:
%cmp11 = icmp sgt i32 %n, 0
br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
%i.013 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%r.012 = phi i16 [ %add, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.013
%0 = load i16, i16* %arrayidx, align 2
%arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.013
%1 = load i16, i16* %arrayidx1, align 2
%mul = mul i16 %1, %0
%add = add i16 %mul, %r.012
%inc = add nuw nsw i32 %i.013, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
%r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
ret i16 %r.0.lcssa
}

; 16x to use VMLA.u8
define signext i16 @mla_i8_i16(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i8_i16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <16 x i16> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP6]], <16 x i16> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP7]])
; CHECK-NEXT: [[TMP9]] = add i16 [[TMP8]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP24:!llvm.loop !.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
;
entry:
%cmp11 = icmp sgt i32 %n, 0
br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
%i.013 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%r.012 = phi i16 [ %add, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.013
%0 = load i8, i8* %arrayidx, align 1
%conv = zext i8 %0 to i16
%arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.013
%1 = load i8, i8* %arrayidx1, align 1
%conv2 = zext i8 %1 to i16
%mul = mul nuw i16 %conv2, %conv
%add = add i16 %mul, %r.012
%inc = add nuw nsw i32 %i.013, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
%r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
ret i16 %r.0.lcssa
}

; 16x to use VMLA.u8
define zeroext i8 @mla_i8_i8(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i8_i8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP10]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP4]], <16 x i8> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP5]])
; CHECK-NEXT: [[TMP7]] = add i8 [[TMP6]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP25:!llvm.loop !.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i8 [[R_0_LCSSA]]
;
entry:
%cmp10 = icmp sgt i32 %n, 0
br i1 %cmp10, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
%i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%r.011 = phi i8 [ %add, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.012
%0 = load i8, i8* %arrayidx, align 1
%arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.012
%1 = load i8, i8* %arrayidx1, align 1
%mul = mul i8 %1, %0
%add = add i8 %mul, %r.011
%inc = add nuw nsw i32 %i.012, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
%r.0.lcssa = phi i8 [ 0, %entry ], [ %add, %for.body ]
ret i8 %r.0.lcssa
}

; Make sure interleave group members feeding in-loop reductions can be handled.
define i32 @reduction_interleave_group(i32 %n, i32* %arr) #0 {
; CHECK-LABEL: @reduction_interleave_group(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GUARD:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[GUARD]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 6
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -4
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC]], 1
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i32 -1
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[STRIDED_VEC1]])
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[STRIDED_VEC]])
; CHECK-NEXT: [[TMP10]] = add i32 [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP26:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[RED_PHI:%.*]] = phi i32 [ [[RED_2:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ADD:%.*]] = or i32 [[IV]], 1
; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i32 [[ADD]]
; CHECK-NEXT: [[L_0:%.*]] = load i32, i32* [[GEP_0]], align 4
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i32 [[IV]]
; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[GEP_1]], align 4
; CHECK-NEXT: [[RED_1:%.*]] = add i32 [[L_0]], [[RED_PHI]]
; CHECK-NEXT: [[RED_2]] = add i32 [[RED_1]], [[L_1]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], [[LOOP27:!llvm.loop !.*]]
; CHECK: exit:
; CHECK-NEXT: [[RET_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RED_2]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RET_LCSSA]]
;
entry:
%guard = icmp sgt i32 %n, 0
br i1 %guard, label %for.body, label %exit

for.body: ; preds = %for.body.preheader, %for.body
%iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
%red.phi = phi i32 [ %red.2, %for.body ], [ 0, %entry ]
%add = or i32 %iv, 1
%gep.0 = getelementptr inbounds i32, i32* %arr, i32 %add
%l.0 = load i32, i32* %gep.0, align 4
%gep.1 = getelementptr inbounds i32, i32* %arr, i32 %iv
%l.1 = load i32, i32* %gep.1, align 4
%red.1 = add i32 %l.0, %red.phi
%red.2 = add i32 %red.1, %l.1
%iv.next = add nuw nsw i32 %iv, 2
%cmp = icmp slt i32 %iv.next, %n
br i1 %cmp, label %for.body, label %exit

exit:
%ret.lcssa = phi i32 [ 0, %entry ], [ %red.2, %for.body ]
ret i32 %ret.lcssa
}

attributes #0 = { "target-features"="+mve" }