This expands upon the in-loop reductions added in e9761688e41cb9e976, allowing them to be inserted into tail-folded loops. Reductions are generated in the form:

  x = select(mask, vecop, zero)
  v = vecreduce.add(x)
  c = add chain, v

where zero is chosen as the identity value for add reductions. The backend is then expected to fold the select and the vecreduce into a single predicated instruction. Most of the code is fairly straightforward, except for the creation of block masks, which must be created in dominance order. They are now inserted after any phis, preserving the requirements of the underlying IR.

Differential Revision: https://reviews.llvm.org/D84451
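As a point of reference, here is a minimal sketch of the predicated reduction pattern described above, for a 4 x i32 add reduction. The value names are illustrative rather than taken from this test; the intrinsics are the same ones that appear in the CHECK lines below. The backend can then fold the select and the vecreduce into a single predicated reduction instruction.

  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> undef)
  ; inactive lanes contribute the add identity (zero)
  %x = select <4 x i1> %mask, <4 x i32> %wide, <4 x i32> zeroinitializer
  %v = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  ; scalar reduction chain carried by the loop phi
  %chain.next = add i32 %v, %chain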
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -instcombine -simplifycfg -tail-predication=enabled < %s -S -o - | FileCheck %s
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main-arm-none-eabi"
define i64 @add_i64_i64(i64* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: @add_i64_i64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[X:%.*]], i32 [[I_08]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ADD]] = add nsw i64 [[TMP0]], [[R_07]]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
;
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body, label %for.cond.cleanup
for.body: ; preds = %entry, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i64, i64* %x, i32 %i.08
%0 = load i64, i64* %arrayidx, align 8
%add = add nsw i64 %0, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
ret i64 %r.0.lcssa
}
; FIXME: 4x
|
|
define i64 @add_i32_i64(i32* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i32_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[I_08]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP0]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
|
|
%0 = load i32, i32* %arrayidx, align 4
|
|
%conv = sext i32 %0 to i64
|
|
%add = add nsw i64 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; FIXME: 4x ?
|
|
define i64 @add_i16_i64(i16* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i16_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[I_08]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.08
|
|
%0 = load i16, i16* %arrayidx, align 2
|
|
%conv = sext i16 %0 to i64
|
|
%add = add nsw i64 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; FIXME: 4x ?
|
|
define i64 @add_i8_i64(i8* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i8_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[I_08]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
|
|
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_07]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.08
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%conv = zext i8 %0 to i64
|
|
%add = add nuw nsw i64 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
define i32 @add_i32_i32(i32* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i32_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
|
|
%0 = load i32, i32* %arrayidx, align 4
|
|
%add = add nsw i32 %0, %r.07
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; FIXME: 8x
|
|
define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i16_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP1]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> undef)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.08
|
|
%0 = load i16, i16* %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%add = add nsw i32 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; FIXME: 16x
|
|
define i32 @add_i8_i32(i8* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i8_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> undef)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP3:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.08
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%conv = zext i8 %0 to i32
|
|
%add = add nuw nsw i32 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
define signext i16 @add_i16_i16(i16* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i16_i16(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> undef)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[WIDE_MASKED_LOAD]], <8 x i16> zeroinitializer
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i16 [[TMP3]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i16 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.010
|
|
%0 = load i16, i16* %arrayidx, align 2
|
|
%add = add i16 %0, %r.09
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i16 %r.0.lcssa
|
|
}
|
|
|
|
; FIXME: 16x ?
|
|
define signext i16 @add_i8_i16(i8* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i8_i16(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> undef)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP2]], <8 x i16> zeroinitializer
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5]] = add i16 [[TMP4]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i16 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.010
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%conv = zext i8 %0 to i16
|
|
%add = add i16 %r.09, %conv
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i16 %r.0.lcssa
|
|
}
|
|
|
|
define zeroext i8 @add_i8_i8(i8* nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i8_i8(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[WIDE_MASKED_LOAD]], <16 x i8> zeroinitializer
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i8 [[TMP3]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i8 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp7 = icmp sgt i32 %n, 0
|
|
br i1 %cmp7, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.08 = phi i8 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.09
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%add = add i8 %0, %r.08
|
|
%inc = add nuw nsw i32 %i.09, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i8 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i8 %r.0.lcssa
|
|
}
|
|
|
|
define i64 @mla_i64_i64(i64* nocapture readonly %x, i64* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i64_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[X:%.*]], i32 [[I_010]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[Y:%.*]], i32 [[I_010]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP1]], [[TMP0]]
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[R_09]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i64, i64* %x, i32 %i.010
|
|
%0 = load i64, i64* %arrayidx, align 8
|
|
%arrayidx1 = getelementptr inbounds i64, i64* %y, i32 %i.010
|
|
%1 = load i64, i64* %arrayidx1, align 8
|
|
%mul = mul nsw i64 %1, %0
|
|
%add = add nsw i64 %mul, %r.09
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
define i64 @mla_i32_i64(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i32_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[I_010]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[I_010]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP1]], [[TMP0]]
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_09]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.010
|
|
%0 = load i32, i32* %arrayidx, align 4
|
|
%arrayidx1 = getelementptr inbounds i32, i32* %y, i32 %i.010
|
|
%1 = load i32, i32* %arrayidx1, align 4
|
|
%mul = mul nsw i32 %1, %0
|
|
%conv = sext i32 %mul to i64
|
|
%add = add nsw i64 %r.09, %conv
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
define i64 @mla_i16_i64(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i16_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[I_012]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[I_012]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2
|
|
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP1]] to i32
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
|
|
; CHECK-NEXT: [[CONV3:%.*]] = sext i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_011]], [[CONV3]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp10 = icmp sgt i32 %n, 0
|
|
br i1 %cmp10, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.011 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.012
|
|
%0 = load i16, i16* %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.012
|
|
%1 = load i16, i16* %arrayidx1, align 2
|
|
%conv2 = sext i16 %1 to i32
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
%conv3 = sext i32 %mul to i64
|
|
%add = add nsw i64 %r.011, %conv3
|
|
%inc = add nuw nsw i32 %i.012, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
define i64 @mla_i8_i64(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i8_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[I_012]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
|
|
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[I_012]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
|
|
; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV2]], [[CONV]]
|
|
; CHECK-NEXT: [[CONV3:%.*]] = zext i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_011]], [[CONV3]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp10 = icmp sgt i32 %n, 0
|
|
br i1 %cmp10, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.011 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.012
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%conv = zext i8 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.012
|
|
%1 = load i8, i8* %arrayidx1, align 1
|
|
%conv2 = zext i8 %1 to i32
|
|
%mul = mul nuw nsw i32 %conv2, %conv
|
|
%conv3 = zext i32 %mul to i64
|
|
%add = add nuw nsw i64 %r.011, %conv3
|
|
%inc = add nuw nsw i32 %i.012, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
define i32 @mla_i32_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i32_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
|
|
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.010
|
|
%0 = load i32, i32* %arrayidx, align 4
|
|
%arrayidx1 = getelementptr inbounds i32, i32* %y, i32 %i.010
|
|
%1 = load i32, i32* %arrayidx1, align 4
|
|
%mul = mul nsw i32 %1, %0
|
|
%add = add nsw i32 %mul, %r.09
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
define i32 @mla_i16_i32(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i16_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP1]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> undef)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <4 x i16>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP4]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> undef)
|
|
; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD1]] to <4 x i32>
|
|
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP2]]
|
|
; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
|
|
; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp9 = icmp sgt i32 %n, 0
|
|
br i1 %cmp9, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.010 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.011
|
|
%0 = load i16, i16* %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.011
|
|
%1 = load i16, i16* %arrayidx1, align 2
|
|
%conv2 = sext i16 %1 to i32
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
%add = add nsw i32 %mul, %r.010
|
|
%inc = add nuw nsw i32 %i.011, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
define i32 @mla_i8_i32(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i8_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> undef)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP4]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> undef)
|
|
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32>
|
|
; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP5]], [[TMP2]]
|
|
; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
|
|
; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP9:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp9 = icmp sgt i32 %n, 0
|
|
br i1 %cmp9, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.010 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.011
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%conv = zext i8 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.011
|
|
%1 = load i8, i8* %arrayidx1, align 1
|
|
%conv2 = zext i8 %1 to i32
|
|
%mul = mul nuw nsw i32 %conv2, %conv
|
|
%add = add nuw nsw i32 %mul, %r.010
|
|
%inc = add nuw nsw i32 %i.011, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
define signext i16 @mla_i16_i16(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i16_i16(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> undef)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <8 x i16>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP3]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> undef)
|
|
; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP4]], <8 x i16> zeroinitializer
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i16 [[TMP6]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp11 = icmp sgt i32 %n, 0
|
|
br i1 %cmp11, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.013 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.012 = phi i16 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.013
|
|
%0 = load i16, i16* %arrayidx, align 2
|
|
%arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.013
|
|
%1 = load i16, i16* %arrayidx1, align 2
|
|
%mul = mul i16 %1, %0
|
|
%add = add i16 %mul, %r.012
|
|
%inc = add nuw nsw i32 %i.013, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i16 %r.0.lcssa
|
|
}
|
|
|
|
define signext i16 @mla_i8_i16(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i8_i16(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> undef)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP4]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> undef)
|
|
; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD1]] to <8 x i16>
|
|
; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <8 x i16> [[TMP5]], [[TMP2]]
|
|
; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP6]], <8 x i16> zeroinitializer
|
|
; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP7]])
|
|
; CHECK-NEXT: [[TMP9]] = add i16 [[TMP8]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP11:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp11 = icmp sgt i32 %n, 0
|
|
br i1 %cmp11, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.013 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.012 = phi i16 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.013
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%conv = zext i8 %0 to i16
|
|
%arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.013
|
|
%1 = load i8, i8* %arrayidx1, align 1
|
|
%conv2 = zext i8 %1 to i16
|
|
%mul = mul nuw i16 %conv2, %conv
|
|
%add = add i16 %mul, %r.012
|
|
%inc = add nuw nsw i32 %i.013, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i16 %r.0.lcssa
|
|
}
|
|
|
|
define zeroext i8 @mla_i8_i8(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i8_i8(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP10]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef)
|
|
; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP4]], <16 x i8> zeroinitializer
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i8 [[TMP6]], [[VEC_PHI]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i8 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp10 = icmp sgt i32 %n, 0
|
|
br i1 %cmp10, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.011 = phi i8 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.012
|
|
%0 = load i8, i8* %arrayidx, align 1
|
|
%arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.012
|
|
%1 = load i8, i8* %arrayidx1, align 1
|
|
%mul = mul i8 %1, %0
|
|
%add = add i8 %mul, %r.011
|
|
%inc = add nuw nsw i32 %i.012, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i8 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i8 %r.0.lcssa
|
|
}
|
|
|
|
attributes #0 = { "target-features"="+mve" }
|