Files
clang-p2996/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
David Green 091244023a [ARM] Move VPTBlock pass after post-ra scheduling
Currently when tail predicating loops, vpt blocks need to be created
with the vctp predicate in case we need to revert to non-tail predicated
form. This has the unfortunate side effect of severely hampering post-ra
scheduling at times as the instructions are already stuck in vpt blocks,
not allowed to be independently ordered.

This patch addresses that by just moving the creation of VPT blocks
later in the pipeline, after post-ra scheduling has been performed. This
allows more optimal scheduling post-ra before the vpt blocks are
created, leading to more optimal tail predicated loops.

Differential Revision: https://reviews.llvm.org/D113094
2021-11-04 18:42:12 +00:00

471 lines
21 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled --verify-machineinstrs %s -o - | FileCheck %s
define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
; CHECK-LABEL: vpsel_mul_reduce_add:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: itt eq
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: .LBB0_1: @ %vector.ph
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: add.w r12, r3, #3
; CHECK-NEXT: mov.w lr, #1
; CHECK-NEXT: bic r12, r12, #3
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: add.w lr, lr, r12, lsr #2
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: and r4, r12, #15
; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r2], #16
; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
; CHECK-NEXT: vdup.32 q3, r4
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vpt.i32 eq, q3, zr
; CHECK-NEXT: vmovt q1, q2
; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vmul.i32 q1, q1, q2
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
%tmp6 = bitcast i32* %tmp5 to <4 x i32>*
%wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%rem = urem i32 %index, 16
%rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
%rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0>
%wide.masked.load = select <4 x i1> %cmp, <4 x i32> %wide.masked.load.b, <4 x i32> %wide.masked.load.c
%mul = mul nsw <4 x i32> %wide.masked.load, %wide.masked.load.a
%add = add nsw <4 x i32> %mul, %vec.phi
%index.next = add i32 %index, 4
%tmp7 = icmp eq i32 %index.next, %n.vec
br i1 %tmp7, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
%tmp9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp8)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp9, %middle.block ]
ret i32 %res.0.lcssa
}
define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
; CHECK-LABEL: vpsel_mul_reduce_add_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: ldr.w r12, [sp, #32]
; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: beq .LBB1_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: add.w r4, r12, #3
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: bic r4, r4, #3
; CHECK-NEXT: sub.w lr, r4, #4
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: add.w lr, r4, lr, lsr #2
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: and r5, r4, #15
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
; CHECK-NEXT: vldrwt.u32 q2, [r3], #16
; CHECK-NEXT: vldrwt.u32 q3, [r2], #16
; CHECK-NEXT: vdup.32 q4, r5
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vpt.i32 eq, q4, zr
; CHECK-NEXT: vsubt.i32 q1, q3, q2
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vmul.i32 q1, q1, q2
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: le lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r4, r5, r7, pc}
i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
%tmp6 = bitcast i32* %tmp5 to <4 x i32>*
%wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
%tmp8 = bitcast i32* %tmp7 to <4 x i32>*
%wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%sub = sub <4 x i32> %wide.masked.load.c, %wide.masked.load.d
%rem = urem i32 %index, 16
%rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
%rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0>
%sel = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %wide.masked.load.b
%mul = mul <4 x i32> %sel, %wide.masked.load.a
%add = add <4 x i32> %mul, %vec.phi
%index.next = add i32 %index, 4
%cmp.exit = icmp eq i32 %index.next, %n.vec
br i1 %cmp.exit, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
%reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
ret i32 %res.0.lcssa
}
define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
; CHECK-LABEL: and_mul_reduce_add:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: ldr.w r12, [sp, #12]
; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: beq .LBB2_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: add.w lr, r12, #3
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: bic lr, lr, #3
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: sub.w lr, lr, #4
; CHECK-NEXT: add.w r4, r4, lr, lsr #2
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vsub.i32 q1, q2, q1
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vcmpt.i32 eq, q1, zr
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
; CHECK-NEXT: vldrwt.u32 q2, [r2], #16
; CHECK-NEXT: vmul.i32 q1, q2, q1
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: le lr, .LBB2_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .LBB2_4:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop {r4, pc}
i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
%cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
%mask = and <4 x i1> %cmp, %tmp1
%tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
%tmp6 = bitcast i32* %tmp5 to <4 x i32>*
%wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
%tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
%tmp8 = bitcast i32* %tmp7 to <4 x i32>*
%wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
%mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d
%add = add <4 x i32> %mul, %vec.phi
%index.next = add i32 %index, 4
%cmp.exit = icmp eq i32 %index.next, %n.vec
br i1 %cmp.exit, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
%reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
ret i32 %res.0.lcssa
}
define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
; CHECK-LABEL: or_mul_reduce_add:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: ldr.w r12, [sp, #12]
; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: beq .LBB3_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: add.w lr, r12, #3
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: bic lr, lr, #3
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: sub.w lr, lr, #4
; CHECK-NEXT: add.w r4, r4, lr, lsr #2
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: vpnot
; CHECK-NEXT: vsub.i32 q1, q2, q1
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vpstee
; CHECK-NEXT: vcmpt.i32 ne, q1, zr
; CHECK-NEXT: vldrwe.u32 q1, [r3], #16
; CHECK-NEXT: vldrwe.u32 q2, [r2], #16
; CHECK-NEXT: vmul.i32 q1, q2, q1
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: le lr, .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .LBB3_4:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
%cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
%mask = or <4 x i1> %cmp, %tmp1
%tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
%tmp6 = bitcast i32* %tmp5 to <4 x i32>*
%wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
%tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
%tmp8 = bitcast i32* %tmp7 to <4 x i32>*
%wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
%mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d
%add = add <4 x i32> %mul, %vec.phi
%index.next = add i32 %index, 4
%cmp.exit = icmp eq i32 %index.next, %n.vec
br i1 %cmp.exit, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
%reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
ret i32 %res.0.lcssa
}
define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2) {
; CHECK-LABEL: continue_on_zero:
; CHECK: @ %bb.0: @ %bb
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB4_1: @ %bb3
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB4_2: @ %bb9
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q1, [r0]
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB4_2
; CHECK-NEXT: @ %bb.3: @ %bb27
; CHECK-NEXT: pop {r7, pc}
bb:
%tmp = icmp eq i32 %arg2, 0
br i1 %tmp, label %bb27, label %bb3
bb3: ; preds = %bb
%tmp4 = add i32 %arg2, 3
%tmp5 = and i32 %tmp4, -4
br label %bb9
bb9: ; preds = %bb9, %bb3
%tmp10 = phi i32 [ 0, %bb3 ], [ %tmp25, %bb9 ]
%tmp14 = getelementptr inbounds i32, i32* %arg1, i32 %tmp10
%tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %arg2)
%tmp16 = bitcast i32* %tmp14 to <4 x i32>*
%tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp16, i32 4, <4 x i1> %tmp15, <4 x i32> undef)
%tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer
%tmp19 = getelementptr inbounds i32, i32* %arg, i32 %tmp10
%tmp20 = and <4 x i1> %tmp18, %tmp15
%tmp21 = bitcast i32* %tmp19 to <4 x i32>*
%tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp21, i32 4, <4 x i1> %tmp20, <4 x i32> undef)
%tmp23 = mul nsw <4 x i32> %tmp22, %tmp17
%tmp24 = bitcast i32* %tmp19 to <4 x i32>*
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %tmp24, i32 4, <4 x i1> %tmp20)
%tmp25 = add i32 %tmp10, 4
%tmp26 = icmp eq i32 %tmp25, %tmp5
br i1 %tmp26, label %bb27, label %bb9
bb27: ; preds = %bb9, %bb
ret void
}
define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2, i32 %arg3) {
; CHECK-LABEL: range_test:
; CHECK: @ %bb.0: @ %bb
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB5_1: @ %bb4
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB5_2: @ %bb12
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vcmpt.s32 le, q0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB5_2
; CHECK-NEXT: @ %bb.3: @ %bb32
; CHECK-NEXT: pop {r7, pc}
bb:
%tmp = icmp eq i32 %arg3, 0
br i1 %tmp, label %bb32, label %bb4
bb4: ; preds = %bb
%tmp5 = add i32 %arg3, 3
%tmp6 = and i32 %tmp5, -4
%tmp10 = insertelement <4 x i32> undef, i32 %arg2, i32 0
%tmp11 = shufflevector <4 x i32> %tmp10, <4 x i32> undef, <4 x i32> zeroinitializer
br label %bb12
bb12: ; preds = %bb12, %bb4
%tmp13 = phi i32 [ 0, %bb4 ], [ %tmp30, %bb12 ]
%tmp17 = getelementptr inbounds i32, i32* %arg, i32 %tmp13
%tmp18= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %arg3)
%tmp19 = bitcast i32* %tmp17 to <4 x i32>*
%tmp20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp19, i32 4, <4 x i1> %tmp18, <4 x i32> undef)
%tmp21 = icmp ne <4 x i32> %tmp20, zeroinitializer
%tmp22 = icmp sle <4 x i32> %tmp20, %tmp11
%tmp23 = getelementptr inbounds i32, i32* %arg1, i32 %tmp13
%tmp24 = and <4 x i1> %tmp22, %tmp21
%tmp25 = and <4 x i1> %tmp24, %tmp18
%tmp26 = bitcast i32* %tmp23 to <4 x i32>*
%tmp27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp26, i32 4, <4 x i1> %tmp25, <4 x i32> undef)
%tmp28 = mul nsw <4 x i32> %tmp27, %tmp20
%tmp29 = bitcast i32* %tmp17 to <4 x i32>*
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp28, <4 x i32>* %tmp29, i32 4, <4 x i1> %tmp25)
%tmp30 = add i32 %tmp13, 4
%tmp31 = icmp eq i32 %tmp30, %tmp6
br i1 %tmp31, label %bb32, label %bb12
bb32: ; preds = %bb12, %bb
ret void
}
; Function Attrs: argmemonly nounwind readonly willreturn
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
; Function Attrs: nounwind readnone willreturn
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)