These tests rely on SCEV recognizing an "or" with no common bits as an "add". Add the disjoint flag to the relevant or instructions in preparation for switching SCEV to use the flag instead of the ValueTracking query. The IR with the disjoint flag matches what InstCombine would produce.
1882 lines
77 KiB
LLVM
1882 lines
77 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 %s -o - | FileCheck %s
|
|
|
|
; float_float_mul(a, b, c, N): element-wise c[i] = a[i] * b[i] over N floats.
;
; The IR is already in vectorized, runtime-unrolled form:
;   * vector.memcheck - runtime overlap test of c against a and of c against b;
;                       any overlap falls back to the scalar path.
;   * vector.body     - 4 x float per iteration, up to %n.vec = N & -4.
;   * for.body.prol   - scalar prologue running (N & 3) iterations.
;   * for.body        - scalar loop unrolled by four.
; The expected output keeps vector.body and for.body.prol as low-overhead
; loops (wls / le) with post-incremented accesses, while the unrolled for.body
; is closed by subs / bne.
define arm_aapcs_vfpcc void @float_float_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: .LBB0_1: @ %for.body.preheader
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB0_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB0_4
; CHECK-NEXT: .LBB0_3: @ %vector.memcheck
; CHECK-NEXT: add.w r7, r1, r3, lsl #2
; CHECK-NEXT: add.w r6, r2, r3, lsl #2
; CHECK-NEXT: cmp r7, r2
; CHECK-NEXT: add.w r5, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r1
; CHECK-NEXT: csel r7, zr, r7, ls
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r2
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: tst r5, r6
; CHECK-NEXT: it eq
; CHECK-NEXT: cmpeq r7, #0
; CHECK-NEXT: beq .LBB0_11
; CHECK-NEXT: .LBB0_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: add.w r8, r7, r3
; CHECK-NEXT: and r5, r3, #3
; CHECK-NEXT: wls lr, r5, .LBB0_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r4, r12, r5
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: mov r12, r4
; CHECK-NEXT: .LBB0_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r6!, {s0}
; CHECK-NEXT: vldmia r5!, {s2}
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstmia r7!, {s0}
; CHECK-NEXT: le lr, .LBB0_6
; CHECK-NEXT: .LBB0_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp.w r8, #3
; CHECK-NEXT: blo .LBB0_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r3, r12
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB0_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r7, r1, r12
; CHECK-NEXT: add.w r6, r0, r12
; CHECK-NEXT: add.w r5, r2, r12
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vldr s0, [r7]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5]
; CHECK-NEXT: vldr s0, [r7, #4]
; CHECK-NEXT: vldr s2, [r6, #4]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #4]
; CHECK-NEXT: vldr s0, [r7, #8]
; CHECK-NEXT: vldr s2, [r6, #8]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #8]
; CHECK-NEXT: vldr s0, [r7, #12]
; CHECK-NEXT: vldr s2, [r6, #12]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #12]
; CHECK-NEXT: bne .LBB0_9
; CHECK-NEXT: .LBB0_10:
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .LBB0_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB0_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB0_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB0_4
; CHECK-NEXT: b .LBB0_10
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck

; Scalar entry: %i.09.ph is 0, or %n.vec when the vector loop already ran.
for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %0 = xor i32 %i.09.ph, -1
  %1 = add i32 %0, %N
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

; Prologue: (N & 3) scalar iterations, counted down in %prol.iter.
for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
  %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
  %2 = load float, ptr %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
  %3 = load float, ptr %arrayidx1.prol, align 4
  %mul.prol = fmul float %2, %3
  %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
  store float %mul.prol, ptr %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

; %1 = N - %i.09.ph - 1; when it is below 3 the unrolled loop is skipped.
for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

; Runtime alias check: c+N > a && a+N > c, or c+N > b && b+N > c.
vector.memcheck: ; preds = %for.body.preheader
  %scevgep = getelementptr float, ptr %c, i32 %N
  %scevgep13 = getelementptr float, ptr %a, i32 %N
  %scevgep16 = getelementptr float, ptr %b, i32 %N
  %bound0 = icmp ugt ptr %scevgep13, %c
  %bound1 = icmp ugt ptr %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  %bound018 = icmp ugt ptr %scevgep16, %c
  %bound119 = icmp ugt ptr %scevgep, %b
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph

; %n.vec = N with its two low bits cleared (vector trip count in elements).
vector.ph: ; preds = %vector.memcheck
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, ptr %a, i32 %index
  %wide.load = load <4 x float>, ptr %5, align 4
  %6 = getelementptr inbounds float, ptr %b, i32 %index
  %wide.load21 = load <4 x float>, ptr %6, align 4
  %7 = fmul <4 x float> %wide.load, %wide.load21
  %8 = getelementptr inbounds float, ptr %c, i32 %index
  store <4 x float> %7, ptr %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %middle.block, label %vector.body

; Leftover elements (N != %n.vec) are finished by the scalar path.
middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22

for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

; Scalar loop unrolled by four; exits when %inc.3 reaches N.
for.body: ; preds = %for.body.prol.loopexit, %for.body
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
  %10 = load float, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
  %11 = load float, ptr %arrayidx1, align 4
  %mul = fmul float %10, %11
  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
  store float %mul, ptr %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
  %12 = load float, ptr %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
  %13 = load float, ptr %arrayidx1.1, align 4
  %mul.1 = fmul float %12, %13
  %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
  store float %mul.1, ptr %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
  %14 = load float, ptr %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
  %15 = load float, ptr %arrayidx1.2, align 4
  %mul.2 = fmul float %14, %15
  %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
  store float %mul.2, ptr %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
  %16 = load float, ptr %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
  %17 = load float, ptr %arrayidx1.3, align 4
  %mul.3 = fmul float %16, %17
  %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
  store float %mul.3, ptr %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %exitcond.3 = icmp eq i32 %inc.3, %N
  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}
|
|
|
|
; float_float_add(a, b, c, N): element-wise c[i] = a[i] + b[i] over N floats.
;
; The IR is already in vectorized, runtime-unrolled form:
;   * vector.memcheck - runtime overlap test of c against a and of c against b;
;                       any overlap falls back to the scalar path.
;   * vector.body     - 4 x float per iteration, up to %n.vec = N & -4.
;   * for.body.prol   - scalar prologue running (N & 3) iterations.
;   * for.body        - scalar loop unrolled by four.
; The expected output keeps vector.body and for.body.prol as low-overhead
; loops (wls / le) with post-incremented accesses, while the unrolled for.body
; is closed by subs / bne.
define arm_aapcs_vfpcc void @float_float_add(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_add:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: .LBB1_1: @ %for.body.preheader
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB1_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_3: @ %vector.memcheck
; CHECK-NEXT: add.w r7, r1, r3, lsl #2
; CHECK-NEXT: add.w r6, r2, r3, lsl #2
; CHECK-NEXT: cmp r7, r2
; CHECK-NEXT: add.w r5, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r1
; CHECK-NEXT: csel r7, zr, r7, ls
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r2
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: tst r5, r6
; CHECK-NEXT: it eq
; CHECK-NEXT: cmpeq r7, #0
; CHECK-NEXT: beq .LBB1_11
; CHECK-NEXT: .LBB1_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: add.w r8, r7, r3
; CHECK-NEXT: and r5, r3, #3
; CHECK-NEXT: wls lr, r5, .LBB1_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r4, r12, r5
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: mov r12, r4
; CHECK-NEXT: .LBB1_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r6!, {s0}
; CHECK-NEXT: vldmia r5!, {s2}
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstmia r7!, {s0}
; CHECK-NEXT: le lr, .LBB1_6
; CHECK-NEXT: .LBB1_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp.w r8, #3
; CHECK-NEXT: blo .LBB1_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r3, r12
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB1_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r7, r1, r12
; CHECK-NEXT: add.w r6, r0, r12
; CHECK-NEXT: add.w r5, r2, r12
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vldr s0, [r7]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5]
; CHECK-NEXT: vldr s0, [r7, #4]
; CHECK-NEXT: vldr s2, [r6, #4]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #4]
; CHECK-NEXT: vldr s0, [r7, #8]
; CHECK-NEXT: vldr s2, [r6, #8]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #8]
; CHECK-NEXT: vldr s0, [r7, #12]
; CHECK-NEXT: vldr s2, [r6, #12]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #12]
; CHECK-NEXT: bne .LBB1_9
; CHECK-NEXT: .LBB1_10:
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .LBB1_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB1_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB1_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB1_4
; CHECK-NEXT: b .LBB1_10
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck

; Scalar entry: %i.09.ph is 0, or %n.vec when the vector loop already ran.
for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %0 = xor i32 %i.09.ph, -1
  %1 = add i32 %0, %N
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

; Prologue: (N & 3) scalar iterations, counted down in %prol.iter.
for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
  %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
  %2 = load float, ptr %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
  %3 = load float, ptr %arrayidx1.prol, align 4
  %add.prol = fadd float %2, %3
  %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
  store float %add.prol, ptr %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

; %1 = N - %i.09.ph - 1; when it is below 3 the unrolled loop is skipped.
for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

; Runtime alias check: c+N > a && a+N > c, or c+N > b && b+N > c.
vector.memcheck: ; preds = %for.body.preheader
  %scevgep = getelementptr float, ptr %c, i32 %N
  %scevgep13 = getelementptr float, ptr %a, i32 %N
  %scevgep16 = getelementptr float, ptr %b, i32 %N
  %bound0 = icmp ugt ptr %scevgep13, %c
  %bound1 = icmp ugt ptr %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  %bound018 = icmp ugt ptr %scevgep16, %c
  %bound119 = icmp ugt ptr %scevgep, %b
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph

; %n.vec = N with its two low bits cleared (vector trip count in elements).
vector.ph: ; preds = %vector.memcheck
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, ptr %a, i32 %index
  %wide.load = load <4 x float>, ptr %5, align 4
  %6 = getelementptr inbounds float, ptr %b, i32 %index
  %wide.load21 = load <4 x float>, ptr %6, align 4
  %7 = fadd <4 x float> %wide.load, %wide.load21
  %8 = getelementptr inbounds float, ptr %c, i32 %index
  store <4 x float> %7, ptr %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %middle.block, label %vector.body

; Leftover elements (N != %n.vec) are finished by the scalar path.
middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22

for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

; Scalar loop unrolled by four; exits when %inc.3 reaches N.
for.body: ; preds = %for.body.prol.loopexit, %for.body
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
  %10 = load float, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
  %11 = load float, ptr %arrayidx1, align 4
  %add = fadd float %10, %11
  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
  store float %add, ptr %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
  %12 = load float, ptr %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
  %13 = load float, ptr %arrayidx1.1, align 4
  %add.1 = fadd float %12, %13
  %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
  store float %add.1, ptr %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
  %14 = load float, ptr %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
  %15 = load float, ptr %arrayidx1.2, align 4
  %add.2 = fadd float %14, %15
  %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
  store float %add.2, ptr %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
  %16 = load float, ptr %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
  %17 = load float, ptr %arrayidx1.3, align 4
  %add.3 = fadd float %16, %17
  %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
  store float %add.3, ptr %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %exitcond.3 = icmp eq i32 %inc.3, %N
  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}
|
|
|
|
; float_float_sub(a, b, c, N): element-wise c[i] = a[i] - b[i] over N floats.
;
; The IR is already in vectorized, runtime-unrolled form:
;   * vector.memcheck - runtime overlap test of c against a and of c against b;
;                       any overlap falls back to the scalar path.
;   * vector.body     - 4 x float per iteration, up to %n.vec = N & -4.
;   * for.body.prol   - scalar prologue running (N & 3) iterations.
;   * for.body        - scalar loop unrolled by four.
; The expected output keeps vector.body and for.body.prol as low-overhead
; loops (wls / le) with post-incremented accesses, while the unrolled for.body
; is closed by subs / bne.
define arm_aapcs_vfpcc void @float_float_sub(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_sub:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: .LBB2_1: @ %for.body.preheader
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB2_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB2_4
; CHECK-NEXT: .LBB2_3: @ %vector.memcheck
; CHECK-NEXT: add.w r7, r1, r3, lsl #2
; CHECK-NEXT: add.w r6, r2, r3, lsl #2
; CHECK-NEXT: cmp r7, r2
; CHECK-NEXT: add.w r5, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r1
; CHECK-NEXT: csel r7, zr, r7, ls
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r2
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: tst r5, r6
; CHECK-NEXT: it eq
; CHECK-NEXT: cmpeq r7, #0
; CHECK-NEXT: beq .LBB2_11
; CHECK-NEXT: .LBB2_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: add.w r8, r7, r3
; CHECK-NEXT: and r5, r3, #3
; CHECK-NEXT: wls lr, r5, .LBB2_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r4, r12, r5
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: mov r12, r4
; CHECK-NEXT: .LBB2_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r6!, {s0}
; CHECK-NEXT: vldmia r5!, {s2}
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstmia r7!, {s0}
; CHECK-NEXT: le lr, .LBB2_6
; CHECK-NEXT: .LBB2_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp.w r8, #3
; CHECK-NEXT: blo .LBB2_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r3, r12
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB2_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r7, r1, r12
; CHECK-NEXT: add.w r6, r0, r12
; CHECK-NEXT: add.w r5, r2, r12
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vldr s0, [r7]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5]
; CHECK-NEXT: vldr s0, [r7, #4]
; CHECK-NEXT: vldr s2, [r6, #4]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #4]
; CHECK-NEXT: vldr s0, [r7, #8]
; CHECK-NEXT: vldr s2, [r6, #8]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #8]
; CHECK-NEXT: vldr s0, [r7, #12]
; CHECK-NEXT: vldr s2, [r6, #12]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #12]
; CHECK-NEXT: bne .LBB2_9
; CHECK-NEXT: .LBB2_10:
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .LBB2_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB2_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vsub.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB2_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB2_4
; CHECK-NEXT: b .LBB2_10
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck

; Scalar entry: %i.09.ph is 0, or %n.vec when the vector loop already ran.
for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %0 = xor i32 %i.09.ph, -1
  %1 = add i32 %0, %N
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

; Prologue: (N & 3) scalar iterations, counted down in %prol.iter.
for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
  %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
  %2 = load float, ptr %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
  %3 = load float, ptr %arrayidx1.prol, align 4
  %sub.prol = fsub float %2, %3
  %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
  store float %sub.prol, ptr %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

; %1 = N - %i.09.ph - 1; when it is below 3 the unrolled loop is skipped.
for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

; Runtime alias check: c+N > a && a+N > c, or c+N > b && b+N > c.
vector.memcheck: ; preds = %for.body.preheader
  %scevgep = getelementptr float, ptr %c, i32 %N
  %scevgep13 = getelementptr float, ptr %a, i32 %N
  %scevgep16 = getelementptr float, ptr %b, i32 %N
  %bound0 = icmp ugt ptr %scevgep13, %c
  %bound1 = icmp ugt ptr %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  %bound018 = icmp ugt ptr %scevgep16, %c
  %bound119 = icmp ugt ptr %scevgep, %b
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph

; %n.vec = N with its two low bits cleared (vector trip count in elements).
vector.ph: ; preds = %vector.memcheck
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, ptr %a, i32 %index
  %wide.load = load <4 x float>, ptr %5, align 4
  %6 = getelementptr inbounds float, ptr %b, i32 %index
  %wide.load21 = load <4 x float>, ptr %6, align 4
  %7 = fsub <4 x float> %wide.load, %wide.load21
  %8 = getelementptr inbounds float, ptr %c, i32 %index
  store <4 x float> %7, ptr %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %middle.block, label %vector.body

; Leftover elements (N != %n.vec) are finished by the scalar path.
middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22

for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

; Scalar loop unrolled by four; exits when %inc.3 reaches N.
for.body: ; preds = %for.body.prol.loopexit, %for.body
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
  %10 = load float, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
  %11 = load float, ptr %arrayidx1, align 4
  %sub = fsub float %10, %11
  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
  store float %sub, ptr %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
  %12 = load float, ptr %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
  %13 = load float, ptr %arrayidx1.1, align 4
  %sub.1 = fsub float %12, %13
  %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
  store float %sub.1, ptr %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
  %14 = load float, ptr %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
  %15 = load float, ptr %arrayidx1.2, align 4
  %sub.2 = fsub float %14, %15
  %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
  store float %sub.2, ptr %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
  %16 = load float, ptr %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
  %17 = load float, ptr %arrayidx1.3, align 4
  %sub.3 = fsub float %16, %17
  %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
  store float %sub.3, ptr %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %exitcond.3 = icmp eq i32 %inc.3, %N
  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}
|
|
|
|
define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
|
|
; CHECK-LABEL: float_int_mul:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r3, #0
|
|
; CHECK-NEXT: it eq
|
|
; CHECK-NEXT: bxeq lr
|
|
; CHECK-NEXT: .LBB3_1: @ %for.body.preheader
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
|
|
; CHECK-NEXT: cmp r3, #3
|
|
; CHECK-NEXT: bls .LBB3_6
|
|
; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
|
|
; CHECK-NEXT: add.w r7, r0, r3, lsl #2
|
|
; CHECK-NEXT: cmp r7, r2
|
|
; CHECK-NEXT: itt hi
|
|
; CHECK-NEXT: addhi.w r7, r2, r3, lsl #2
|
|
; CHECK-NEXT: cmphi r7, r0
|
|
; CHECK-NEXT: bhi .LBB3_6
|
|
; CHECK-NEXT: @ %bb.3: @ %vector.ph
|
|
; CHECK-NEXT: bic r12, r3, #3
|
|
; CHECK-NEXT: movs r6, #1
|
|
; CHECK-NEXT: sub.w r7, r12, #4
|
|
; CHECK-NEXT: mov r4, r0
|
|
; CHECK-NEXT: mov r5, r1
|
|
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
|
|
; CHECK-NEXT: mov r6, r2
|
|
; CHECK-NEXT: .LBB3_4: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
|
|
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
|
|
; CHECK-NEXT: vcvt.f32.s32 q0, q0
|
|
; CHECK-NEXT: vmul.f32 q0, q1, q0
|
|
; CHECK-NEXT: vstrb.8 q0, [r6], #16
|
|
; CHECK-NEXT: le lr, .LBB3_4
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
; CHECK-NEXT: cmp r12, r3
|
|
; CHECK-NEXT: bne .LBB3_7
|
|
; CHECK-NEXT: b .LBB3_13
|
|
; CHECK-NEXT: .LBB3_6:
|
|
; CHECK-NEXT: mov.w r12, #0
|
|
; CHECK-NEXT: .LBB3_7: @ %for.body.preheader16
|
|
; CHECK-NEXT: mvn.w r7, r12
|
|
; CHECK-NEXT: add.w r8, r7, r3
|
|
; CHECK-NEXT: and r5, r3, #3
|
|
; CHECK-NEXT: wls lr, r5, .LBB3_10
|
|
; CHECK-NEXT: @ %bb.8: @ %for.body.prol.preheader
|
|
; CHECK-NEXT: add.w r4, r12, r5
|
|
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
|
|
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
|
|
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
|
|
; CHECK-NEXT: mov r12, r4
|
|
; CHECK-NEXT: .LBB3_9: @ %for.body.prol
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: ldr r4, [r6], #4
|
|
; CHECK-NEXT: vldmia r5!, {s2}
|
|
; CHECK-NEXT: vmov s0, r4
|
|
; CHECK-NEXT: vcvt.f32.s32 s0, s0
|
|
; CHECK-NEXT: vmul.f32 s0, s2, s0
|
|
; CHECK-NEXT: vstmia r7!, {s0}
|
|
; CHECK-NEXT: le lr, .LBB3_9
|
|
; CHECK-NEXT: .LBB3_10: @ %for.body.prol.loopexit
|
|
; CHECK-NEXT: cmp.w r8, #3
|
|
; CHECK-NEXT: blo .LBB3_13
|
|
; CHECK-NEXT: @ %bb.11: @ %for.body.preheader1
|
|
; CHECK-NEXT: add.w r1, r1, r12, lsl #2
|
|
; CHECK-NEXT: sub.w r3, r3, r12
|
|
; CHECK-NEXT: adds r1, #8
|
|
; CHECK-NEXT: lsl.w r12, r12, #2
|
|
; CHECK-NEXT: .LBB3_12: @ %for.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldr s0, [r1, #-8]
|
|
; CHECK-NEXT: add.w r7, r0, r12
|
|
; CHECK-NEXT: add.w r6, r2, r12
|
|
; CHECK-NEXT: adds r0, #16
|
|
; CHECK-NEXT: vcvt.f32.s32 s0, s0
|
|
; CHECK-NEXT: vldr s2, [r7]
|
|
; CHECK-NEXT: adds r2, #16
|
|
; CHECK-NEXT: subs r3, #4
|
|
; CHECK-NEXT: vmul.f32 s0, s2, s0
|
|
; CHECK-NEXT: vstr s0, [r6]
|
|
; CHECK-NEXT: vldr s0, [r1, #-4]
|
|
; CHECK-NEXT: vldr s2, [r7, #4]
|
|
; CHECK-NEXT: vcvt.f32.s32 s0, s0
|
|
; CHECK-NEXT: vmul.f32 s0, s2, s0
|
|
; CHECK-NEXT: vstr s0, [r6, #4]
|
|
; CHECK-NEXT: vldr s0, [r1]
|
|
; CHECK-NEXT: vldr s2, [r7, #8]
|
|
; CHECK-NEXT: vcvt.f32.s32 s0, s0
|
|
; CHECK-NEXT: vmul.f32 s0, s2, s0
|
|
; CHECK-NEXT: vstr s0, [r6, #8]
|
|
; CHECK-NEXT: vldr s0, [r1, #4]
|
|
; CHECK-NEXT: add.w r1, r1, #16
|
|
; CHECK-NEXT: vldr s2, [r7, #12]
|
|
; CHECK-NEXT: vcvt.f32.s32 s0, s0
|
|
; CHECK-NEXT: vmul.f32 s0, s2, s0
|
|
; CHECK-NEXT: vstr s0, [r6, #12]
|
|
; CHECK-NEXT: bne .LBB3_12
|
|
; CHECK-NEXT: .LBB3_13:
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr}
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%cmp8 = icmp eq i32 %N, 0
|
|
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
%min.iters.check = icmp ult i32 %N, 4
|
|
br i1 %min.iters.check, label %for.body.preheader16, label %vector.memcheck
|
|
|
|
for.body.preheader16: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
|
|
%i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
%0 = xor i32 %i.09.ph, -1
|
|
%1 = add i32 %0, %N
|
|
%xtraiter = and i32 %N, 3
|
|
%lcmp.mod = icmp eq i32 %xtraiter, 0
|
|
br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
|
|
|
|
for.body.prol: ; preds = %for.body.preheader16, %for.body.prol
|
|
%i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader16 ]
|
|
%prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader16 ]
|
|
%arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
|
|
%2 = load float, ptr %arrayidx.prol, align 4
|
|
%arrayidx1.prol = getelementptr inbounds i32, ptr %b, i32 %i.09.prol
|
|
%3 = load i32, ptr %arrayidx1.prol, align 4
|
|
%conv.prol = sitofp i32 %3 to float
|
|
%mul.prol = fmul float %2, %conv.prol
|
|
%arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
|
|
store float %mul.prol, ptr %arrayidx2.prol, align 4
|
|
%inc.prol = add nuw i32 %i.09.prol, 1
|
|
%prol.iter.sub = add i32 %prol.iter, -1
|
|
%prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
|
|
br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
|
|
|
|
for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader16
|
|
%i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader16 ], [ %inc.prol, %for.body.prol ]
|
|
%4 = icmp ult i32 %1, 3
|
|
br i1 %4, label %for.cond.cleanup, label %for.body
|
|
|
|
vector.memcheck: ; preds = %for.body.preheader
|
|
%scevgep = getelementptr float, ptr %c, i32 %N
|
|
%scevgep13 = getelementptr float, ptr %a, i32 %N
|
|
%bound0 = icmp ugt ptr %scevgep13, %c
|
|
%bound1 = icmp ugt ptr %scevgep, %a
|
|
%found.conflict = and i1 %bound0, %bound1
|
|
br i1 %found.conflict, label %for.body.preheader16, label %vector.ph
|
|
|
|
vector.ph: ; preds = %vector.memcheck
|
|
%n.vec = and i32 %N, -4
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%5 = getelementptr inbounds float, ptr %a, i32 %index
|
|
%wide.load = load <4 x float>, ptr %5, align 4
|
|
%6 = getelementptr inbounds i32, ptr %b, i32 %index
|
|
%wide.load15 = load <4 x i32>, ptr %6, align 4
|
|
%7 = sitofp <4 x i32> %wide.load15 to <4 x float>
|
|
%8 = fmul <4 x float> %wide.load, %7
|
|
%9 = getelementptr inbounds float, ptr %c, i32 %index
|
|
store <4 x float> %8, ptr %9, align 4
|
|
%index.next = add i32 %index, 4
|
|
%10 = icmp eq i32 %index.next, %n.vec
|
|
br i1 %10, label %middle.block, label %vector.body
|
|
|
|
middle.block: ; preds = %vector.body
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader16
|
|
|
|
for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
|
|
ret void
|
|
|
|
for.body: ; preds = %for.body.prol.loopexit, %for.body
|
|
%i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
|
|
%arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
|
|
%11 = load float, ptr %arrayidx, align 4
|
|
%arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.09
|
|
%12 = load i32, ptr %arrayidx1, align 4
|
|
%conv = sitofp i32 %12 to float
|
|
%mul = fmul float %11, %conv
|
|
%arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
|
|
store float %mul, ptr %arrayidx2, align 4
|
|
%inc = add nuw i32 %i.09, 1
|
|
%arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
|
|
%13 = load float, ptr %arrayidx.1, align 4
|
|
%arrayidx1.1 = getelementptr inbounds i32, ptr %b, i32 %inc
|
|
%14 = load i32, ptr %arrayidx1.1, align 4
|
|
%conv.1 = sitofp i32 %14 to float
|
|
%mul.1 = fmul float %13, %conv.1
|
|
%arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
|
|
store float %mul.1, ptr %arrayidx2.1, align 4
|
|
%inc.1 = add nuw i32 %i.09, 2
|
|
%arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
|
|
%15 = load float, ptr %arrayidx.2, align 4
|
|
%arrayidx1.2 = getelementptr inbounds i32, ptr %b, i32 %inc.1
|
|
%16 = load i32, ptr %arrayidx1.2, align 4
|
|
%conv.2 = sitofp i32 %16 to float
|
|
%mul.2 = fmul float %15, %conv.2
|
|
%arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
|
|
store float %mul.2, ptr %arrayidx2.2, align 4
|
|
%inc.2 = add nuw i32 %i.09, 3
|
|
%arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
|
|
%17 = load float, ptr %arrayidx.3, align 4
|
|
%arrayidx1.3 = getelementptr inbounds i32, ptr %b, i32 %inc.2
|
|
%18 = load i32, ptr %arrayidx1.3, align 4
|
|
%conv.3 = sitofp i32 %18 to float
|
|
%mul.3 = fmul float %17, %conv.3
|
|
%arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
|
|
store float %mul.3, ptr %arrayidx2.3, align 4
|
|
%inc.3 = add nuw i32 %i.09, 4
|
|
%exitcond.3 = icmp eq i32 %inc.3, %N
|
|
br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
|
|
}
|
|
|
|
; float_int_int_mul: for each i in [0, N), c[i] = sitofp(b[i] * a[i]), where
; %a and %b are read as i32 and %c is written as float. The IR below is already
; vectorized by 4 with a scalar remainder loop; the assertions that follow pin
; the expected llc output for both loops.
define arm_aapcs_vfpcc void @float_int_int_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
|
|
; CHECK-LABEL: float_int_int_mul:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: push {r4, r5, r6, lr}
|
|
; CHECK-NEXT: cbz r3, .LBB4_8
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
; CHECK-NEXT: cmp r3, #3
|
|
; CHECK-NEXT: bhi .LBB4_3
|
|
; CHECK-NEXT: @ %bb.2:
|
|
; CHECK-NEXT: mov.w r12, #0
|
|
; CHECK-NEXT: b .LBB4_6
|
|
; CHECK-NEXT: .LBB4_3: @ %vector.ph
|
|
; CHECK-NEXT: bic r12, r3, #3
|
|
; CHECK-NEXT: movs r5, #1
|
|
; CHECK-NEXT: sub.w r6, r12, #4
|
|
; CHECK-NEXT: mov r4, r0
|
|
; CHECK-NEXT: add.w lr, r5, r6, lsr #2
|
|
; CHECK-NEXT: mov r5, r1
|
|
; CHECK-NEXT: mov r6, r2
|
|
; CHECK-NEXT: .LBB4_4: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r4], #16
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5], #16
|
|
; CHECK-NEXT: vmul.i32 q0, q1, q0
|
|
; CHECK-NEXT: vcvt.f32.s32 q0, q0
|
|
; CHECK-NEXT: vstrb.8 q0, [r6], #16
|
|
; CHECK-NEXT: le lr, .LBB4_4
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
; CHECK-NEXT: cmp r12, r3
|
|
; CHECK-NEXT: it eq
|
|
; CHECK-NEXT: popeq {r4, r5, r6, pc}
|
|
; CHECK-NEXT: .LBB4_6: @ %for.body.preheader11
|
|
; CHECK-NEXT: sub.w lr, r3, r12
|
|
; CHECK-NEXT: add.w r0, r0, r12, lsl #2
|
|
; CHECK-NEXT: add.w r1, r1, r12, lsl #2
|
|
; CHECK-NEXT: add.w r2, r2, r12, lsl #2
|
|
; CHECK-NEXT: .LBB4_7: @ %for.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: ldr r3, [r0], #4
|
|
; CHECK-NEXT: ldr r6, [r1], #4
|
|
; CHECK-NEXT: muls r3, r6, r3
|
|
; CHECK-NEXT: vmov s0, r3
|
|
; CHECK-NEXT: vcvt.f32.s32 s0, s0
|
|
; CHECK-NEXT: vstmia r2!, {s0}
|
|
; CHECK-NEXT: le lr, .LBB4_7
|
|
; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup
|
|
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
; N == 0: nothing to do, branch straight to the return block.
entry:
|
|
%cmp8 = icmp eq i32 %N, 0
|
|
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
; Fewer than 4 elements: skip the vector loop and run the scalar loop from 0.
for.body.preheader: ; preds = %entry
|
|
%min.iters.check = icmp ult i32 %N, 4
|
|
br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
|
|
|
|
; Scalar loop start index: 0 if the vector loop was skipped, otherwise %n.vec.
for.body.preheader11: ; preds = %middle.block, %for.body.preheader
|
|
%i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
br label %for.body
|
|
|
|
; %n.vec = N with its two low bits cleared, i.e. N rounded down to a multiple of 4.
vector.ph: ; preds = %for.body.preheader
|
|
%n.vec = and i32 %N, -4
|
|
br label %vector.body
|
|
|
|
; Vector loop: load 4 x i32 from %a and %b, multiply, convert to 4 x float and
; store to %c; %index advances by 4 until it equals %n.vec.
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%0 = getelementptr inbounds i32, ptr %a, i32 %index
|
|
%wide.load = load <4 x i32>, ptr %0, align 4
|
|
%1 = getelementptr inbounds i32, ptr %b, i32 %index
|
|
%wide.load10 = load <4 x i32>, ptr %1, align 4
|
|
%2 = mul nsw <4 x i32> %wide.load10, %wide.load
|
|
%3 = sitofp <4 x i32> %2 to <4 x float>
|
|
%4 = getelementptr inbounds float, ptr %c, i32 %index
|
|
store <4 x float> %3, ptr %4, align 4
|
|
%index.next = add i32 %index, 4
|
|
%5 = icmp eq i32 %index.next, %n.vec
|
|
br i1 %5, label %middle.block, label %vector.body
|
|
|
|
; Finished if N was a multiple of 4; otherwise the scalar loop handles the tail.
middle.block: ; preds = %vector.body
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
ret void
|
|
|
|
; Scalar loop: one element per iteration until %inc reaches %N.
for.body: ; preds = %for.body.preheader11, %for.body
|
|
%i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
|
|
%arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.09
|
|
%6 = load i32, ptr %arrayidx, align 4
|
|
%arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.09
|
|
%7 = load i32, ptr %arrayidx1, align 4
|
|
%mul = mul nsw i32 %7, %6
|
|
%conv = sitofp i32 %mul to float
|
|
%arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
|
|
store float %conv, ptr %arrayidx2, align 4
|
|
%inc = add nuw i32 %i.09, 1
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
}
|
|
|
|
; half_half_mul: for each i in [0, N), c[i] = fpext(a[i] * b[i]), where %a and
; %b are read as half, the multiply is done in half, and %c is written as
; float. The IR is already vectorized by 4 with a scalar remainder loop; the
; assertions that follow pin the expected llc output.
define arm_aapcs_vfpcc void @half_half_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
|
|
; CHECK-LABEL: half_half_mul:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
|
|
; CHECK-NEXT: cmp r3, #0
|
|
; CHECK-NEXT: beq .LBB5_8
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
; CHECK-NEXT: cmp r3, #3
|
|
; CHECK-NEXT: bhi .LBB5_3
|
|
; CHECK-NEXT: @ %bb.2:
|
|
; CHECK-NEXT: mov.w r12, #0
|
|
; CHECK-NEXT: b .LBB5_6
|
|
; CHECK-NEXT: .LBB5_3: @ %vector.ph
|
|
; CHECK-NEXT: bic r12, r3, #3
|
|
; CHECK-NEXT: movs r5, #1
|
|
; CHECK-NEXT: sub.w r6, r12, #4
|
|
; CHECK-NEXT: mov r4, r0
|
|
; CHECK-NEXT: add.w lr, r5, r6, lsr #2
|
|
; CHECK-NEXT: mov r5, r1
|
|
; CHECK-NEXT: mov r6, r2
|
|
; CHECK-NEXT: .LBB5_4: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: ldr.w r9, [r4]
|
|
; CHECK-NEXT: ldr r7, [r5]
|
|
; CHECK-NEXT: ldr.w r8, [r4, #4]
|
|
; CHECK-NEXT: vmov.32 q0[0], r9
|
|
; CHECK-NEXT: ldr.w r10, [r5, #4]
|
|
; CHECK-NEXT: vmov.32 q1[0], r7
|
|
; CHECK-NEXT: vmov.32 q0[1], r8
|
|
; CHECK-NEXT: adds r4, #8
|
|
; CHECK-NEXT: vmov.32 q1[1], r10
|
|
; CHECK-NEXT: adds r5, #8
|
|
; CHECK-NEXT: vmul.f16 q0, q0, q1
|
|
; CHECK-NEXT: vcvtt.f32.f16 s3, s1
|
|
; CHECK-NEXT: vcvtb.f32.f16 s2, s1
|
|
; CHECK-NEXT: vcvtt.f32.f16 s1, s0
|
|
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
|
|
; CHECK-NEXT: vstrb.8 q0, [r6], #16
|
|
; CHECK-NEXT: le lr, .LBB5_4
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
; CHECK-NEXT: cmp r12, r3
|
|
; CHECK-NEXT: beq .LBB5_8
|
|
; CHECK-NEXT: .LBB5_6: @ %for.body.preheader11
|
|
; CHECK-NEXT: sub.w lr, r3, r12
|
|
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
|
|
; CHECK-NEXT: add.w r1, r1, r12, lsl #1
|
|
; CHECK-NEXT: add.w r2, r2, r12, lsl #2
|
|
; CHECK-NEXT: .LBB5_7: @ %for.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldr.16 s0, [r1]
|
|
; CHECK-NEXT: vldr.16 s2, [r0]
|
|
; CHECK-NEXT: adds r0, #2
|
|
; CHECK-NEXT: adds r1, #2
|
|
; CHECK-NEXT: vmul.f16 s0, s2, s0
|
|
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
|
|
; CHECK-NEXT: vstmia r2!, {s0}
|
|
; CHECK-NEXT: le lr, .LBB5_7
|
|
; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
|
|
; N == 0: nothing to do, branch straight to the return block.
entry:
|
|
%cmp8 = icmp eq i32 %N, 0
|
|
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
; Fewer than 4 elements: skip the vector loop and run the scalar loop from 0.
for.body.preheader: ; preds = %entry
|
|
%min.iters.check = icmp ult i32 %N, 4
|
|
br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
|
|
|
|
; Scalar loop start index: 0 if the vector loop was skipped, otherwise %n.vec.
for.body.preheader11: ; preds = %middle.block, %for.body.preheader
|
|
%i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
br label %for.body
|
|
|
|
; %n.vec = N with its two low bits cleared, i.e. N rounded down to a multiple of 4.
vector.ph: ; preds = %for.body.preheader
|
|
%n.vec = and i32 %N, -4
|
|
br label %vector.body
|
|
|
|
; Vector loop: load 4 x half (align 2) from %a and %b, multiply in half, extend
; to 4 x float and store to %c; %index advances by 4 until it equals %n.vec.
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%0 = getelementptr inbounds half, ptr %a, i32 %index
|
|
%wide.load = load <4 x half>, ptr %0, align 2
|
|
%1 = getelementptr inbounds half, ptr %b, i32 %index
|
|
%wide.load10 = load <4 x half>, ptr %1, align 2
|
|
%2 = fmul <4 x half> %wide.load, %wide.load10
|
|
%3 = fpext <4 x half> %2 to <4 x float>
|
|
%4 = getelementptr inbounds float, ptr %c, i32 %index
|
|
store <4 x float> %3, ptr %4, align 4
|
|
%index.next = add i32 %index, 4
|
|
%5 = icmp eq i32 %index.next, %n.vec
|
|
br i1 %5, label %middle.block, label %vector.body
|
|
|
|
; Finished if N was a multiple of 4; otherwise the scalar loop handles the tail.
middle.block: ; preds = %vector.body
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
ret void
|
|
|
|
; Scalar loop: one element per iteration until %inc reaches %N.
for.body: ; preds = %for.body.preheader11, %for.body
|
|
%i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
|
|
%arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09
|
|
%6 = load half, ptr %arrayidx, align 2
|
|
%arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09
|
|
%7 = load half, ptr %arrayidx1, align 2
|
|
%mul = fmul half %6, %7
|
|
%conv = fpext half %mul to float
|
|
%arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
|
|
store float %conv, ptr %arrayidx2, align 4
|
|
%inc = add nuw i32 %i.09, 1
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
}
|
|
|
|
; half_half_add: for each i in [0, N), c[i] = fpext(a[i] + b[i]), where %a and
; %b are read as half, the add is done in half, and %c is written as float.
; The IR is already vectorized by 4 with a scalar remainder loop; the
; assertions that follow pin the expected llc output.
define arm_aapcs_vfpcc void @half_half_add(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
|
|
; CHECK-LABEL: half_half_add:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
|
|
; CHECK-NEXT: cmp r3, #0
|
|
; CHECK-NEXT: beq .LBB6_8
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
; CHECK-NEXT: cmp r3, #3
|
|
; CHECK-NEXT: bhi .LBB6_3
|
|
; CHECK-NEXT: @ %bb.2:
|
|
; CHECK-NEXT: mov.w r12, #0
|
|
; CHECK-NEXT: b .LBB6_6
|
|
; CHECK-NEXT: .LBB6_3: @ %vector.ph
|
|
; CHECK-NEXT: bic r12, r3, #3
|
|
; CHECK-NEXT: movs r5, #1
|
|
; CHECK-NEXT: sub.w r6, r12, #4
|
|
; CHECK-NEXT: mov r4, r0
|
|
; CHECK-NEXT: add.w lr, r5, r6, lsr #2
|
|
; CHECK-NEXT: mov r5, r1
|
|
; CHECK-NEXT: mov r6, r2
|
|
; CHECK-NEXT: .LBB6_4: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: ldr.w r9, [r4]
|
|
; CHECK-NEXT: ldr r7, [r5]
|
|
; CHECK-NEXT: ldr.w r8, [r4, #4]
|
|
; CHECK-NEXT: vmov.32 q0[0], r9
|
|
; CHECK-NEXT: ldr.w r10, [r5, #4]
|
|
; CHECK-NEXT: vmov.32 q1[0], r7
|
|
; CHECK-NEXT: vmov.32 q0[1], r8
|
|
; CHECK-NEXT: adds r4, #8
|
|
; CHECK-NEXT: vmov.32 q1[1], r10
|
|
; CHECK-NEXT: adds r5, #8
|
|
; CHECK-NEXT: vadd.f16 q0, q0, q1
|
|
; CHECK-NEXT: vcvtt.f32.f16 s3, s1
|
|
; CHECK-NEXT: vcvtb.f32.f16 s2, s1
|
|
; CHECK-NEXT: vcvtt.f32.f16 s1, s0
|
|
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
|
|
; CHECK-NEXT: vstrb.8 q0, [r6], #16
|
|
; CHECK-NEXT: le lr, .LBB6_4
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
; CHECK-NEXT: cmp r12, r3
|
|
; CHECK-NEXT: beq .LBB6_8
|
|
; CHECK-NEXT: .LBB6_6: @ %for.body.preheader11
|
|
; CHECK-NEXT: sub.w lr, r3, r12
|
|
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
|
|
; CHECK-NEXT: add.w r1, r1, r12, lsl #1
|
|
; CHECK-NEXT: add.w r2, r2, r12, lsl #2
|
|
; CHECK-NEXT: .LBB6_7: @ %for.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldr.16 s0, [r1]
|
|
; CHECK-NEXT: vldr.16 s2, [r0]
|
|
; CHECK-NEXT: adds r0, #2
|
|
; CHECK-NEXT: adds r1, #2
|
|
; CHECK-NEXT: vadd.f16 s0, s2, s0
|
|
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
|
|
; CHECK-NEXT: vstmia r2!, {s0}
|
|
; CHECK-NEXT: le lr, .LBB6_7
|
|
; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
|
|
; N == 0: nothing to do, branch straight to the return block.
entry:
|
|
%cmp8 = icmp eq i32 %N, 0
|
|
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
; Fewer than 4 elements: skip the vector loop and run the scalar loop from 0.
for.body.preheader: ; preds = %entry
|
|
%min.iters.check = icmp ult i32 %N, 4
|
|
br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
|
|
|
|
; Scalar loop start index: 0 if the vector loop was skipped, otherwise %n.vec.
for.body.preheader11: ; preds = %middle.block, %for.body.preheader
|
|
%i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
br label %for.body
|
|
|
|
; %n.vec = N with its two low bits cleared, i.e. N rounded down to a multiple of 4.
vector.ph: ; preds = %for.body.preheader
|
|
%n.vec = and i32 %N, -4
|
|
br label %vector.body
|
|
|
|
; Vector loop: load 4 x half (align 2) from %a and %b, add in half, extend to
; 4 x float and store to %c; %index advances by 4 until it equals %n.vec.
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%0 = getelementptr inbounds half, ptr %a, i32 %index
|
|
%wide.load = load <4 x half>, ptr %0, align 2
|
|
%1 = getelementptr inbounds half, ptr %b, i32 %index
|
|
%wide.load10 = load <4 x half>, ptr %1, align 2
|
|
%2 = fadd <4 x half> %wide.load, %wide.load10
|
|
%3 = fpext <4 x half> %2 to <4 x float>
|
|
%4 = getelementptr inbounds float, ptr %c, i32 %index
|
|
store <4 x float> %3, ptr %4, align 4
|
|
%index.next = add i32 %index, 4
|
|
%5 = icmp eq i32 %index.next, %n.vec
|
|
br i1 %5, label %middle.block, label %vector.body
|
|
|
|
; Finished if N was a multiple of 4; otherwise the scalar loop handles the tail.
middle.block: ; preds = %vector.body
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
ret void
|
|
|
|
; Scalar loop: one element per iteration until %inc reaches %N.
for.body: ; preds = %for.body.preheader11, %for.body
|
|
%i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
|
|
%arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09
|
|
%6 = load half, ptr %arrayidx, align 2
|
|
%arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09
|
|
%7 = load half, ptr %arrayidx1, align 2
|
|
%add = fadd half %6, %7
|
|
%conv = fpext half %add to float
|
|
%arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
|
|
store float %conv, ptr %arrayidx2, align 4
|
|
%inc = add nuw i32 %i.09, 1
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
}
|
|
|
|
; half_half_sub: for each i in [0, N), c[i] = fpext(a[i] - b[i]), where %a and
; %b are read as half, the subtract is done in half, and %c is written as
; float. The IR is already vectorized by 4 with a scalar remainder loop; the
; assertions that follow pin the expected llc output.
define arm_aapcs_vfpcc void @half_half_sub(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
|
|
; CHECK-LABEL: half_half_sub:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
|
|
; CHECK-NEXT: cmp r3, #0
|
|
; CHECK-NEXT: beq .LBB7_8
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
; CHECK-NEXT: cmp r3, #3
|
|
; CHECK-NEXT: bhi .LBB7_3
|
|
; CHECK-NEXT: @ %bb.2:
|
|
; CHECK-NEXT: mov.w r12, #0
|
|
; CHECK-NEXT: b .LBB7_6
|
|
; CHECK-NEXT: .LBB7_3: @ %vector.ph
|
|
; CHECK-NEXT: bic r12, r3, #3
|
|
; CHECK-NEXT: movs r5, #1
|
|
; CHECK-NEXT: sub.w r6, r12, #4
|
|
; CHECK-NEXT: mov r4, r0
|
|
; CHECK-NEXT: add.w lr, r5, r6, lsr #2
|
|
; CHECK-NEXT: mov r5, r1
|
|
; CHECK-NEXT: mov r6, r2
|
|
; CHECK-NEXT: .LBB7_4: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: ldr.w r9, [r4]
|
|
; CHECK-NEXT: ldr r7, [r5]
|
|
; CHECK-NEXT: ldr.w r8, [r4, #4]
|
|
; CHECK-NEXT: vmov.32 q0[0], r9
|
|
; CHECK-NEXT: ldr.w r10, [r5, #4]
|
|
; CHECK-NEXT: vmov.32 q1[0], r7
|
|
; CHECK-NEXT: vmov.32 q0[1], r8
|
|
; CHECK-NEXT: adds r4, #8
|
|
; CHECK-NEXT: vmov.32 q1[1], r10
|
|
; CHECK-NEXT: adds r5, #8
|
|
; CHECK-NEXT: vsub.f16 q0, q0, q1
|
|
; CHECK-NEXT: vcvtt.f32.f16 s3, s1
|
|
; CHECK-NEXT: vcvtb.f32.f16 s2, s1
|
|
; CHECK-NEXT: vcvtt.f32.f16 s1, s0
|
|
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
|
|
; CHECK-NEXT: vstrb.8 q0, [r6], #16
|
|
; CHECK-NEXT: le lr, .LBB7_4
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
; CHECK-NEXT: cmp r12, r3
|
|
; CHECK-NEXT: beq .LBB7_8
|
|
; CHECK-NEXT: .LBB7_6: @ %for.body.preheader11
|
|
; CHECK-NEXT: sub.w lr, r3, r12
|
|
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
|
|
; CHECK-NEXT: add.w r1, r1, r12, lsl #1
|
|
; CHECK-NEXT: add.w r2, r2, r12, lsl #2
|
|
; CHECK-NEXT: .LBB7_7: @ %for.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldr.16 s0, [r1]
|
|
; CHECK-NEXT: vldr.16 s2, [r0]
|
|
; CHECK-NEXT: adds r0, #2
|
|
; CHECK-NEXT: adds r1, #2
|
|
; CHECK-NEXT: vsub.f16 s0, s2, s0
|
|
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
|
|
; CHECK-NEXT: vstmia r2!, {s0}
|
|
; CHECK-NEXT: le lr, .LBB7_7
|
|
; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
|
|
; N == 0: nothing to do, branch straight to the return block.
entry:
|
|
%cmp8 = icmp eq i32 %N, 0
|
|
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
; Fewer than 4 elements: skip the vector loop and run the scalar loop from 0.
for.body.preheader: ; preds = %entry
|
|
%min.iters.check = icmp ult i32 %N, 4
|
|
br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
|
|
|
|
; Scalar loop start index: 0 if the vector loop was skipped, otherwise %n.vec.
for.body.preheader11: ; preds = %middle.block, %for.body.preheader
|
|
%i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
br label %for.body
|
|
|
|
; %n.vec = N with its two low bits cleared, i.e. N rounded down to a multiple of 4.
vector.ph: ; preds = %for.body.preheader
|
|
%n.vec = and i32 %N, -4
|
|
br label %vector.body
|
|
|
|
; Vector loop: load 4 x half (align 2) from %a and %b, subtract in half, extend
; to 4 x float and store to %c; %index advances by 4 until it equals %n.vec.
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%0 = getelementptr inbounds half, ptr %a, i32 %index
|
|
%wide.load = load <4 x half>, ptr %0, align 2
|
|
%1 = getelementptr inbounds half, ptr %b, i32 %index
|
|
%wide.load10 = load <4 x half>, ptr %1, align 2
|
|
%2 = fsub <4 x half> %wide.load, %wide.load10
|
|
%3 = fpext <4 x half> %2 to <4 x float>
|
|
%4 = getelementptr inbounds float, ptr %c, i32 %index
|
|
store <4 x float> %3, ptr %4, align 4
|
|
%index.next = add i32 %index, 4
|
|
%5 = icmp eq i32 %index.next, %n.vec
|
|
br i1 %5, label %middle.block, label %vector.body
|
|
|
|
; Finished if N was a multiple of 4; otherwise the scalar loop handles the tail.
middle.block: ; preds = %vector.body
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
ret void
|
|
|
|
; Scalar loop: one element per iteration until %inc reaches %N.
for.body: ; preds = %for.body.preheader11, %for.body
|
|
%i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
|
|
%arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09
|
|
%6 = load half, ptr %arrayidx, align 2
|
|
%arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09
|
|
%7 = load half, ptr %arrayidx1, align 2
|
|
%sub = fsub half %6, %7
|
|
%conv = fpext half %sub to float
|
|
%arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
|
|
store float %conv, ptr %arrayidx2, align 4
|
|
%inc = add nuw i32 %i.09, 1
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
}
|
|
|
|
; half_short_mul: for each i in [0, N), c[i] = fpext(a[i] * sitofp(b[i])),
; where %a is read as half, %b is read as i16 and converted to half, the
; multiply is done in half, and %c is written as float. The IR is already
; vectorized by 4 with a scalar remainder loop; the assertions that follow pin
; the expected llc output.
define arm_aapcs_vfpcc void @half_short_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
|
|
; CHECK-LABEL: half_short_mul:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
|
|
; CHECK-NEXT: sub sp, #16
|
|
; CHECK-NEXT: cmp r3, #0
|
|
; CHECK-NEXT: beq .LBB8_8
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
; CHECK-NEXT: mov r8, r2
|
|
; CHECK-NEXT: mov r9, r1
|
|
; CHECK-NEXT: cmp r3, #3
|
|
; CHECK-NEXT: bhi .LBB8_3
|
|
; CHECK-NEXT: @ %bb.2:
|
|
; CHECK-NEXT: mov.w r12, #0
|
|
; CHECK-NEXT: b .LBB8_6
|
|
; CHECK-NEXT: .LBB8_3: @ %vector.ph
|
|
; CHECK-NEXT: bic r12, r3, #3
|
|
; CHECK-NEXT: movs r6, #1
|
|
; CHECK-NEXT: sub.w r7, r12, #4
|
|
; CHECK-NEXT: mov r1, sp
|
|
; CHECK-NEXT: mov r5, r0
|
|
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
|
|
; CHECK-NEXT: mov r6, r9
|
|
; CHECK-NEXT: mov r7, r8
|
|
; CHECK-NEXT: .LBB8_4: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrh.u32 q0, [r6], #8
|
|
; CHECK-NEXT: ldr r4, [r5]
|
|
; CHECK-NEXT: ldr r2, [r5, #4]
|
|
; CHECK-NEXT: adds r5, #8
|
|
; CHECK-NEXT: vstrh.32 q0, [r1]
|
|
; CHECK-NEXT: vmov.32 q1[0], r4
|
|
; CHECK-NEXT: vldrw.u32 q0, [r1]
|
|
; CHECK-NEXT: vmov.32 q1[1], r2
|
|
; CHECK-NEXT: vcvt.f16.s16 q0, q0
|
|
; CHECK-NEXT: vmul.f16 q0, q1, q0
|
|
; CHECK-NEXT: vcvtt.f32.f16 s3, s1
|
|
; CHECK-NEXT: vcvtb.f32.f16 s2, s1
|
|
; CHECK-NEXT: vcvtt.f32.f16 s1, s0
|
|
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
|
|
; CHECK-NEXT: vstrb.8 q0, [r7], #16
|
|
; CHECK-NEXT: le lr, .LBB8_4
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
; CHECK-NEXT: cmp r12, r3
|
|
; CHECK-NEXT: beq .LBB8_8
|
|
; CHECK-NEXT: .LBB8_6: @ %for.body.preheader13
|
|
; CHECK-NEXT: sub.w lr, r3, r12
|
|
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
|
|
; CHECK-NEXT: add.w r1, r9, r12, lsl #1
|
|
; CHECK-NEXT: add.w r2, r8, r12, lsl #2
|
|
; CHECK-NEXT: .LBB8_7: @ %for.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: ldrsh r3, [r1], #2
|
|
; CHECK-NEXT: vldr.16 s0, [r0]
|
|
; CHECK-NEXT: adds r0, #2
|
|
; CHECK-NEXT: vmov s2, r3
|
|
; CHECK-NEXT: vcvt.f16.s32 s2, s2
|
|
; CHECK-NEXT: vmul.f16 s0, s0, s2
|
|
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
|
|
; CHECK-NEXT: vstmia r2!, {s0}
|
|
; CHECK-NEXT: le lr, .LBB8_7
|
|
; CHECK-NEXT: .LBB8_8: @ %for.cond.cleanup
|
|
; CHECK-NEXT: add sp, #16
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
|
|
; N == 0: nothing to do, branch straight to the return block.
entry:
|
|
%cmp10 = icmp eq i32 %N, 0
|
|
br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
; Fewer than 4 elements: skip the vector loop and run the scalar loop from 0.
for.body.preheader: ; preds = %entry
|
|
%min.iters.check = icmp ult i32 %N, 4
|
|
br i1 %min.iters.check, label %for.body.preheader13, label %vector.ph
|
|
|
|
; Scalar loop start index: 0 if the vector loop was skipped, otherwise %n.vec.
for.body.preheader13: ; preds = %middle.block, %for.body.preheader
|
|
%i.011.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
br label %for.body
|
|
|
|
; %n.vec = N with its two low bits cleared, i.e. N rounded down to a multiple of 4.
vector.ph: ; preds = %for.body.preheader
|
|
%n.vec = and i32 %N, -4
|
|
br label %vector.body
|
|
|
|
; Vector loop: load 4 x half from %a and 4 x i16 from %b, convert the i16
; lanes to half (signed), multiply in half, extend to 4 x float and store to
; %c; %index advances by 4 until it equals %n.vec.
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%0 = getelementptr inbounds half, ptr %a, i32 %index
|
|
%wide.load = load <4 x half>, ptr %0, align 2
|
|
%1 = getelementptr inbounds i16, ptr %b, i32 %index
|
|
%wide.load12 = load <4 x i16>, ptr %1, align 2
|
|
%2 = sitofp <4 x i16> %wide.load12 to <4 x half>
|
|
%3 = fmul <4 x half> %wide.load, %2
|
|
%4 = fpext <4 x half> %3 to <4 x float>
|
|
%5 = getelementptr inbounds float, ptr %c, i32 %index
|
|
store <4 x float> %4, ptr %5, align 4
|
|
%index.next = add i32 %index, 4
|
|
%6 = icmp eq i32 %index.next, %n.vec
|
|
br i1 %6, label %middle.block, label %vector.body
|
|
|
|
; Finished if N was a multiple of 4; otherwise the scalar loop handles the tail.
middle.block: ; preds = %vector.body
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
ret void
|
|
|
|
; Scalar loop: one element per iteration until %inc reaches %N.
for.body: ; preds = %for.body.preheader13, %for.body
|
|
%i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader13 ]
|
|
%arrayidx = getelementptr inbounds half, ptr %a, i32 %i.011
|
|
%7 = load half, ptr %arrayidx, align 2
|
|
%arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.011
|
|
%8 = load i16, ptr %arrayidx1, align 2
|
|
%conv2 = sitofp i16 %8 to half
|
|
%mul = fmul half %7, %conv2
|
|
%conv3 = fpext half %mul to float
|
|
%arrayidx4 = getelementptr inbounds float, ptr %c, i32 %i.011
|
|
store float %conv3, ptr %arrayidx4, align 4
|
|
%inc = add nuw i32 %i.011, 1
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
}
|
|
|
|
; half_half_mac: returns the float sum, starting from 0.0 and accumulated in
; index order, of fpext(a[i] * b[i]) for i in [0, N); %a and %b are read as
; half and the multiply is done in half. The IR is a scalar loop unrolled by 4
; (for.body) followed by an epilogue loop (for.body.epil) that runs the
; remaining N & 3 iterations; the assertions that follow pin the expected llc
; output.
define arm_aapcs_vfpcc float @half_half_mac(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
|
|
; CHECK-LABEL: half_half_mac:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: push {r4, r5, r7, lr}
|
|
; CHECK-NEXT: cbz r2, .LBB9_3
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
; CHECK-NEXT: subs r3, r2, #1
|
|
; CHECK-NEXT: and r12, r2, #3
|
|
; CHECK-NEXT: cmp r3, #3
|
|
; CHECK-NEXT: bhs .LBB9_4
|
|
; CHECK-NEXT: @ %bb.2:
|
|
; CHECK-NEXT: vldr s0, .LCPI9_0
|
|
; CHECK-NEXT: movs r2, #0
|
|
; CHECK-NEXT: b .LBB9_6
|
|
; CHECK-NEXT: .LBB9_3:
|
|
; CHECK-NEXT: vldr s0, .LCPI9_0
|
|
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
|
; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new
|
|
; CHECK-NEXT: bic r2, r2, #3
|
|
; CHECK-NEXT: movs r3, #1
|
|
; CHECK-NEXT: subs r2, #4
|
|
; CHECK-NEXT: vldr s0, .LCPI9_0
|
|
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
|
|
; CHECK-NEXT: movs r3, #0
|
|
; CHECK-NEXT: movs r2, #0
|
|
; CHECK-NEXT: .LBB9_5: @ %for.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: adds r5, r0, r3
|
|
; CHECK-NEXT: adds r4, r1, r3
|
|
; CHECK-NEXT: vldr.16 s2, [r4, #6]
|
|
; CHECK-NEXT: vldr.16 s4, [r5, #6]
|
|
; CHECK-NEXT: vldr.16 s6, [r5, #4]
|
|
; CHECK-NEXT: vldr.16 s8, [r5, #2]
|
|
; CHECK-NEXT: vmul.f16 s2, s4, s2
|
|
; CHECK-NEXT: vldr.16 s4, [r4, #4]
|
|
; CHECK-NEXT: vldr.16 s10, [r5]
|
|
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
|
|
; CHECK-NEXT: vmul.f16 s4, s6, s4
|
|
; CHECK-NEXT: vldr.16 s6, [r4, #2]
|
|
; CHECK-NEXT: vcvtb.f32.f16 s4, s4
|
|
; CHECK-NEXT: adds r3, #8
|
|
; CHECK-NEXT: vmul.f16 s6, s8, s6
|
|
; CHECK-NEXT: vldr.16 s8, [r4]
|
|
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
|
|
; CHECK-NEXT: adds r2, #4
|
|
; CHECK-NEXT: vmul.f16 s8, s10, s8
|
|
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s8
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s6
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s4
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s2
|
|
; CHECK-NEXT: le lr, .LBB9_5
|
|
; CHECK-NEXT: .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa
|
|
; CHECK-NEXT: wls lr, r12, .LBB9_9
|
|
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
|
|
; CHECK-NEXT: add.w r0, r0, r2, lsl #1
|
|
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
|
|
; CHECK-NEXT: .LBB9_8: @ %for.body.epil
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldr.16 s2, [r1]
|
|
; CHECK-NEXT: vldr.16 s4, [r0]
|
|
; CHECK-NEXT: adds r0, #2
|
|
; CHECK-NEXT: adds r1, #2
|
|
; CHECK-NEXT: vmul.f16 s2, s4, s2
|
|
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s2
|
|
; CHECK-NEXT: le lr, .LBB9_8
|
|
; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
|
|
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
|
; CHECK-NEXT: .p2align 2
|
|
; CHECK-NEXT: @ %bb.10:
|
|
; CHECK-NEXT: .LCPI9_0:
|
|
; CHECK-NEXT: .long 0x00000000 @ float 0
|
|
; N == 0: return 0.0 via the %entry incoming value of %res.0.lcssa.
entry:
|
|
%cmp8 = icmp eq i32 %N, 0
|
|
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
; %xtraiter = N & 3 is the epilogue trip count. N is non-zero here, so
; (N - 1) < 3 means N is 1, 2 or 3: the unrolled loop is skipped and only the
; epilogue runs.
for.body.preheader: ; preds = %entry
|
|
%0 = add i32 %N, -1
|
|
%xtraiter = and i32 %N, 3
|
|
%1 = icmp ult i32 %0, 3
|
|
br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
|
|
|
|
; %unroll_iter = N - (N & 3): number of elements handled by the unrolled loop.
for.body.preheader.new: ; preds = %for.body.preheader
|
|
%unroll_iter = sub i32 %N, %xtraiter
|
|
br label %for.body
|
|
|
|
; Join point after (or instead of) the unrolled loop. %add.lcssa.ph is undef on
; the edge from for.body.preheader, but on that edge N is 1..3, so %xtraiter is
; non-zero and control continues to for.body.epil, which does not read it.
for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
|
|
%add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
|
|
%i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
|
|
%res.09.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
|
|
%lcmp.mod = icmp eq i32 %xtraiter, 0
|
|
br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
|
|
|
|
; Epilogue loop: one element per iteration, continuing from index %i.010.unr
; and accumulator %res.09.unr; %epil.iter counts down from %xtraiter to 0.
for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
|
|
%i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
|
|
%res.09.epil = phi float [ %add.epil, %for.body.epil ], [ %res.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
|
|
%epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
|
|
%arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.010.epil
|
|
%2 = load half, ptr %arrayidx.epil, align 2
|
|
%arrayidx1.epil = getelementptr inbounds half, ptr %b, i32 %i.010.epil
|
|
%3 = load half, ptr %arrayidx1.epil, align 2
|
|
%mul.epil = fmul half %2, %3
|
|
%conv.epil = fpext half %mul.epil to float
|
|
%add.epil = fadd float %res.09.epil, %conv.epil
|
|
%inc.epil = add nuw i32 %i.010.epil, 1
|
|
%epil.iter.sub = add i32 %epil.iter, -1
|
|
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
|
|
br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
|
|
|
|
; Result: 0.0 for N == 0, the unrolled-loop sum when there is no remainder,
; otherwise the sum after the last epilogue iteration.
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
|
|
%res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
|
|
ret float %res.0.lcssa
|
|
|
|
; Main loop, unrolled by 4. %i.010 starts at 0 and steps by 4, so its two low
; bits are always clear and the `or disjoint` with 1, 2 and 3 below computes
; i+1, i+2 and i+3. %niter counts down from %unroll_iter by 4 until it hits 0.
for.body: ; preds = %for.body, %for.body.preheader.new
|
|
%i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
|
|
%res.09 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
|
|
%niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
|
|
%arrayidx = getelementptr inbounds half, ptr %a, i32 %i.010
|
|
%4 = load half, ptr %arrayidx, align 2
|
|
%arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.010
|
|
%5 = load half, ptr %arrayidx1, align 2
|
|
%mul = fmul half %4, %5
|
|
%conv = fpext half %mul to float
|
|
%add = fadd float %res.09, %conv
|
|
%inc = or disjoint i32 %i.010, 1
|
|
%arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
|
|
%6 = load half, ptr %arrayidx.1, align 2
|
|
%arrayidx1.1 = getelementptr inbounds half, ptr %b, i32 %inc
|
|
%7 = load half, ptr %arrayidx1.1, align 2
|
|
%mul.1 = fmul half %6, %7
|
|
%conv.1 = fpext half %mul.1 to float
|
|
%add.1 = fadd float %add, %conv.1
|
|
%inc.1 = or disjoint i32 %i.010, 2
|
|
%arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
|
|
%8 = load half, ptr %arrayidx.2, align 2
|
|
%arrayidx1.2 = getelementptr inbounds half, ptr %b, i32 %inc.1
|
|
%9 = load half, ptr %arrayidx1.2, align 2
|
|
%mul.2 = fmul half %8, %9
|
|
%conv.2 = fpext half %mul.2 to float
|
|
%add.2 = fadd float %add.1, %conv.2
|
|
%inc.2 = or disjoint i32 %i.010, 3
|
|
%arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
|
|
%10 = load half, ptr %arrayidx.3, align 2
|
|
%arrayidx1.3 = getelementptr inbounds half, ptr %b, i32 %inc.2
|
|
%11 = load half, ptr %arrayidx1.3, align 2
|
|
%mul.3 = fmul half %10, %11
|
|
%conv.3 = fpext half %mul.3 to float
|
|
%add.3 = fadd float %add.2, %conv.3
|
|
%inc.3 = add nuw i32 %i.010, 4
|
|
%niter.nsub.3 = add i32 %niter, -4
|
|
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
|
|
br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
|
|
}
|
|
|
|
; half_half_acc: returns the float sum, over i in [0, N), of
; fpext(a[i] + b[i]), where a and b point to half elements and each pair is
; added in half precision before being widened. The accumulator starts at 0.0,
; which is also the result when N == 0.
; The IR is already unrolled by four with a scalar remainder loop that handles
; the N & 3 leftover elements. In the expected assembly below, the unrolled
; loop closes with `le lr` and the remainder loop is guarded by `wls lr, r12`.
define arm_aapcs_vfpcc float @half_half_acc(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
|
|
; CHECK-LABEL: half_half_acc:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: push {r4, r5, r7, lr}
|
|
; CHECK-NEXT: cbz r2, .LBB10_3
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
; CHECK-NEXT: subs r3, r2, #1
|
|
; CHECK-NEXT: and r12, r2, #3
|
|
; CHECK-NEXT: cmp r3, #3
|
|
; CHECK-NEXT: bhs .LBB10_4
|
|
; CHECK-NEXT: @ %bb.2:
|
|
; CHECK-NEXT: vldr s0, .LCPI10_0
|
|
; CHECK-NEXT: movs r2, #0
|
|
; CHECK-NEXT: b .LBB10_6
|
|
; CHECK-NEXT: .LBB10_3:
|
|
; CHECK-NEXT: vldr s0, .LCPI10_0
|
|
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
|
; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new
|
|
; CHECK-NEXT: bic r2, r2, #3
|
|
; CHECK-NEXT: movs r3, #1
|
|
; CHECK-NEXT: subs r2, #4
|
|
; CHECK-NEXT: vldr s0, .LCPI10_0
|
|
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
|
|
; CHECK-NEXT: movs r3, #0
|
|
; CHECK-NEXT: movs r2, #0
|
|
; CHECK-NEXT: .LBB10_5: @ %for.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: adds r5, r0, r3
|
|
; CHECK-NEXT: adds r4, r1, r3
|
|
; CHECK-NEXT: vldr.16 s2, [r4, #6]
|
|
; CHECK-NEXT: vldr.16 s4, [r5, #6]
|
|
; CHECK-NEXT: vldr.16 s6, [r5, #4]
|
|
; CHECK-NEXT: vldr.16 s8, [r5, #2]
|
|
; CHECK-NEXT: vadd.f16 s2, s4, s2
|
|
; CHECK-NEXT: vldr.16 s4, [r4, #4]
|
|
; CHECK-NEXT: vldr.16 s10, [r5]
|
|
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
|
|
; CHECK-NEXT: vadd.f16 s4, s6, s4
|
|
; CHECK-NEXT: vldr.16 s6, [r4, #2]
|
|
; CHECK-NEXT: vcvtb.f32.f16 s4, s4
|
|
; CHECK-NEXT: adds r3, #8
|
|
; CHECK-NEXT: vadd.f16 s6, s8, s6
|
|
; CHECK-NEXT: vldr.16 s8, [r4]
|
|
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
|
|
; CHECK-NEXT: adds r2, #4
|
|
; CHECK-NEXT: vadd.f16 s8, s10, s8
|
|
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s8
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s6
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s4
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s2
|
|
; CHECK-NEXT: le lr, .LBB10_5
|
|
; CHECK-NEXT: .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa
|
|
; CHECK-NEXT: wls lr, r12, .LBB10_9
|
|
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
|
|
; CHECK-NEXT: add.w r0, r0, r2, lsl #1
|
|
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
|
|
; CHECK-NEXT: .LBB10_8: @ %for.body.epil
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldr.16 s2, [r1]
|
|
; CHECK-NEXT: vldr.16 s4, [r0]
|
|
; CHECK-NEXT: adds r0, #2
|
|
; CHECK-NEXT: adds r1, #2
|
|
; CHECK-NEXT: vadd.f16 s2, s4, s2
|
|
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s2
|
|
; CHECK-NEXT: le lr, .LBB10_8
|
|
; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
|
|
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
|
; CHECK-NEXT: .p2align 2
|
|
; CHECK-NEXT: @ %bb.10:
|
|
; CHECK-NEXT: .LCPI10_0:
|
|
; CHECK-NEXT: .long 0x00000000 @ float 0
|
|
; N == 0 goes straight to the exit block, whose phi supplies 0.0 for this edge.
entry:
|
|
%cmp9 = icmp eq i32 %N, 0
|
|
br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
; %xtraiter = N & 3 elements are left for the remainder loop. The unrolled loop
; is bypassed when N - 1 < 3 (unsigned), i.e. when N is 1, 2 or 3 here.
for.body.preheader: ; preds = %entry
|
|
%0 = add i32 %N, -1
|
|
%xtraiter = and i32 %N, 3
|
|
%1 = icmp ult i32 %0, 3
|
|
br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
|
|
|
|
; %unroll_iter = N - (N & 3): the element count consumed by the unrolled loop.
for.body.preheader.new: ; preds = %for.body.preheader
|
|
%unroll_iter = sub i32 %N, %xtraiter
|
|
br label %for.body
|
|
|
|
; Join point between the unrolled loop and the remainder loop.
; The undef incoming value of %add2.lcssa.ph never reaches the return: the
; %for.body.preheader edge is only taken for N in {1, 2, 3}, where %xtraiter is
; non-zero, so control continues into %for.body.epil.
for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
|
|
%add2.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add2.3, %for.body ]
|
|
%i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
|
|
%res.010.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add2.3, %for.body ]
|
|
%lcmp.mod = icmp eq i32 %xtraiter, 0
|
|
br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
|
|
|
|
; Remainder loop: one element per iteration, counting %epil.iter down from
; %xtraiter to zero.
for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
|
|
%i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
|
|
%res.010.epil = phi float [ %add2.epil, %for.body.epil ], [ %res.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
|
|
%epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
|
|
%arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.011.epil
|
|
%2 = load half, ptr %arrayidx.epil, align 2
|
|
%arrayidx1.epil = getelementptr inbounds half, ptr %b, i32 %i.011.epil
|
|
%3 = load half, ptr %arrayidx1.epil, align 2
|
|
%add.epil = fadd half %2, %3
|
|
%conv.epil = fpext half %add.epil to float
|
|
%add2.epil = fadd float %res.010.epil, %conv.epil
|
|
%inc.epil = add nuw i32 %i.011.epil, 1
|
|
%epil.iter.sub = add i32 %epil.iter, -1
|
|
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
|
|
br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
|
|
|
|
; Common exit: selects 0.0, the unrolled-loop result, or the remainder-loop
; result depending on the incoming edge.
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
|
|
%res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add2.epil, %for.body.epil ]
|
|
ret float %res.0.lcssa
|
|
|
|
; Unrolled loop: four elements per iteration, accumulated in source order.
; %i.011 starts at 0 and advances by 4, so its low two bits are always clear and
; the `or disjoint` with 1, 2 and 3 below yields the same values as an add.
for.body: ; preds = %for.body, %for.body.preheader.new
|
|
%i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
|
|
%res.010 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add2.3, %for.body ]
|
|
%niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
|
|
%arrayidx = getelementptr inbounds half, ptr %a, i32 %i.011
|
|
%4 = load half, ptr %arrayidx, align 2
|
|
%arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.011
|
|
%5 = load half, ptr %arrayidx1, align 2
|
|
%add = fadd half %4, %5
|
|
%conv = fpext half %add to float
|
|
%add2 = fadd float %res.010, %conv
|
|
%inc = or disjoint i32 %i.011, 1
|
|
%arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
|
|
%6 = load half, ptr %arrayidx.1, align 2
|
|
%arrayidx1.1 = getelementptr inbounds half, ptr %b, i32 %inc
|
|
%7 = load half, ptr %arrayidx1.1, align 2
|
|
%add.1 = fadd half %6, %7
|
|
%conv.1 = fpext half %add.1 to float
|
|
%add2.1 = fadd float %add2, %conv.1
|
|
%inc.1 = or disjoint i32 %i.011, 2
|
|
%arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
|
|
%8 = load half, ptr %arrayidx.2, align 2
|
|
%arrayidx1.2 = getelementptr inbounds half, ptr %b, i32 %inc.1
|
|
%9 = load half, ptr %arrayidx1.2, align 2
|
|
%add.2 = fadd half %8, %9
|
|
%conv.2 = fpext half %add.2 to float
|
|
%add2.2 = fadd float %add2.1, %conv.2
|
|
%inc.2 = or disjoint i32 %i.011, 3
|
|
%arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
|
|
%10 = load half, ptr %arrayidx.3, align 2
|
|
%arrayidx1.3 = getelementptr inbounds half, ptr %b, i32 %inc.2
|
|
%11 = load half, ptr %arrayidx1.3, align 2
|
|
%add.3 = fadd half %10, %11
|
|
%conv.3 = fpext half %add.3 to float
|
|
%add2.3 = fadd float %add2.2, %conv.3
|
|
%inc.3 = add nuw i32 %i.011, 4
|
|
%niter.nsub.3 = add i32 %niter, -4
|
|
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
|
|
br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
|
|
}
|
|
|
|
; half_short_mac: returns the float sum, over i in [0, N), of
; fpext(a[i] * sitofp(b[i])), where a points to half elements and b to i16
; elements. Each i16 is converted to half, multiplied in half precision, then
; widened to float and accumulated. The accumulator starts at 0.0, which is
; also the result when N == 0.
; The IR is already unrolled by four with a scalar remainder loop that handles
; the N & 3 leftover elements. In the expected assembly below, the unrolled
; loop closes with `le lr` and the remainder loop is guarded by `wls lr, r12`.
define arm_aapcs_vfpcc float @half_short_mac(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
|
|
; CHECK-LABEL: half_short_mac:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: push {r4, r5, r6, lr}
|
|
; CHECK-NEXT: cbz r2, .LBB11_3
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
; CHECK-NEXT: subs r3, r2, #1
|
|
; CHECK-NEXT: and r12, r2, #3
|
|
; CHECK-NEXT: cmp r3, #3
|
|
; CHECK-NEXT: bhs .LBB11_4
|
|
; CHECK-NEXT: @ %bb.2:
|
|
; CHECK-NEXT: vldr s0, .LCPI11_0
|
|
; CHECK-NEXT: movs r2, #0
|
|
; CHECK-NEXT: b .LBB11_6
|
|
; CHECK-NEXT: .LBB11_3:
|
|
; CHECK-NEXT: vldr s0, .LCPI11_0
|
|
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new
|
|
; CHECK-NEXT: bic r2, r2, #3
|
|
; CHECK-NEXT: movs r3, #1
|
|
; CHECK-NEXT: subs r2, #4
|
|
; CHECK-NEXT: vldr s0, .LCPI11_0
|
|
; CHECK-NEXT: adds r4, r0, #4
|
|
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
|
|
; CHECK-NEXT: adds r3, r1, #4
|
|
; CHECK-NEXT: movs r2, #0
|
|
; CHECK-NEXT: .LBB11_5: @ %for.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: ldrsh.w r5, [r3, #2]
|
|
; CHECK-NEXT: vldr.16 s2, [r4, #2]
|
|
; CHECK-NEXT: adds r2, #4
|
|
; CHECK-NEXT: vmov s4, r5
|
|
; CHECK-NEXT: ldrsh r5, [r3], #8
|
|
; CHECK-NEXT: vcvt.f16.s32 s4, s4
|
|
; CHECK-NEXT: ldrsh r6, [r3, #-10]
|
|
; CHECK-NEXT: vmul.f16 s2, s2, s4
|
|
; CHECK-NEXT: vmov s6, r5
|
|
; CHECK-NEXT: vldr.16 s4, [r4]
|
|
; CHECK-NEXT: vcvt.f16.s32 s6, s6
|
|
; CHECK-NEXT: ldrsh r5, [r3, #-12]
|
|
; CHECK-NEXT: vmul.f16 s4, s4, s6
|
|
; CHECK-NEXT: vmov s8, r6
|
|
; CHECK-NEXT: vldr.16 s6, [r4, #-2]
|
|
; CHECK-NEXT: vcvt.f16.s32 s8, s8
|
|
; CHECK-NEXT: vmov s10, r5
|
|
; CHECK-NEXT: vcvtb.f32.f16 s4, s4
|
|
; CHECK-NEXT: vmul.f16 s6, s6, s8
|
|
; CHECK-NEXT: vldr.16 s8, [r4, #-4]
|
|
; CHECK-NEXT: vcvt.f16.s32 s10, s10
|
|
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
|
|
; CHECK-NEXT: vmul.f16 s8, s8, s10
|
|
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
|
|
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
|
|
; CHECK-NEXT: adds r4, #8
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s8
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s6
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s4
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s2
|
|
; CHECK-NEXT: le lr, .LBB11_5
|
|
; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
|
|
; CHECK-NEXT: wls lr, r12, .LBB11_9
|
|
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
|
|
; CHECK-NEXT: add.w r0, r0, r2, lsl #1
|
|
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
|
|
; CHECK-NEXT: .LBB11_8: @ %for.body.epil
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: ldrsh r2, [r1], #2
|
|
; CHECK-NEXT: vldr.16 s2, [r0]
|
|
; CHECK-NEXT: adds r0, #2
|
|
; CHECK-NEXT: vmov s4, r2
|
|
; CHECK-NEXT: vcvt.f16.s32 s4, s4
|
|
; CHECK-NEXT: vmul.f16 s2, s2, s4
|
|
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s2
|
|
; CHECK-NEXT: le lr, .LBB11_8
|
|
; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
|
|
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
; CHECK-NEXT: .p2align 2
|
|
; CHECK-NEXT: @ %bb.10:
|
|
; CHECK-NEXT: .LCPI11_0:
|
|
; CHECK-NEXT: .long 0x00000000 @ float 0
|
|
; N == 0 goes straight to the exit block, whose phi supplies 0.0 for this edge.
entry:
|
|
%cmp10 = icmp eq i32 %N, 0
|
|
br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
; %xtraiter = N & 3 elements are left for the remainder loop. The unrolled loop
; is bypassed when N - 1 < 3 (unsigned), i.e. when N is 1, 2 or 3 here.
for.body.preheader: ; preds = %entry
|
|
%0 = add i32 %N, -1
|
|
%xtraiter = and i32 %N, 3
|
|
%1 = icmp ult i32 %0, 3
|
|
br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
|
|
|
|
; %unroll_iter = N - (N & 3): the element count consumed by the unrolled loop.
for.body.preheader.new: ; preds = %for.body.preheader
|
|
%unroll_iter = sub i32 %N, %xtraiter
|
|
br label %for.body
|
|
|
|
; Join point between the unrolled loop and the remainder loop.
; The undef incoming value of %add.lcssa.ph never reaches the return: the
; %for.body.preheader edge is only taken for N in {1, 2, 3}, where %xtraiter is
; non-zero, so control continues into %for.body.epil.
for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
|
|
%add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
|
|
%i.012.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
|
|
%res.011.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
|
|
%lcmp.mod = icmp eq i32 %xtraiter, 0
|
|
br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
|
|
|
|
; Remainder loop: one element per iteration, counting %epil.iter down from
; %xtraiter to zero.
for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
|
|
%i.012.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.012.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
|
|
%res.011.epil = phi float [ %add.epil, %for.body.epil ], [ %res.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
|
|
%epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
|
|
%arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.012.epil
|
|
%2 = load half, ptr %arrayidx.epil, align 2
|
|
%arrayidx1.epil = getelementptr inbounds i16, ptr %b, i32 %i.012.epil
|
|
%3 = load i16, ptr %arrayidx1.epil, align 2
|
|
%conv2.epil = sitofp i16 %3 to half
|
|
%mul.epil = fmul half %2, %conv2.epil
|
|
%conv3.epil = fpext half %mul.epil to float
|
|
%add.epil = fadd float %res.011.epil, %conv3.epil
|
|
%inc.epil = add nuw i32 %i.012.epil, 1
|
|
%epil.iter.sub = add i32 %epil.iter, -1
|
|
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
|
|
br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
|
|
|
|
; Common exit: selects 0.0, the unrolled-loop result, or the remainder-loop
; result depending on the incoming edge.
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
|
|
%res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
|
|
ret float %res.0.lcssa
|
|
|
|
; Unrolled loop: four elements per iteration, accumulated in source order.
; %i.012 starts at 0 and advances by 4, so its low two bits are always clear and
; the `or disjoint` with 1, 2 and 3 below yields the same values as an add.
for.body: ; preds = %for.body, %for.body.preheader.new
|
|
%i.012 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
|
|
%res.011 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
|
|
%niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
|
|
%arrayidx = getelementptr inbounds half, ptr %a, i32 %i.012
|
|
%4 = load half, ptr %arrayidx, align 2
|
|
%arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.012
|
|
%5 = load i16, ptr %arrayidx1, align 2
|
|
%conv2 = sitofp i16 %5 to half
|
|
%mul = fmul half %4, %conv2
|
|
%conv3 = fpext half %mul to float
|
|
%add = fadd float %res.011, %conv3
|
|
%inc = or disjoint i32 %i.012, 1
|
|
%arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
|
|
%6 = load half, ptr %arrayidx.1, align 2
|
|
%arrayidx1.1 = getelementptr inbounds i16, ptr %b, i32 %inc
|
|
%7 = load i16, ptr %arrayidx1.1, align 2
|
|
%conv2.1 = sitofp i16 %7 to half
|
|
%mul.1 = fmul half %6, %conv2.1
|
|
%conv3.1 = fpext half %mul.1 to float
|
|
%add.1 = fadd float %add, %conv3.1
|
|
%inc.1 = or disjoint i32 %i.012, 2
|
|
%arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
|
|
%8 = load half, ptr %arrayidx.2, align 2
|
|
%arrayidx1.2 = getelementptr inbounds i16, ptr %b, i32 %inc.1
|
|
%9 = load i16, ptr %arrayidx1.2, align 2
|
|
%conv2.2 = sitofp i16 %9 to half
|
|
%mul.2 = fmul half %8, %conv2.2
|
|
%conv3.2 = fpext half %mul.2 to float
|
|
%add.2 = fadd float %add.1, %conv3.2
|
|
%inc.2 = or disjoint i32 %i.012, 3
|
|
%arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
|
|
%10 = load half, ptr %arrayidx.3, align 2
|
|
%arrayidx1.3 = getelementptr inbounds i16, ptr %b, i32 %inc.2
|
|
%11 = load i16, ptr %arrayidx1.3, align 2
|
|
%conv2.3 = sitofp i16 %11 to half
|
|
%mul.3 = fmul half %10, %conv2.3
|
|
%conv3.3 = fpext half %mul.3 to float
|
|
%add.3 = fadd float %add.2, %conv3.3
|
|
%inc.3 = add nuw i32 %i.012, 4
|
|
%niter.nsub.3 = add i32 %niter, -4
|
|
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
|
|
br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
|
|
}
|
|
|