BlockFrequencyInfo calculates block frequencies as Scaled64 numbers but, as a last step, converts them to unsigned 64-bit integers (`BlockFrequency`). This change improves the factors picked for this conversion so that: * Big numbers close to UINT64_MAX are avoided, so that users do not overflow/saturate when adding multiple frequencies together or when multiplying with integers. This leaves the topmost 10 bits unused to allow for some room. * The difference between the hottest and the coldest block is spread as much as possible to increase precision. * If the hot/cold spread cannot be represented, precision is lost at the lower end, but the frequencies at the upper end are kept differentiable for hot blocks.
2120 lines
99 KiB
LLVM
2120 lines
99 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
|
|
|
|
; test_fadd: stores A[i..i+3] + splat(B) to C[i..i+3], four floats per
; iteration, until the index equals %n. The entry block passes
; (%n & 7) == 0 to llvm.assume and skips the loop unless %n > 0.
; Expected codegen: %B is moved from s0 to r3 once before the loop, and the
; loop uses the vector-by-scalar form "vadd.f32 q0, q0, r3".
define arm_aapcs_vfpcc void @test_fadd(ptr noalias nocapture readonly %A, float %B, ptr noalias nocapture %C, i32 %n) {
|
|
; CHECK-LABEL: test_fadd:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r2, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r3, s0
|
|
; CHECK-NEXT:  .LBB0_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT:    subs r2, #4
|
|
; CHECK-NEXT:    vadd.f32 q0, q0, r3
|
|
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
|
|
; CHECK-NEXT:    bne .LBB0_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp18 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
|
|
  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = fadd fast <4 x float> %wide.load, %broadcast.splat11
|
|
  %i4 = getelementptr inbounds float, ptr %C, i32 %index
|
|
  store <4 x float> %i3, ptr %i4, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i6 = icmp eq i32 %index.next, %n
|
|
  br i1 %i6, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
; test_fadd_r: same loop as test_fadd, but the fadd operands are swapped so
; the splat of %B is the first operand (splat(B) + A[i..i+3]).
; Expected codegen is unchanged by the swap: "vmov r3, s0" before the loop
; and "vadd.f32 q0, q0, r3" inside it.
define arm_aapcs_vfpcc void @test_fadd_r(ptr noalias nocapture readonly %A, float %B, ptr noalias nocapture %C, i32 %n) {
|
|
; CHECK-LABEL: test_fadd_r:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r2, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r3, s0
|
|
; CHECK-NEXT:  .LBB1_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT:    subs r2, #4
|
|
; CHECK-NEXT:    vadd.f32 q0, q0, r3
|
|
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
|
|
; CHECK-NEXT:    bne .LBB1_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp18 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
|
|
  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = fadd fast <4 x float> %broadcast.splat11, %wide.load
|
|
  %i4 = getelementptr inbounds float, ptr %C, i32 %index
|
|
  store <4 x float> %i3, ptr %i4, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i6 = icmp eq i32 %index.next, %n
|
|
  br i1 %i6, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
; test_fmul: stores A[i..i+3] * splat(B) to C[i..i+3], four floats per
; iteration, until the index equals %n. The entry block passes
; (%n & 7) == 0 to llvm.assume and skips the loop unless %n > 0.
; Expected codegen: "vmov r3, s0" before the loop and the vector-by-scalar
; form "vmul.f32 q0, q0, r3" inside it.
define arm_aapcs_vfpcc void @test_fmul(ptr noalias nocapture readonly %A, float %B, ptr noalias nocapture %C, i32 %n) {
|
|
; CHECK-LABEL: test_fmul:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r2, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r3, s0
|
|
; CHECK-NEXT:  .LBB2_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT:    subs r2, #4
|
|
; CHECK-NEXT:    vmul.f32 q0, q0, r3
|
|
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
|
|
; CHECK-NEXT:    bne .LBB2_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp18 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
|
|
  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = fmul fast <4 x float> %wide.load, %broadcast.splat11
|
|
  %i4 = getelementptr inbounds float, ptr %C, i32 %index
|
|
  store <4 x float> %i3, ptr %i4, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i6 = icmp eq i32 %index.next, %n
|
|
  br i1 %i6, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
; test_fmul_r: same loop as test_fmul, but the fmul operands are swapped so
; the splat of %B is the first operand (splat(B) * A[i..i+3]).
; Expected codegen is unchanged by the swap: "vmov r3, s0" before the loop
; and "vmul.f32 q0, q0, r3" inside it.
define arm_aapcs_vfpcc void @test_fmul_r(ptr noalias nocapture readonly %A, float %B, ptr noalias nocapture %C, i32 %n) {
|
|
; CHECK-LABEL: test_fmul_r:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r2, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r3, s0
|
|
; CHECK-NEXT:  .LBB3_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT:    subs r2, #4
|
|
; CHECK-NEXT:    vmul.f32 q0, q0, r3
|
|
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
|
|
; CHECK-NEXT:    bne .LBB3_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp18 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
|
|
  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = fmul fast <4 x float> %broadcast.splat11, %wide.load
|
|
  %i4 = getelementptr inbounds float, ptr %C, i32 %index
|
|
  store <4 x float> %i3, ptr %i4, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i6 = icmp eq i32 %index.next, %n
|
|
  br i1 %i6, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
; test_fsub: stores A[i..i+3] - splat(B) to C[i..i+3], four floats per
; iteration, until the index equals %n. The entry block passes
; (%n & 7) == 0 to llvm.assume and skips the loop unless %n > 0.
; Expected codegen: "vmov r3, s0" before the loop and the vector-by-scalar
; form "vsub.f32 q0, q0, r3" inside it (the splat is the subtrahend).
define arm_aapcs_vfpcc void @test_fsub(ptr noalias nocapture readonly %A, float %B, ptr noalias nocapture %C, i32 %n) {
|
|
; CHECK-LABEL: test_fsub:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r2, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r3, s0
|
|
; CHECK-NEXT:  .LBB4_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT:    subs r2, #4
|
|
; CHECK-NEXT:    vsub.f32 q0, q0, r3
|
|
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
|
|
; CHECK-NEXT:    bne .LBB4_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp18 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
|
|
  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = fsub fast <4 x float> %wide.load, %broadcast.splat11
|
|
  %i4 = getelementptr inbounds float, ptr %C, i32 %index
|
|
  store <4 x float> %i3, ptr %i4, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i6 = icmp eq i32 %index.next, %n
|
|
  br i1 %i6, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
; test_fsub_r: same loop as test_fsub, but with the operands reversed:
; splat(B) - A[i..i+3] is stored to C[i..i+3].
; Unlike test_fsub, the expected codegen does not use the vector-by-scalar
; form: the splat is materialised once with "vdup.32 q0, r3" before the loop
; and the loop uses the vector-vector "vsub.f32 q1, q0, q1".
define arm_aapcs_vfpcc void @test_fsub_r(ptr noalias nocapture readonly %A, float %B, ptr noalias nocapture %C, i32 %n) {
|
|
; CHECK-LABEL: test_fsub_r:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r2, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r3, s0
|
|
; CHECK-NEXT:    vdup.32 q0, r3
|
|
; CHECK-NEXT:  .LBB5_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
|
|
; CHECK-NEXT:    subs r2, #4
|
|
; CHECK-NEXT:    vsub.f32 q1, q0, q1
|
|
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
|
|
; CHECK-NEXT:    bne .LBB5_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp18 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
|
|
  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = fsub fast <4 x float> %broadcast.splat11, %wide.load
|
|
  %i4 = getelementptr inbounds float, ptr %C, i32 %index
|
|
  store <4 x float> %i3, ptr %i4, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i6 = icmp eq i32 %index.next, %n
|
|
  br i1 %i6, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
|
|
; test_fmas: stores (B[i..i+3] * A[i..i+3]) + splat(C) to D[i..i+3], four
; floats per iteration, until the index equals %n. The entry block passes
; (%n & 7) == 0 to llvm.assume and skips the loop unless %n > 0.
; Expected codegen: %C is moved from s0 to r12 before the loop and the
; multiply and add are fused into "vfmas.f32 q1, q0, r12" (scalar addend).
define arm_aapcs_vfpcc void @test_fmas(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fmas:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r3, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB6_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r12, s0
|
|
; CHECK-NEXT:  .LBB6_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
|
|
; CHECK-NEXT:    subs r3, #4
|
|
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
|
|
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
|
|
; CHECK-NEXT:    bne .LBB6_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp110 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
|
|
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = getelementptr inbounds float, ptr %B, i32 %index
|
|
  %wide.load12 = load <4 x float>, ptr %i3, align 4
|
|
  %i5 = fmul fast <4 x float> %wide.load12, %wide.load
|
|
  %i6 = fadd fast <4 x float> %i5, %broadcast.splat14
|
|
  %i7 = getelementptr inbounds float, ptr %D, i32 %index
|
|
  store <4 x float> %i6, ptr %i7, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i9 = icmp eq i32 %index.next, %n
|
|
  br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
; test_fmas_r: same loop as test_fmas, but the fadd operands are swapped so
; the splat of %C is the first operand (splat(C) + B[i..i+3] * A[i..i+3]).
; Expected codegen is unchanged by the swap: "vmov r12, s0" before the loop
; and "vfmas.f32 q1, q0, r12" inside it.
define arm_aapcs_vfpcc void @test_fmas_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fmas_r:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r3, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB7_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r12, s0
|
|
; CHECK-NEXT:  .LBB7_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
|
|
; CHECK-NEXT:    subs r3, #4
|
|
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
|
|
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
|
|
; CHECK-NEXT:    bne .LBB7_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp110 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
|
|
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = getelementptr inbounds float, ptr %B, i32 %index
|
|
  %wide.load12 = load <4 x float>, ptr %i3, align 4
|
|
  %i5 = fmul fast <4 x float> %wide.load12, %wide.load
|
|
  %i6 = fadd fast <4 x float> %broadcast.splat14, %i5
|
|
  %i7 = getelementptr inbounds float, ptr %D, i32 %index
|
|
  store <4 x float> %i6, ptr %i7, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i9 = icmp eq i32 %index.next, %n
|
|
  br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
; test_fma: stores (A[i..i+3] * splat(C)) + B[i..i+3] to D[i..i+3], four
; floats per iteration, until the index equals %n. The entry block passes
; (%n & 7) == 0 to llvm.assume and skips the loop unless %n > 0.
; Expected codegen: %C is moved from s0 to r12 before the loop and the
; multiply and add are fused into "vfma.f32 q1, q0, r12" (scalar multiplier).
define arm_aapcs_vfpcc void @test_fma(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fma:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r3, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB8_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r12, s0
|
|
; CHECK-NEXT:  .LBB8_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
|
|
; CHECK-NEXT:    subs r3, #4
|
|
; CHECK-NEXT:    vfma.f32 q1, q0, r12
|
|
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
|
|
; CHECK-NEXT:    bne .LBB8_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp110 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
|
|
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = fmul fast <4 x float> %wide.load, %broadcast.splat13
|
|
  %i4 = getelementptr inbounds float, ptr %B, i32 %index
|
|
  %wide.load14 = load <4 x float>, ptr %i4, align 4
|
|
  %i6 = fadd fast <4 x float> %i3, %wide.load14
|
|
  %i7 = getelementptr inbounds float, ptr %D, i32 %index
|
|
  store <4 x float> %i6, ptr %i7, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i9 = icmp eq i32 %index.next, %n
|
|
  br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
; test_fma_r: same loop as test_fma, but the fmul operands are swapped so
; the splat of %C is the first operand (splat(C) * A[i..i+3], plus B[i..i+3]).
; Expected codegen is unchanged by the swap: "vmov r12, s0" before the loop
; and "vfma.f32 q1, q0, r12" inside it.
define arm_aapcs_vfpcc void @test_fma_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fma_r:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r3, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB9_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r12, s0
|
|
; CHECK-NEXT:  .LBB9_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
|
|
; CHECK-NEXT:    subs r3, #4
|
|
; CHECK-NEXT:    vfma.f32 q1, q0, r12
|
|
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
|
|
; CHECK-NEXT:    bne .LBB9_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp110 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
|
|
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = fmul fast <4 x float> %broadcast.splat13, %wide.load
|
|
  %i4 = getelementptr inbounds float, ptr %B, i32 %index
|
|
  %wide.load14 = load <4 x float>, ptr %i4, align 4
|
|
  %i6 = fadd fast <4 x float> %i3, %wide.load14
|
|
  %i7 = getelementptr inbounds float, ptr %D, i32 %index
|
|
  store <4 x float> %i6, ptr %i7, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i9 = icmp eq i32 %index.next, %n
|
|
  br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
|
|
; test_fmss: stores (B[i..i+3] * A[i..i+3]) - splat(C) to D[i..i+3], four
; floats per iteration, until the index equals %n. The entry block passes
; (%n & 7) == 0 to llvm.assume and skips the loop unless %n > 0.
; Expected codegen does not use a vector-by-scalar instruction: the splat is
; built and negated once before the loop ("vdup.32 q0, r12" then
; "vneg.f32 q0, q0"); each iteration copies it ("vmov q3, q0") and
; accumulates the product with the vector-vector "vfma.f32 q3, q2, q1".
define arm_aapcs_vfpcc void @test_fmss(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fmss:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r3, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB10_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r12, s0
|
|
; CHECK-NEXT:    vdup.32 q0, r12
|
|
; CHECK-NEXT:    vneg.f32 q0, q0
|
|
; CHECK-NEXT:  .LBB10_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
|
|
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
|
|
; CHECK-NEXT:    vmov q3, q0
|
|
; CHECK-NEXT:    subs r3, #4
|
|
; CHECK-NEXT:    vfma.f32 q3, q2, q1
|
|
; CHECK-NEXT:    vstrb.8 q3, [r2], #16
|
|
; CHECK-NEXT:    bne .LBB10_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp110 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
|
|
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = getelementptr inbounds float, ptr %B, i32 %index
|
|
  %wide.load12 = load <4 x float>, ptr %i3, align 4
|
|
  %i5 = fmul fast <4 x float> %wide.load12, %wide.load
|
|
  %i6 = fsub fast <4 x float> %i5, %broadcast.splat14
|
|
  %i7 = getelementptr inbounds float, ptr %D, i32 %index
|
|
  store <4 x float> %i6, ptr %i7, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i9 = icmp eq i32 %index.next, %n
|
|
  br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
; test_fmss_r: same loop as test_fmss, but with the fsub operands reversed:
; splat(C) - (B[i..i+3] * A[i..i+3]) is stored to D[i..i+3].
; Expected codegen: the splat is built once before the loop
; ("vdup.32 q0, r12", no negate); each iteration copies it ("vmov q3, q0")
; and subtracts the product with the vector-vector "vfms.f32 q3, q2, q1".
define arm_aapcs_vfpcc void @test_fmss_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fmss_r:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r3, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB11_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r12, s0
|
|
; CHECK-NEXT:    vdup.32 q0, r12
|
|
; CHECK-NEXT:  .LBB11_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
|
|
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
|
|
; CHECK-NEXT:    vmov q3, q0
|
|
; CHECK-NEXT:    subs r3, #4
|
|
; CHECK-NEXT:    vfms.f32 q3, q2, q1
|
|
; CHECK-NEXT:    vstrb.8 q3, [r2], #16
|
|
; CHECK-NEXT:    bne .LBB11_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp110 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
|
|
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = getelementptr inbounds float, ptr %B, i32 %index
|
|
  %wide.load12 = load <4 x float>, ptr %i3, align 4
|
|
  %i5 = fmul fast <4 x float> %wide.load12, %wide.load
|
|
  %i6 = fsub fast <4 x float> %broadcast.splat14, %i5
|
|
  %i7 = getelementptr inbounds float, ptr %D, i32 %index
|
|
  store <4 x float> %i6, ptr %i7, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i9 = icmp eq i32 %index.next, %n
|
|
  br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
; test_fms: stores (A[i..i+3] * splat(C)) - B[i..i+3] to D[i..i+3], four
; floats per iteration, until the index equals %n. The entry block passes
; (%n & 7) == 0 to llvm.assume and skips the loop unless %n > 0.
; Expected codegen: %C is moved from s0 to r12 before the loop; each
; iteration negates the vector loaded from %B ("vneg.f32 q0, q0") and then
; accumulates with the vector-by-scalar "vfma.f32 q0, q1, r12".
define arm_aapcs_vfpcc void @test_fms(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fms:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r3, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB12_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r12, s0
|
|
; CHECK-NEXT:  .LBB12_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
|
|
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
|
|
; CHECK-NEXT:    subs r3, #4
|
|
; CHECK-NEXT:    vneg.f32 q0, q0
|
|
; CHECK-NEXT:    vfma.f32 q0, q1, r12
|
|
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
|
|
; CHECK-NEXT:    bne .LBB12_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp110 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
|
|
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = fmul fast <4 x float> %wide.load, %broadcast.splat13
|
|
  %i4 = getelementptr inbounds float, ptr %B, i32 %index
|
|
  %wide.load14 = load <4 x float>, ptr %i4, align 4
|
|
  %i6 = fsub fast <4 x float> %i3, %wide.load14
|
|
  %i7 = getelementptr inbounds float, ptr %D, i32 %index
|
|
  store <4 x float> %i6, ptr %i7, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i9 = icmp eq i32 %index.next, %n
|
|
  br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
; test_fms_r: same loop as test_fms, but the fmul operands are swapped so
; the splat of %C is the first operand (splat(C) * A[i..i+3], minus
; B[i..i+3]).
; Expected codegen is unchanged by the swap: "vneg.f32 q0, q0" on the vector
; loaded from %B, followed by "vfma.f32 q0, q1, r12".
define arm_aapcs_vfpcc void @test_fms_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fms_r:
|
|
; CHECK:       @ %bb.0: @ %entry
|
|
; CHECK-NEXT:    cmp r3, #1
|
|
; CHECK-NEXT:    it lt
|
|
; CHECK-NEXT:    bxlt lr
|
|
; CHECK-NEXT:  .LBB13_1: @ %vector.ph
|
|
; CHECK-NEXT:    vmov r12, s0
|
|
; CHECK-NEXT:  .LBB13_2: @ %vector.body
|
|
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
|
|
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
|
|
; CHECK-NEXT:    subs r3, #4
|
|
; CHECK-NEXT:    vneg.f32 q0, q0
|
|
; CHECK-NEXT:    vfma.f32 q0, q1, r12
|
|
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
|
|
; CHECK-NEXT:    bne .LBB13_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT:    bx lr
|
|
entry:
|
|
  %i = and i32 %n, 7
|
|
  %cmp = icmp eq i32 %i, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp110 = icmp sgt i32 %n, 0
|
|
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph:                                        ; preds = %entry
|
|
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
|
|
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %vector.ph
|
|
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
  %i1 = getelementptr inbounds float, ptr %A, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %i1, align 4
|
|
  %i3 = fmul fast <4 x float> %broadcast.splat13, %wide.load
|
|
  %i4 = getelementptr inbounds float, ptr %B, i32 %index
|
|
  %wide.load14 = load <4 x float>, ptr %i4, align 4
|
|
  %i6 = fsub fast <4 x float> %i3, %wide.load14
|
|
  %i7 = getelementptr inbounds float, ptr %D, i32 %index
|
|
  store <4 x float> %i6, ptr %i7, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i9 = icmp eq i32 %index.next, %n
|
|
  br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup:                                 ; preds = %vector.body, %entry
|
|
  ret void
|
|
}
|
|
|
|
|
|
; test_nested: two-level loop nest with the splat defined in the outer loop.
; The outer loop runs %numRows times; each iteration loads one float from
; the %pOutT1 cursor, splats it, then advances that cursor by one float and
; the %pInT1 / %pPRT_in cursors by %numCols floats.
; The inner loop walks %numCols floats in steps of 4 and updates in place:
;   pInT1[idx..idx+3] -= pPRT_in[idx..idx+3] * splat
; The entry block passes these facts to llvm.assume: %numRows > 0,
; %numCols > 0, (%numCols & 7) == 0 and %l < %numCols. %l is used only in
; that assume and %pPRT_pDst is not used at all.
; Expected codegen: the splat is built with "vdup.32 q0, r4" in the outer
; loop body, the inner loop uses the vector-vector "vfms.f32 q2, q1, q0",
; and the outer loop back-edge is "le lr, .LBB14_1".
define dso_local void @test_nested(ptr noalias nocapture %pInT1, ptr noalias nocapture readonly %pOutT1, ptr noalias nocapture readonly %pPRT_in, ptr noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l) local_unnamed_addr {
|
|
; CHECK-LABEL: test_nested:
|
|
; CHECK:       @ %bb.0: @ %for.body.us.preheader
|
|
; CHECK-NEXT:    .save {r4, r5, r6, lr}
|
|
; CHECK-NEXT:    push {r4, r5, r6, lr}
|
|
; CHECK-NEXT:    ldrd lr, r12, [sp, #16]
|
|
; CHECK-NEXT:    lsl.w r3, r12, #2
|
|
; CHECK-NEXT:  .LBB14_1: @ %for.body.us
|
|
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
|
|
; CHECK-NEXT:    @ Child Loop BB14_2 Depth 2
|
|
; CHECK-NEXT:    ldr r4, [r1]
|
|
; CHECK-NEXT:    mov r5, r2
|
|
; CHECK-NEXT:    mov r6, r12
|
|
; CHECK-NEXT:    vdup.32 q0, r4
|
|
; CHECK-NEXT:    mov r4, r0
|
|
; CHECK-NEXT:  .LBB14_2: @ %vector.body
|
|
; CHECK-NEXT:    @ Parent Loop BB14_1 Depth=1
|
|
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT:    vldrw.u32 q1, [r5], #16
|
|
; CHECK-NEXT:    vldrw.u32 q2, [r4]
|
|
; CHECK-NEXT:    subs r6, #4
|
|
; CHECK-NEXT:    vfms.f32 q2, q1, q0
|
|
; CHECK-NEXT:    vstrb.8 q2, [r4], #16
|
|
; CHECK-NEXT:    bne .LBB14_2
|
|
; CHECK-NEXT:  @ %bb.3: @ %for.cond6.for.end_crit_edge.us
|
|
; CHECK-NEXT:    @ in Loop: Header=BB14_1 Depth=1
|
|
; CHECK-NEXT:    add r0, r3
|
|
; CHECK-NEXT:    add r2, r3
|
|
; CHECK-NEXT:    adds r1, #4
|
|
; CHECK-NEXT:    le lr, .LBB14_1
|
|
; CHECK-NEXT:  @ %bb.4: @ %for.end14
|
|
; CHECK-NEXT:    pop {r4, r5, r6, pc}
|
|
for.body.us.preheader:
|
|
  %cmp = icmp sgt i32 %numRows, 0
|
|
  tail call void @llvm.assume(i1 %cmp)
|
|
  %cmp1 = icmp sgt i32 %numCols, 0
|
|
  tail call void @llvm.assume(i1 %cmp1)
|
|
  %rem = and i32 %numCols, 7
|
|
  %cmp2 = icmp eq i32 %rem, 0
|
|
  tail call void @llvm.assume(i1 %cmp2)
|
|
  %cmp3 = icmp slt i32 %l, %numCols
|
|
  tail call void @llvm.assume(i1 %cmp3)
|
|
  br label %for.body.us
|
|
|
|
for.body.us:                                      ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader
|
|
  %pInT1.addr.038.us = phi ptr [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ]
|
|
  %i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ]
|
|
  %pOutT1.addr.036.us = phi ptr [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ]
|
|
  %pPRT_in.addr.035.us = phi ptr [ %scevgep, %for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ]
|
|
  %scevgep = getelementptr float, ptr %pPRT_in.addr.035.us, i32 %numCols
|
|
  %i = load float, ptr %pOutT1.addr.036.us, align 4
|
|
  %broadcast.splatinsert47 = insertelement <4 x float> undef, float %i, i32 0
|
|
  %broadcast.splat48 = shufflevector <4 x float> %broadcast.splatinsert47, <4 x float> undef, <4 x i32> zeroinitializer
|
|
  br label %vector.body
|
|
|
|
vector.body:                                      ; preds = %vector.body, %for.body.us
|
|
  %index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ]
|
|
  %next.gep = getelementptr float, ptr %pInT1.addr.038.us, i32 %index
|
|
  %next.gep45 = getelementptr float, ptr %pPRT_in.addr.035.us, i32 %index
|
|
  %wide.load = load <4 x float>, ptr %next.gep, align 4
|
|
  %wide.load46 = load <4 x float>, ptr %next.gep45, align 4
|
|
  %i3 = fmul fast <4 x float> %wide.load46, %broadcast.splat48
|
|
  %i4 = fsub fast <4 x float> %wide.load, %i3
|
|
  store <4 x float> %i4, ptr %next.gep, align 4
|
|
  %index.next = add i32 %index, 4
|
|
  %i5 = icmp eq i32 %index.next, %numCols
|
|
  br i1 %i5, label %for.cond6.for.end_crit_edge.us, label %vector.body
|
|
|
|
for.cond6.for.end_crit_edge.us:                   ; preds = %vector.body
|
|
  %incdec.ptr.us = getelementptr inbounds float, ptr %pOutT1.addr.036.us, i32 1
|
|
  %scevgep40 = getelementptr float, ptr %pInT1.addr.038.us, i32 %numCols
|
|
  %inc13.us = add nuw nsw i32 %i.037.us, 1
|
|
  %exitcond41 = icmp eq i32 %inc13.us, %numRows
|
|
  br i1 %exitcond41, label %for.end14, label %for.body.us
|
|
|
|
for.end14:                                        ; preds = %for.cond6.for.end_crit_edge.us
|
|
  ret void
|
|
}
|
|
|
|
; Named struct type: one i16 field followed by two pointer fields.
; NOTE(review): presumably the layout of the instance passed as %S to
; @arm_fir_f32_1_4_mve; that function's IR body is not visible here, confirm.
%struct.arm_fir_instance_f32 = type { i16, ptr, ptr }
|
|
; FIR test for short filters. The vectorised %if.then path is taken only when
; (numTaps - 1) is unsigned-less-than 4; otherwise control goes straight to
; %if.end, which copies the state buffer down by %blockSize floats.
; The assertions that follow are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
define void @arm_fir_f32_1_4_mve(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr %pDst, i32 %blockSize) {
|
|
; CHECK-LABEL: arm_fir_f32_1_4_mve:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: .pad #8
|
|
; CHECK-NEXT: sub sp, #8
|
|
; CHECK-NEXT: ldrh.w r10, [r0]
|
|
; CHECK-NEXT: mov r11, r1
|
|
; CHECK-NEXT: ldr.w r12, [r0, #4]
|
|
; CHECK-NEXT: sub.w r1, r10, #1
|
|
; CHECK-NEXT: cmp r1, #3
|
|
; CHECK-NEXT: bhi .LBB15_6
|
|
; CHECK-NEXT: @ %bb.1: @ %if.then
|
|
; CHECK-NEXT: ldr r4, [r0, #8]
|
|
; CHECK-NEXT: ldrd r7, r6, [r4]
|
|
; CHECK-NEXT: ldrd r5, r8, [r4, #8]
|
|
; CHECK-NEXT: add.w r4, r12, r1, lsl #2
|
|
; CHECK-NEXT: lsrs r1, r3, #2
|
|
; CHECK-NEXT: wls lr, r1, .LBB15_5
|
|
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
|
|
; CHECK-NEXT: bic r1, r3, #3
|
|
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
|
|
; CHECK-NEXT: add.w r9, r12, #4
|
|
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
|
|
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
|
|
; CHECK-NEXT: mov r1, r11
|
|
; CHECK-NEXT: .LBB15_3: @ %while.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
|
|
; CHECK-NEXT: vstrb.8 q0, [r4], #16
|
|
; CHECK-NEXT: vldrw.u32 q0, [r9, #-4]
|
|
; CHECK-NEXT: vldrw.u32 q1, [r9], #16
|
|
; CHECK-NEXT: vmul.f32 q0, q0, r7
|
|
; CHECK-NEXT: vldrw.u32 q2, [r9, #-8]
|
|
; CHECK-NEXT: vfma.f32 q0, q1, r6
|
|
; CHECK-NEXT: vldrw.u32 q1, [r9, #-12]
|
|
; CHECK-NEXT: vfma.f32 q0, q1, r5
|
|
; CHECK-NEXT: vfma.f32 q0, q2, r8
|
|
; CHECK-NEXT: vstrb.8 q0, [r2], #16
|
|
; CHECK-NEXT: le lr, .LBB15_3
|
|
; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
|
|
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
|
|
; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
|
|
; CHECK-NEXT: add.w r12, r12, r1, lsl #2
|
|
; CHECK-NEXT: add.w r11, r11, r1, lsl #2
|
|
; CHECK-NEXT: .LBB15_5: @ %while.end
|
|
; CHECK-NEXT: and r1, r3, #3
|
|
; CHECK-NEXT: vldrw.u32 q0, [r11]
|
|
; CHECK-NEXT: vctp.32 r1
|
|
; CHECK-NEXT: vpst
|
|
; CHECK-NEXT: vstrwt.32 q0, [r4]
|
|
; CHECK-NEXT: vldrw.u32 q0, [r12]
|
|
; CHECK-NEXT: vldrw.u32 q1, [r12, #4]
|
|
; CHECK-NEXT: vmul.f32 q0, q0, r7
|
|
; CHECK-NEXT: vfma.f32 q0, q1, r6
|
|
; CHECK-NEXT: vldrw.u32 q1, [r12, #8]
|
|
; CHECK-NEXT: vfma.f32 q0, q1, r5
|
|
; CHECK-NEXT: vldrw.u32 q1, [r12, #12]
|
|
; CHECK-NEXT: vfma.f32 q0, q1, r8
|
|
; CHECK-NEXT: vpst
|
|
; CHECK-NEXT: vstrwt.32 q0, [r2]
|
|
; CHECK-NEXT: ldr.w r12, [r0, #4]
|
|
; CHECK-NEXT: .LBB15_6: @ %if.end
|
|
; CHECK-NEXT: add.w r0, r12, r3, lsl #2
|
|
; CHECK-NEXT: lsr.w r1, r10, #2
|
|
; CHECK-NEXT: wls lr, r1, .LBB15_10
|
|
; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
|
|
; CHECK-NEXT: bic r2, r10, #3
|
|
; CHECK-NEXT: adds r1, r2, r3
|
|
; CHECK-NEXT: mov r3, r12
|
|
; CHECK-NEXT: add.w r1, r12, r1, lsl #2
|
|
; CHECK-NEXT: .LBB15_8: @ %while.body51
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT: vstrb.8 q0, [r3], #16
|
|
; CHECK-NEXT: le lr, .LBB15_8
|
|
; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit
|
|
; CHECK-NEXT: add.w r12, r12, r2, lsl #2
|
|
; CHECK-NEXT: mov r0, r1
|
|
; CHECK-NEXT: .LBB15_10: @ %while.end55
|
|
; CHECK-NEXT: ands r1, r10, #3
|
|
; CHECK-NEXT: beq .LBB15_12
|
|
; CHECK-NEXT: @ %bb.11: @ %if.then59
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0]
|
|
; CHECK-NEXT: vctp.32 r1
|
|
; CHECK-NEXT: vpst
|
|
; CHECK-NEXT: vstrwt.32 q0, [r12]
|
|
; CHECK-NEXT: .LBB15_12: @ %if.end61
|
|
; CHECK-NEXT: add sp, #8
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
|
; Load the three instance fields and branch on (numTaps - 1) <u 4.
entry:
|
|
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
|
|
%i = load ptr, ptr %pState1, align 4
|
|
%pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 2
|
|
%i1 = load ptr, ptr %pCoeffs2, align 4
|
|
%numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 0
|
|
%i2 = load i16, ptr %numTaps3, align 4
|
|
%conv = zext i16 %i2 to i32
|
|
%sub = add nsw i32 %conv, -1
|
|
%cmp = icmp ult i32 %sub, 4
|
|
br i1 %cmp, label %if.then, label %if.end
|
|
|
|
; Load four coefficients from the coefficient pointer, splat each into a
; <4 x float>, and skip the main loop when %blockSize >> 2 is zero.
if.then: ; preds = %entry
|
|
%arrayidx = getelementptr inbounds float, ptr %i, i32 %sub
|
|
%incdec.ptr = getelementptr inbounds float, ptr %i1, i32 1
|
|
%i3 = load float, ptr %i1, align 4
|
|
%incdec.ptr6 = getelementptr inbounds float, ptr %i1, i32 2
|
|
%i4 = load float, ptr %incdec.ptr, align 4
|
|
%incdec.ptr7 = getelementptr inbounds float, ptr %i1, i32 3
|
|
%i5 = load float, ptr %incdec.ptr6, align 4
|
|
%i6 = load float, ptr %incdec.ptr7, align 4
|
|
%shr = lshr i32 %blockSize, 2
|
|
%cmp9146 = icmp eq i32 %shr, 0
|
|
%.pre161 = insertelement <4 x float> undef, float %i3, i32 0
|
|
%.pre162 = shufflevector <4 x float> %.pre161, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%.pre163 = insertelement <4 x float> undef, float %i4, i32 0
|
|
%.pre164 = shufflevector <4 x float> %.pre163, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%.pre165 = insertelement <4 x float> undef, float %i5, i32 0
|
|
%.pre166 = shufflevector <4 x float> %.pre165, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%.pre167 = insertelement <4 x float> undef, float %i6, i32 0
|
|
%.pre168 = shufflevector <4 x float> %.pre167, <4 x float> undef, <4 x i32> zeroinitializer
|
|
br i1 %cmp9146, label %while.end, label %while.body.lr.ph
|
|
|
|
while.body.lr.ph: ; preds = %if.then
|
|
%i7 = and i32 %blockSize, -4
|
|
%scevgep158 = getelementptr float, ptr %pDst, i32 %i7
|
|
br label %while.body
|
|
|
|
; Main loop, %blockSize >> 2 iterations: copy four source floats into the state
; cursor, then combine four vector loads at float offsets 0..3 from the sample
; pointer with the splatted coefficients and store the sum to the output.
while.body: ; preds = %while.body, %while.body.lr.ph
|
|
%pStateCur.0151 = phi ptr [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ]
|
|
%pSamples.0150 = phi ptr [ %i, %while.body.lr.ph ], [ %add.ptr24, %while.body ]
|
|
%pOutput.0149 = phi ptr [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ]
|
|
%pTempSrc.0148 = phi ptr [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ]
|
|
%blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
|
|
%i9 = load <4 x float>, ptr %pTempSrc.0148, align 4
|
|
store <4 x float> %i9, ptr %pStateCur.0151, align 4
|
|
%add.ptr = getelementptr inbounds float, ptr %pStateCur.0151, i32 4
|
|
%add.ptr11 = getelementptr inbounds float, ptr %pTempSrc.0148, i32 4
|
|
%i12 = load <4 x float>, ptr %pSamples.0150, align 4
|
|
%i13 = fmul fast <4 x float> %i12, %.pre162
|
|
%arrayidx12 = getelementptr inbounds float, ptr %pSamples.0150, i32 1
|
|
%i15 = load <4 x float>, ptr %arrayidx12, align 4
|
|
%mul = fmul fast <4 x float> %i15, %.pre164
|
|
%add = fadd fast <4 x float> %mul, %i13
|
|
%arrayidx13 = getelementptr inbounds float, ptr %pSamples.0150, i32 2
|
|
%i17 = load <4 x float>, ptr %arrayidx13, align 4
|
|
%mul16 = fmul fast <4 x float> %i17, %.pre166
|
|
%add17 = fadd fast <4 x float> %add, %mul16
|
|
%arrayidx18 = getelementptr inbounds float, ptr %pSamples.0150, i32 3
|
|
%i19 = load <4 x float>, ptr %arrayidx18, align 4
|
|
%mul21 = fmul fast <4 x float> %i19, %.pre168
|
|
%add22 = fadd fast <4 x float> %add17, %mul21
|
|
store <4 x float> %add22, ptr %pOutput.0149, align 4
|
|
%add.ptr23 = getelementptr inbounds float, ptr %pOutput.0149, i32 4
|
|
%add.ptr24 = getelementptr inbounds float, ptr %pSamples.0150, i32 4
|
|
%dec = add nsw i32 %blkCnt.0147, -1
|
|
%cmp9 = icmp eq i32 %dec, 0
|
|
br i1 %cmp9, label %while.end.loopexit, label %while.body
|
|
|
|
while.end.loopexit: ; preds = %while.body
|
|
%scevgep157 = getelementptr float, ptr %pSrc, i32 %i7
|
|
%scevgep159 = getelementptr float, ptr %i, i32 %i7
|
|
br label %while.end
|
|
|
|
; Tail: the same computation once more, with both stores masked by
; vctp32(%blockSize & 3). The state pointer field is reloaded afterwards.
while.end: ; preds = %while.end.loopexit, %if.then
|
|
%pTempSrc.0.lcssa = phi ptr [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ]
|
|
%pOutput.0.lcssa = phi ptr [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ]
|
|
%pSamples.0.lcssa = phi ptr [ %scevgep159, %while.end.loopexit ], [ %i, %if.then ]
|
|
%pStateCur.0.lcssa = phi ptr [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ]
|
|
%and = and i32 %blockSize, 3
|
|
%i21 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and)
|
|
%i23 = load <4 x float>, ptr %pTempSrc.0.lcssa, align 4
|
|
tail call void @llvm.masked.store.v4f32.p0(<4 x float> %i23, ptr %pStateCur.0.lcssa, i32 4, <4 x i1> %i21)
|
|
%i26 = load <4 x float>, ptr %pSamples.0.lcssa, align 4
|
|
%i27 = fmul fast <4 x float> %i26, %.pre162
|
|
%arrayidx29 = getelementptr inbounds float, ptr %pSamples.0.lcssa, i32 1
|
|
%i29 = load <4 x float>, ptr %arrayidx29, align 4
|
|
%mul32 = fmul fast <4 x float> %i29, %.pre164
|
|
%add33 = fadd fast <4 x float> %mul32, %i27
|
|
%arrayidx34 = getelementptr inbounds float, ptr %pSamples.0.lcssa, i32 2
|
|
%i31 = load <4 x float>, ptr %arrayidx34, align 4
|
|
%mul37 = fmul fast <4 x float> %i31, %.pre166
|
|
%add38 = fadd fast <4 x float> %add33, %mul37
|
|
%arrayidx39 = getelementptr inbounds float, ptr %pSamples.0.lcssa, i32 3
|
|
%i33 = load <4 x float>, ptr %arrayidx39, align 4
|
|
%mul42 = fmul fast <4 x float> %i33, %.pre168
|
|
%add43 = fadd fast <4 x float> %add38, %mul42
|
|
tail call void @llvm.masked.store.v4f32.p0(<4 x float> %add43, ptr %pOutput.0.lcssa, i32 4, <4 x i1> %i21)
|
|
%.pre = load ptr, ptr %pState1, align 4
|
|
br label %if.end
|
|
|
|
; State update: copy floats from state + %blockSize down to the start of the
; state buffer, four at a time for numTaps >> 2 iterations.
if.end: ; preds = %while.end, %entry
|
|
%i35 = phi ptr [ %.pre, %while.end ], [ %i, %entry ]
|
|
%arrayidx45 = getelementptr inbounds float, ptr %i35, i32 %blockSize
|
|
%shr47 = lshr i32 %conv, 2
|
|
%cmp49141 = icmp eq i32 %shr47, 0
|
|
br i1 %cmp49141, label %while.end55, label %while.body51.preheader
|
|
|
|
while.body51.preheader: ; preds = %if.end
|
|
%i36 = and i32 %conv, 65532
|
|
%i37 = add i32 %i36, %blockSize
|
|
%scevgep = getelementptr float, ptr %i35, i32 %i37
|
|
br label %while.body51
|
|
|
|
while.body51: ; preds = %while.body51, %while.body51.preheader
|
|
%pTempSrc.1144 = phi ptr [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ]
|
|
%pTempDest.0143 = phi ptr [ %add.ptr53, %while.body51 ], [ %i35, %while.body51.preheader ]
|
|
%blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ]
|
|
%i39 = load <4 x float>, ptr %pTempSrc.1144, align 4
|
|
store <4 x float> %i39, ptr %pTempDest.0143, align 4
|
|
%add.ptr52 = getelementptr inbounds float, ptr %pTempSrc.1144, i32 4
|
|
%add.ptr53 = getelementptr inbounds float, ptr %pTempDest.0143, i32 4
|
|
%dec54 = add nsw i32 %blkCnt.1142, -1
|
|
%cmp49 = icmp eq i32 %dec54, 0
|
|
br i1 %cmp49, label %while.end55.loopexit, label %while.body51
|
|
|
|
while.end55.loopexit: ; preds = %while.body51
|
|
%scevgep156 = getelementptr float, ptr %i35, i32 %i36
|
|
br label %while.end55
|
|
|
|
; Remainder of the state copy: when numTaps & 3 is non-zero, one more vector is
; stored under a vctp32(numTaps & 3) mask.
while.end55: ; preds = %while.end55.loopexit, %if.end
|
|
%pTempDest.0.lcssa = phi ptr [ %i35, %if.end ], [ %scevgep156, %while.end55.loopexit ]
|
|
%pTempSrc.1.lcssa = phi ptr [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ]
|
|
%and56 = and i32 %conv, 3
|
|
%cmp57 = icmp eq i32 %and56, 0
|
|
br i1 %cmp57, label %if.end61, label %if.then59
|
|
|
|
if.then59: ; preds = %while.end55
|
|
%i41 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and56)
|
|
%i43 = load <4 x float>, ptr %pTempSrc.1.lcssa, align 4
|
|
tail call void @llvm.masked.store.v4f32.p0(<4 x float> %i43, ptr %pTempDest.0.lcssa, i32 4, <4 x i1> %i41)
|
|
br label %if.end61
|
|
|
|
if.end61: ; preds = %if.then59, %while.end55
|
|
ret void
|
|
}
|
|
|
|
|
|
; FIR test with an eight-tap unrolled kernel. Nothing is computed unless
; %blockSize >u 7. Each %while.body iteration handles four samples using the
; first eight coefficients; %for.body (entered only when numTaps >u 15) consumes
; further coefficients eight at a time; %while.body76 handles the
; (numTaps - 8) & 7 leftover taps one at a time.
; The assertions that follow are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) {
|
|
; CHECK-LABEL: fir:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r3, #8
|
|
; CHECK-NEXT: blo.w .LBB16_13
|
|
; CHECK-NEXT: @ %bb.1: @ %if.then
|
|
; CHECK-NEXT: lsrs.w r12, r3, #2
|
|
; CHECK-NEXT: it eq
|
|
; CHECK-NEXT: bxeq lr
|
|
; CHECK-NEXT: .LBB16_2: @ %while.body.lr.ph
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: .pad #4
|
|
; CHECK-NEXT: sub sp, #4
|
|
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
|
|
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
|
|
; CHECK-NEXT: .pad #32
|
|
; CHECK-NEXT: sub sp, #32
|
|
; CHECK-NEXT: ldrh r6, [r0]
|
|
; CHECK-NEXT: movs r5, #1
|
|
; CHECK-NEXT: ldrd r4, r10, [r0, #4]
|
|
; CHECK-NEXT: sub.w r0, r6, #8
|
|
; CHECK-NEXT: add.w r3, r0, r0, lsr #29
|
|
; CHECK-NEXT: and r0, r0, #7
|
|
; CHECK-NEXT: asrs r7, r3, #3
|
|
; CHECK-NEXT: cmp r7, #1
|
|
; CHECK-NEXT: it gt
|
|
; CHECK-NEXT: asrgt r5, r3, #3
|
|
; CHECK-NEXT: add.w r3, r4, r6, lsl #2
|
|
; CHECK-NEXT: sub.w r9, r3, #4
|
|
; CHECK-NEXT: rsbs r3, r6, #0
|
|
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
|
|
; CHECK-NEXT: add.w r3, r10, #32
|
|
; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
|
|
; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill
|
|
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
|
|
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
|
|
; CHECK-NEXT: b .LBB16_6
|
|
; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
|
|
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
|
|
; CHECK-NEXT: add.w r4, r4, r0, lsl #2
|
|
; CHECK-NEXT: b .LBB16_5
|
|
; CHECK-NEXT: .LBB16_4: @ %for.end
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
|
|
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
|
|
; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
|
|
; CHECK-NEXT: wls lr, r0, .LBB16_5
|
|
; CHECK-NEXT: b .LBB16_10
|
|
; CHECK-NEXT: .LBB16_5: @ %while.end
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
|
|
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
|
|
; CHECK-NEXT: subs.w r12, r12, #1
|
|
; CHECK-NEXT: vstrb.8 q0, [r2], #16
|
|
; CHECK-NEXT: add.w r0, r4, r0, lsl #2
|
|
; CHECK-NEXT: add.w r4, r0, #16
|
|
; CHECK-NEXT: beq .LBB16_12
|
|
; CHECK-NEXT: .LBB16_6: @ %while.body
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
; CHECK-NEXT: @ Child Loop BB16_8 Depth 2
|
|
; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
|
|
; CHECK-NEXT: add.w lr, r10, #8
|
|
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
|
|
; CHECK-NEXT: ldrd r3, r7, [r10]
|
|
; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr}
|
|
; CHECK-NEXT: ldrd r11, r8, [r10, #24]
|
|
; CHECK-NEXT: vstrb.8 q0, [r9], #16
|
|
; CHECK-NEXT: vldrw.u32 q0, [r4], #32
|
|
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
|
|
; CHECK-NEXT: str.w r9, [sp, #24] @ 4-byte Spill
|
|
; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
|
|
; CHECK-NEXT: vmul.f32 q0, q0, r3
|
|
; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
|
|
; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
|
|
; CHECK-NEXT: vfma.f32 q0, q1, r7
|
|
; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
|
|
; CHECK-NEXT: vfma.f32 q0, q6, r0
|
|
; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
|
|
; CHECK-NEXT: vfma.f32 q0, q4, r5
|
|
; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
|
|
; CHECK-NEXT: vfma.f32 q0, q5, r6
|
|
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
|
|
; CHECK-NEXT: vfma.f32 q0, q2, lr
|
|
; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
|
|
; CHECK-NEXT: vfma.f32 q0, q3, r11
|
|
; CHECK-NEXT: cmp r0, #16
|
|
; CHECK-NEXT: vfma.f32 q0, q1, r8
|
|
; CHECK-NEXT: blo .LBB16_9
|
|
; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
|
|
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
|
|
; CHECK-NEXT: dls lr, r0
|
|
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
|
|
; CHECK-NEXT: .LBB16_8: @ %for.body
|
|
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11}
|
|
; CHECK-NEXT: vldrw.u32 q1, [r4], #32
|
|
; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
|
|
; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
|
|
; CHECK-NEXT: vfma.f32 q0, q1, r0
|
|
; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
|
|
; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
|
|
; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
|
|
; CHECK-NEXT: vfma.f32 q0, q1, r3
|
|
; CHECK-NEXT: ldrd r9, r1, [r7, #24]
|
|
; CHECK-NEXT: vfma.f32 q0, q6, r5
|
|
; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
|
|
; CHECK-NEXT: vfma.f32 q0, q4, r6
|
|
; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
|
|
; CHECK-NEXT: vfma.f32 q0, q5, r8
|
|
; CHECK-NEXT: adds r7, #32
|
|
; CHECK-NEXT: vfma.f32 q0, q2, r11
|
|
; CHECK-NEXT: vfma.f32 q0, q3, r9
|
|
; CHECK-NEXT: vfma.f32 q0, q1, r1
|
|
; CHECK-NEXT: le lr, .LBB16_8
|
|
; CHECK-NEXT: b .LBB16_4
|
|
; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
|
|
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
|
|
; CHECK-NEXT: b .LBB16_4
|
|
; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
|
|
; CHECK-NEXT: mov r3, r4
|
|
; CHECK-NEXT: .LBB16_11: @ %while.body76
|
|
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: ldr r0, [r7], #4
|
|
; CHECK-NEXT: vldrw.u32 q1, [r3], #4
|
|
; CHECK-NEXT: vfma.f32 q0, q1, r0
|
|
; CHECK-NEXT: le lr, .LBB16_11
|
|
; CHECK-NEXT: b .LBB16_3
|
|
; CHECK-NEXT: .LBB16_12:
|
|
; CHECK-NEXT: add sp, #32
|
|
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
|
|
; CHECK-NEXT: add sp, #4
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: .LBB16_13: @ %if.end
|
|
; CHECK-NEXT: bx lr
|
|
; Load the three instance fields and return early unless %blockSize >u 7.
entry:
|
|
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
|
|
%i = load ptr, ptr %pState1, align 4
|
|
%pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 2
|
|
%i1 = load ptr, ptr %pCoeffs2, align 4
|
|
%numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 0
|
|
%i2 = load i16, ptr %numTaps3, align 4
|
|
%conv = zext i16 %i2 to i32
|
|
%cmp = icmp ugt i32 %blockSize, 7
|
|
br i1 %cmp, label %if.then, label %if.end
|
|
|
|
if.then: ; preds = %entry
|
|
%shr = lshr i32 %blockSize, 2
|
|
%cmp5217 = icmp eq i32 %shr, 0
|
|
br i1 %cmp5217, label %if.end, label %while.body.lr.ph
|
|
|
|
; Loop-invariant setup: addresses of coefficients 1..8, the inner trip count
; %smax = max((numTaps - 8) / 8, 1), the leftover tap count %and, and the
; negated tap count %idx.neg used to rewind the sample pointer.
while.body.lr.ph: ; preds = %if.then
|
|
%sub = add nsw i32 %conv, -1
|
|
%arrayidx = getelementptr inbounds float, ptr %i, i32 %sub
|
|
%incdec.ptr = getelementptr inbounds float, ptr %i1, i32 1
|
|
%incdec.ptr7 = getelementptr inbounds float, ptr %i1, i32 2
|
|
%incdec.ptr8 = getelementptr inbounds float, ptr %i1, i32 3
|
|
%incdec.ptr9 = getelementptr inbounds float, ptr %i1, i32 4
|
|
%incdec.ptr10 = getelementptr inbounds float, ptr %i1, i32 5
|
|
%incdec.ptr11 = getelementptr inbounds float, ptr %i1, i32 6
|
|
%incdec.ptr12 = getelementptr inbounds float, ptr %i1, i32 7
|
|
%sub37 = add nsw i32 %conv, -8
|
|
%div = sdiv i32 %sub37, 8
|
|
%pCoeffsCur.0199 = getelementptr inbounds float, ptr %i1, i32 8
|
|
%cmp38201 = icmp ugt i16 %i2, 15
|
|
%and = and i32 %sub37, 7
|
|
%cmp74210 = icmp eq i32 %and, 0
|
|
%idx.neg = sub nsw i32 0, %conv
|
|
%i3 = icmp sgt i32 %div, 1
|
|
%smax = select i1 %i3, i32 %div, i32 1
|
|
br label %while.body
|
|
|
|
; Outer loop, %blockSize >> 2 iterations: copy four source floats into the
; state cursor, then accumulate the first eight taps (one fmul followed by
; seven fma calls) over vector loads at float offsets 0..7.
while.body: ; preds = %while.end, %while.body.lr.ph
|
|
%blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ]
|
|
%pStateCur.0221 = phi ptr [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ]
|
|
%pSamples.0220 = phi ptr [ %i, %while.body.lr.ph ], [ %add.ptr83, %while.end ]
|
|
%pTempSrc.0219 = phi ptr [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ]
|
|
%pOutput.0218 = phi ptr [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ]
|
|
%i4 = load float, ptr %i1, align 4
|
|
%i5 = load float, ptr %incdec.ptr, align 4
|
|
%i6 = load float, ptr %incdec.ptr7, align 4
|
|
%i7 = load float, ptr %incdec.ptr8, align 4
|
|
%i8 = load float, ptr %incdec.ptr9, align 4
|
|
%i9 = load float, ptr %incdec.ptr10, align 4
|
|
%i10 = load float, ptr %incdec.ptr11, align 4
|
|
%i11 = load float, ptr %incdec.ptr12, align 4
|
|
%i13 = load <4 x float>, ptr %pTempSrc.0219, align 4
|
|
store <4 x float> %i13, ptr %pStateCur.0221, align 4
|
|
%add.ptr = getelementptr inbounds float, ptr %pStateCur.0221, i32 4
|
|
%add.ptr14 = getelementptr inbounds float, ptr %pTempSrc.0219, i32 4
|
|
%i16 = load <4 x float>, ptr %pSamples.0220, align 4
|
|
%.splatinsert = insertelement <4 x float> undef, float %i4, i32 0
|
|
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i17 = fmul fast <4 x float> %i16, %.splat
|
|
%arrayidx15 = getelementptr inbounds float, ptr %pSamples.0220, i32 1
|
|
%i19 = load <4 x float>, ptr %arrayidx15, align 4
|
|
%.splatinsert16 = insertelement <4 x float> undef, float %i5, i32 0
|
|
%.splat17 = shufflevector <4 x float> %.splatinsert16, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i20 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i19, <4 x float> %.splat17, <4 x float> %i17)
|
|
%arrayidx18 = getelementptr inbounds float, ptr %pSamples.0220, i32 2
|
|
%i22 = load <4 x float>, ptr %arrayidx18, align 4
|
|
%.splatinsert19 = insertelement <4 x float> undef, float %i6, i32 0
|
|
%.splat20 = shufflevector <4 x float> %.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i23 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i22, <4 x float> %.splat20, <4 x float> %i20)
|
|
%arrayidx21 = getelementptr inbounds float, ptr %pSamples.0220, i32 3
|
|
%i25 = load <4 x float>, ptr %arrayidx21, align 4
|
|
%.splatinsert22 = insertelement <4 x float> undef, float %i7, i32 0
|
|
%.splat23 = shufflevector <4 x float> %.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i26 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i25, <4 x float> %.splat23, <4 x float> %i23)
|
|
%arrayidx24 = getelementptr inbounds float, ptr %pSamples.0220, i32 4
|
|
%i28 = load <4 x float>, ptr %arrayidx24, align 4
|
|
%.splatinsert25 = insertelement <4 x float> undef, float %i8, i32 0
|
|
%.splat26 = shufflevector <4 x float> %.splatinsert25, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i29 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i28, <4 x float> %.splat26, <4 x float> %i26)
|
|
%arrayidx27 = getelementptr inbounds float, ptr %pSamples.0220, i32 5
|
|
%i31 = load <4 x float>, ptr %arrayidx27, align 4
|
|
%.splatinsert28 = insertelement <4 x float> undef, float %i9, i32 0
|
|
%.splat29 = shufflevector <4 x float> %.splatinsert28, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i31, <4 x float> %.splat29, <4 x float> %i29)
|
|
%arrayidx30 = getelementptr inbounds float, ptr %pSamples.0220, i32 6
|
|
%i34 = load <4 x float>, ptr %arrayidx30, align 4
|
|
%.splatinsert31 = insertelement <4 x float> undef, float %i10, i32 0
|
|
%.splat32 = shufflevector <4 x float> %.splatinsert31, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i35 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i34, <4 x float> %.splat32, <4 x float> %i32)
|
|
%arrayidx33 = getelementptr inbounds float, ptr %pSamples.0220, i32 7
|
|
%i37 = load <4 x float>, ptr %arrayidx33, align 4
|
|
%.splatinsert34 = insertelement <4 x float> undef, float %i11, i32 0
|
|
%.splat35 = shufflevector <4 x float> %.splatinsert34, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i38 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i37, <4 x float> %.splat35, <4 x float> %i35)
|
|
%pSamples.1200 = getelementptr inbounds float, ptr %pSamples.0220, i32 8
|
|
br i1 %cmp38201, label %for.body, label %for.end
|
|
|
|
; Inner loop, %smax iterations: eight more coefficients and eight more sample
; vectors are folded into the accumulator per iteration.
for.body: ; preds = %for.body, %while.body
|
|
%pSamples.1207 = phi ptr [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ]
|
|
%pCoeffsCur.0206 = phi ptr [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ]
|
|
%.pn205 = phi ptr [ %pCoeffsCur.0206, %for.body ], [ %i1, %while.body ]
|
|
%i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ]
|
|
%vecAcc0.0203 = phi <4 x float> [ %i70, %for.body ], [ %i38, %while.body ]
|
|
%pSamples.0.pn202 = phi ptr [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ]
|
|
%incdec.ptr40 = getelementptr inbounds float, ptr %.pn205, i32 9
|
|
%i39 = load float, ptr %pCoeffsCur.0206, align 4
|
|
%incdec.ptr41 = getelementptr inbounds float, ptr %.pn205, i32 10
|
|
%i40 = load float, ptr %incdec.ptr40, align 4
|
|
%incdec.ptr42 = getelementptr inbounds float, ptr %.pn205, i32 11
|
|
%i41 = load float, ptr %incdec.ptr41, align 4
|
|
%incdec.ptr43 = getelementptr inbounds float, ptr %.pn205, i32 12
|
|
%i42 = load float, ptr %incdec.ptr42, align 4
|
|
%incdec.ptr44 = getelementptr inbounds float, ptr %.pn205, i32 13
|
|
%i43 = load float, ptr %incdec.ptr43, align 4
|
|
%incdec.ptr45 = getelementptr inbounds float, ptr %.pn205, i32 14
|
|
%i44 = load float, ptr %incdec.ptr44, align 4
|
|
%incdec.ptr46 = getelementptr inbounds float, ptr %.pn205, i32 15
|
|
%i45 = load float, ptr %incdec.ptr45, align 4
|
|
%i46 = load float, ptr %incdec.ptr46, align 4
|
|
%i48 = load <4 x float>, ptr %pSamples.1207, align 4
|
|
%.splatinsert48 = insertelement <4 x float> undef, float %i39, i32 0
|
|
%.splat49 = shufflevector <4 x float> %.splatinsert48, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i49 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i48, <4 x float> %.splat49, <4 x float> %vecAcc0.0203)
|
|
%arrayidx50 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 9
|
|
%i51 = load <4 x float>, ptr %arrayidx50, align 4
|
|
%.splatinsert51 = insertelement <4 x float> undef, float %i40, i32 0
|
|
%.splat52 = shufflevector <4 x float> %.splatinsert51, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i52 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i51, <4 x float> %.splat52, <4 x float> %i49)
|
|
%arrayidx53 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 10
|
|
%i54 = load <4 x float>, ptr %arrayidx53, align 4
|
|
%.splatinsert54 = insertelement <4 x float> undef, float %i41, i32 0
|
|
%.splat55 = shufflevector <4 x float> %.splatinsert54, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i55 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i54, <4 x float> %.splat55, <4 x float> %i52)
|
|
%arrayidx56 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 11
|
|
%i57 = load <4 x float>, ptr %arrayidx56, align 4
|
|
%.splatinsert57 = insertelement <4 x float> undef, float %i42, i32 0
|
|
%.splat58 = shufflevector <4 x float> %.splatinsert57, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i58 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i57, <4 x float> %.splat58, <4 x float> %i55)
|
|
%arrayidx59 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 12
|
|
%i60 = load <4 x float>, ptr %arrayidx59, align 4
|
|
%.splatinsert60 = insertelement <4 x float> undef, float %i43, i32 0
|
|
%.splat61 = shufflevector <4 x float> %.splatinsert60, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i61 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i60, <4 x float> %.splat61, <4 x float> %i58)
|
|
%arrayidx62 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 13
|
|
%i63 = load <4 x float>, ptr %arrayidx62, align 4
|
|
%.splatinsert63 = insertelement <4 x float> undef, float %i44, i32 0
|
|
%.splat64 = shufflevector <4 x float> %.splatinsert63, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i64 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i63, <4 x float> %.splat64, <4 x float> %i61)
|
|
%arrayidx65 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 14
|
|
%i66 = load <4 x float>, ptr %arrayidx65, align 4
|
|
%.splatinsert66 = insertelement <4 x float> undef, float %i45, i32 0
|
|
%.splat67 = shufflevector <4 x float> %.splatinsert66, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i67 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i66, <4 x float> %.splat67, <4 x float> %i64)
|
|
%arrayidx68 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 15
|
|
%i69 = load <4 x float>, ptr %arrayidx68, align 4
|
|
%.splatinsert69 = insertelement <4 x float> undef, float %i46, i32 0
|
|
%.splat70 = shufflevector <4 x float> %.splatinsert69, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i70 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i69, <4 x float> %.splat70, <4 x float> %i67)
|
|
%inc = add nuw nsw i32 %i.0204, 1
|
|
%pCoeffsCur.0 = getelementptr inbounds float, ptr %pCoeffsCur.0206, i32 8
|
|
%pSamples.1 = getelementptr inbounds float, ptr %pSamples.1207, i32 8
|
|
%exitcond = icmp eq i32 %inc, %smax
|
|
br i1 %exitcond, label %for.end, label %for.body
|
|
|
|
for.end: ; preds = %for.body, %while.body
|
|
%vecAcc0.0.lcssa = phi <4 x float> [ %i38, %while.body ], [ %i70, %for.body ]
|
|
%pCoeffsCur.0.lcssa = phi ptr [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ]
|
|
%pSamples.1.lcssa = phi ptr [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ]
|
|
br i1 %cmp74210, label %while.end, label %while.body76
|
|
|
|
; Leftover taps: one coefficient and one sample vector per iteration, counting
; %numCnt.0212 down from %and and looping while it is greater than 1.
while.body76: ; preds = %while.body76, %for.end
|
|
%pCoeffsCur.1214 = phi ptr [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ]
|
|
%vecAcc0.1213 = phi <4 x float> [ %i74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ]
|
|
%numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ]
|
|
%pSamples.2211 = phi ptr [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ]
|
|
%incdec.ptr77 = getelementptr inbounds float, ptr %pCoeffsCur.1214, i32 1
|
|
%i71 = load float, ptr %pCoeffsCur.1214, align 4
|
|
%i73 = load <4 x float>, ptr %pSamples.2211, align 4
|
|
%.splatinsert78 = insertelement <4 x float> undef, float %i71, i32 0
|
|
%.splat79 = shufflevector <4 x float> %.splatinsert78, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i74 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i73, <4 x float> %.splat79, <4 x float> %vecAcc0.1213)
|
|
%incdec.ptr80 = getelementptr inbounds float, ptr %pSamples.2211, i32 1
|
|
%dec = add nsw i32 %numCnt.0212, -1
|
|
%cmp74 = icmp sgt i32 %numCnt.0212, 1
|
|
br i1 %cmp74, label %while.body76, label %while.end.loopexit
|
|
|
|
while.end.loopexit: ; preds = %while.body76
|
|
%scevgep = getelementptr float, ptr %pSamples.1.lcssa, i32 %and
|
|
br label %while.end
|
|
|
|
; Store the accumulator to the output, advance the output by four floats, and
; move the sample pointer to (current position + 4 - numTaps) for the next
; outer iteration.
while.end: ; preds = %while.end.loopexit, %for.end
|
|
%pSamples.2.lcssa = phi ptr [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ]
|
|
%vecAcc0.1.lcssa = phi <4 x float> [ %vecAcc0.0.lcssa, %for.end ], [ %i74, %while.end.loopexit ]
|
|
store <4 x float> %vecAcc0.1.lcssa, ptr %pOutput.0218, align 4
|
|
%add.ptr81 = getelementptr inbounds float, ptr %pOutput.0218, i32 4
|
|
%add.ptr82 = getelementptr inbounds float, ptr %pSamples.2.lcssa, i32 4
|
|
%add.ptr83 = getelementptr inbounds float, ptr %add.ptr82, i32 %idx.neg
|
|
%dec84 = add nsw i32 %blkCnt.0222, -1
|
|
%cmp5 = icmp eq i32 %dec84, 0
|
|
br i1 %cmp5, label %if.end, label %while.body
|
|
|
|
if.end: ; preds = %while.end, %if.then, %entry
|
|
ret void
|
|
}
|
|
|
|
; Instance layout read by @arm_biquad_cascade_stereo_df2T_f32: field 0 is an i8
; that the function zero-extends to i32; fields 1 and 2 are pointers it loads.
%struct.arm_biquad_cascade_stereo_df2T_instance_f32 = type { i8, ptr, ptr }
|
|
; Codegen test for a two-level loop nest whose inner loop combines MVE gather
; loads with vector FMAs on a 6-float stack scratch buffer (%i).
;   %arg  - instance struct; field 0 (i8) is the outer trip count, fields 1 and
;           2 are the pointers loaded below as %i5 and %i7.
;   %arg1 - gather base for the first outer iteration.
;   %arg2 - destination of the two scalar stores per inner iteration; it is also
;           the gather base for every later outer iteration.
;   %arg3 - inner-loop trip count; a value of zero takes the %bb21 path instead.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(ptr nocapture readonly %arg, ptr %arg1, ptr %arg2, i32 %arg3) {
|
|
; CHECK-LABEL: arm_biquad_cascade_stereo_df2T_f32:
|
|
; CHECK: @ %bb.0: @ %bb
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
|
|
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
|
|
; CHECK-NEXT: vpush {d8, d9, d10, d11}
|
|
; CHECK-NEXT: .pad #24
|
|
; CHECK-NEXT: sub sp, #24
|
|
; CHECK-NEXT: mov r8, r3
|
|
; CHECK-NEXT: ldrb.w r12, [r0]
|
|
; CHECK-NEXT: ldrd r0, r3, [r0, #4]
|
|
; CHECK-NEXT: movs r4, #0
|
|
; CHECK-NEXT: cmp.w r8, #0
|
|
; CHECK-NEXT: strd r4, r4, [sp, #16]
|
|
; CHECK-NEXT: beq .LBB17_5
|
|
; CHECK-NEXT: @ %bb.1:
|
|
; CHECK-NEXT: movs r5, #2
|
|
; CHECK-NEXT: viwdup.u32 q0, r4, r5, #1
|
|
; CHECK-NEXT: mov r4, sp
|
|
; CHECK-NEXT: .LBB17_2: @ %bb29
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
; CHECK-NEXT: @ Child Loop BB17_3 Depth 2
|
|
; CHECK-NEXT: ldrd r5, r7, [r3]
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
|
; CHECK-NEXT: ldr r6, [r3, #12]
|
|
; CHECK-NEXT: vldr s8, [r3, #8]
|
|
; CHECK-NEXT: vstrw.32 q1, [r4]
|
|
; CHECK-NEXT: vdup.32 q1, r7
|
|
; CHECK-NEXT: vldr s12, [r3, #16]
|
|
; CHECK-NEXT: vmov.f32 s6, s8
|
|
; CHECK-NEXT: dls lr, r8
|
|
; CHECK-NEXT: vmov.f32 s7, s8
|
|
; CHECK-NEXT: vdup.32 q2, r6
|
|
; CHECK-NEXT: vmov.f32 s10, s12
|
|
; CHECK-NEXT: mov r7, r2
|
|
; CHECK-NEXT: vmov.f32 s11, s12
|
|
; CHECK-NEXT: .LBB17_3: @ %bb55
|
|
; CHECK-NEXT: @ Parent Loop BB17_2 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: vldrw.u32 q4, [r1, q0, uxtw #2]
|
|
; CHECK-NEXT: vldrw.u32 q5, [r4, q0, uxtw #2]
|
|
; CHECK-NEXT: vldrw.u32 q3, [sp, #8]
|
|
; CHECK-NEXT: adds r1, #8
|
|
; CHECK-NEXT: vfma.f32 q5, q4, r5
|
|
; CHECK-NEXT: vfma.f32 q3, q5, q2
|
|
; CHECK-NEXT: vstmia r7!, {s20, s21}
|
|
; CHECK-NEXT: vfma.f32 q3, q4, q1
|
|
; CHECK-NEXT: vstrw.32 q3, [r4]
|
|
; CHECK-NEXT: le lr, .LBB17_3
|
|
; CHECK-NEXT: @ %bb.4: @ %bb75
|
|
; CHECK-NEXT: @ in Loop: Header=BB17_2 Depth=1
|
|
; CHECK-NEXT: subs.w r12, r12, #1
|
|
; CHECK-NEXT: add.w r3, r3, #20
|
|
; CHECK-NEXT: vstrb.8 q3, [r0], #16
|
|
; CHECK-NEXT: mov r1, r2
|
|
; CHECK-NEXT: bne .LBB17_2
|
|
; CHECK-NEXT: b .LBB17_7
|
|
; CHECK-NEXT: .LBB17_5: @ %bb21.preheader
|
|
; CHECK-NEXT: dls lr, r12
|
|
; CHECK-NEXT: mov r1, sp
|
|
; CHECK-NEXT: .LBB17_6: @ %bb21
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT: vstrw.32 q0, [r1]
|
|
; CHECK-NEXT: le lr, .LBB17_6
|
|
; CHECK-NEXT: .LBB17_7: @ %bb80
|
|
; CHECK-NEXT: add sp, #24
|
|
; CHECK-NEXT: vpop {d8, d9, d10, d11}
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
|
|
bb:
|
|
; %i is the 6-float (24-byte) stack scratch buffer shared by both loops.
%i = alloca [6 x float], align 4
|
|
%i4 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, ptr %arg, i32 0, i32 1
|
|
%i5 = load ptr, ptr %i4, align 4
|
|
%i6 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, ptr %arg, i32 0, i32 2
|
|
%i7 = load ptr, ptr %i6, align 4
|
|
%i8 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, ptr %arg, i32 0, i32 0
|
|
%i9 = load i8, ptr %i8, align 4
|
|
; %i10 is the outer trip count for both %bb21 and %bb29.
%i10 = zext i8 %i9 to i32
|
|
call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %i)
|
|
; Offset vector used by both gathers in %bb55; the assertions above expect it
; to be materialised as one viwdup.u32 with operands 0, 2 and #1.
; NOTE(review): presumably start 0, wrap limit 2, step 1 - confirm against the
; Arm VIWDUP description.
%i12 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 0, i32 2, i32 1)
|
|
%i13 = extractvalue { <4 x i32>, i32 } %i12, 0
|
|
; Zero scratch elements 4 and 5; the <4 x float> stores below only cover
; elements 0..3.
%i14 = getelementptr inbounds [6 x float], ptr %i, i32 0, i32 4
|
|
store float 0.000000e+00, ptr %i14, align 4
|
|
%i15 = getelementptr inbounds [6 x float], ptr %i, i32 0, i32 5
|
|
store float 0.000000e+00, ptr %i15, align 4
|
|
%i17 = icmp eq i32 %arg3, 0
|
|
; %i19 addresses scratch element 2; %bb55 reloads a <4 x float> from it.
%i19 = getelementptr inbounds [6 x float], ptr %i, i32 0, i32 2
|
|
br i1 %i17, label %bb21, label %bb29
|
|
|
|
|
|
; Path taken when %arg3 == 0: each of the %i10 iterations copies one
; <4 x float> from the field-1 buffer into the scratch buffer.
bb21: ; preds = %bb21, %bb
|
|
%i22 = phi i32 [ %i27, %bb21 ], [ %i10, %bb ]
|
|
%i23 = phi ptr [ %i26, %bb21 ], [ %i5, %bb ]
|
|
%i25 = load <4 x float>, ptr %i23, align 8
|
|
store <4 x float> %i25, ptr %i, align 4
|
|
%i26 = getelementptr inbounds float, ptr %i23, i32 4
|
|
%i27 = add i32 %i22, -1
|
|
%i28 = icmp eq i32 %i27, 0
|
|
br i1 %i28, label %bb80, label %bb21
|
|
|
|
|
|
; Outer-loop header: reads five consecutive floats from %i31 (%i35, %i37, %i39,
; %i41, %i42), builds the loop-invariant vectors %i46, %i50 and %i54 from them,
; and copies one <4 x float> from %i32 into the scratch buffer.
bb29: ; preds = %bb75, %bb
|
|
%i30 = phi i32 [ %i78, %bb75 ], [ %i10, %bb ]
|
|
%i31 = phi ptr [ %i76, %bb75 ], [ %i7, %bb ]
|
|
%i32 = phi ptr [ %i77, %bb75 ], [ %i5, %bb ]
|
|
; Gather base: %arg1 on the first outer iteration, %arg2 afterwards.
%i33 = phi ptr [ %arg2, %bb75 ], [ %arg1, %bb ]
|
|
%i34 = getelementptr inbounds float, ptr %i31, i32 1
|
|
%i35 = load float, ptr %i31, align 4
|
|
%i36 = getelementptr inbounds float, ptr %i31, i32 2
|
|
%i37 = load float, ptr %i34, align 4
|
|
%i38 = getelementptr inbounds float, ptr %i31, i32 3
|
|
%i39 = load float, ptr %i36, align 4
|
|
%i40 = getelementptr inbounds float, ptr %i31, i32 4
|
|
%i41 = load float, ptr %i38, align 4
|
|
%i42 = load float, ptr %i40, align 4
|
|
; %i46 = <%i41, %i41, %i42, %i42>
%i43 = insertelement <4 x float> undef, float %i41, i32 0
|
|
%i44 = shufflevector <4 x float> %i43, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
|
|
%i45 = insertelement <4 x float> %i44, float %i42, i32 2
|
|
%i46 = insertelement <4 x float> %i45, float %i42, i32 3
|
|
; %i50 = <%i37, %i37, %i39, %i39>
%i47 = insertelement <4 x float> undef, float %i37, i32 0
|
|
%i48 = shufflevector <4 x float> %i47, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
|
|
%i49 = insertelement <4 x float> %i48, float %i39, i32 2
|
|
%i50 = insertelement <4 x float> %i49, float %i39, i32 3
|
|
%i52 = load <4 x float>, ptr %i32, align 8
|
|
store <4 x float> %i52, ptr %i, align 4
|
|
; %i54 = splat of %i35
%i53 = insertelement <4 x float> undef, float %i35, i32 0
|
|
%i54 = shufflevector <4 x float> %i53, <4 x float> undef, <4 x i32> zeroinitializer
|
|
br label %bb55
|
|
|
|
|
|
; Inner loop, %arg3 iterations: gathers from the scratch buffer and from %i56,
; stores lanes 0 and 1 of the first FMA to %i57, then updates the scratch
; buffer with two further FMAs.
bb55: ; preds = %bb55, %bb29
|
|
%i56 = phi ptr [ %i33, %bb29 ], [ %i72, %bb55 ]
|
|
%i57 = phi ptr [ %arg2, %bb29 ], [ %i68, %bb55 ]
|
|
%i58 = phi i32 [ %arg3, %bb29 ], [ %i73, %bb55 ]
|
|
%i59 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr nonnull %i, <4 x i32> %i13, i32 32, i32 2, i32 1)
|
|
%i60 = bitcast <4 x i32> %i59 to <4 x float>
|
|
%i62 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %i56, <4 x i32> %i13, i32 32, i32 2, i32 1)
|
|
%i63 = bitcast <4 x i32> %i62 to <4 x float>
|
|
%i64 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i63, <4 x float> %i54, <4 x float> %i60)
|
|
%i65 = extractelement <4 x float> %i64, i32 0
|
|
%i66 = getelementptr inbounds float, ptr %i57, i32 1
|
|
store float %i65, ptr %i57, align 4
|
|
%i67 = extractelement <4 x float> %i64, i32 1
|
|
%i68 = getelementptr inbounds float, ptr %i57, i32 2
|
|
store float %i67, ptr %i66, align 4
|
|
; Reload scratch elements 2..5 (elements 4 and 5 were zeroed in %bb).
%i69 = load <4 x float>, ptr %i19, align 4
|
|
%i70 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i64, <4 x float> %i46, <4 x float> %i69)
|
|
%i71 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i63, <4 x float> %i50, <4 x float> %i70)
|
|
store <4 x float> %i71, ptr %i, align 4
|
|
%i72 = getelementptr inbounds float, ptr %i56, i32 2
|
|
%i73 = add i32 %i58, -1
|
|
%i74 = icmp eq i32 %i73, 0
|
|
br i1 %i74, label %bb75, label %bb55
|
|
|
|
|
|
; Outer-loop latch: writes the last %i71 back through %i32, then advances %i31
; by 5 floats and %i32 by 4 floats.
bb75: ; preds = %bb55
|
|
%i76 = getelementptr inbounds float, ptr %i31, i32 5
|
|
store <4 x float> %i71, ptr %i32, align 4
|
|
%i77 = getelementptr inbounds float, ptr %i32, i32 4
|
|
%i78 = add i32 %i30, -1
|
|
%i79 = icmp eq i32 %i78, 0
|
|
br i1 %i79, label %bb80, label %bb29
|
|
|
|
|
|
bb80: ; preds = %bb75, %bb21
|
|
call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %i)
|
|
ret void
|
|
}
|
|
|
|
; Codegen test: each of the %M outer iterations loads one float from %pSrc3,
; splats it, and runs an inner loop computing fma(splat, -src2, src1) on
; <4 x float> vectors. The assertions below expect the fneg to be folded so the
; inner loop contains a single vfms.f32.
;   %pSrc1, %pSrc2 - vector inputs; each advances by 4 floats per inner
;                    iteration and keeps advancing across outer iterations.
;   %pSrc3         - scalar input; advances by 1 float per outer iteration.
;   %pDst          - vector output; advances by 4 floats per inner iteration.
;   %N             - element count; the inner trip count is %N >> 2, and a
;                    zero trip count skips both loops.
;   %M             - outer-loop trip count (tested after the loop body).
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define arm_aapcs_vfpcc void @fms(ptr nocapture readonly %pSrc1, ptr nocapture readonly %pSrc2, ptr nocapture readonly %pSrc3, ptr nocapture %pDst, i32 %N, i32 %M) {
|
|
; CHECK-LABEL: fms:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: .save {r4, r5, r7, lr}
|
|
; CHECK-NEXT: push {r4, r5, r7, lr}
|
|
; CHECK-NEXT: ldr r4, [sp, #16]
|
|
; CHECK-NEXT: lsrs r5, r4, #2
|
|
; CHECK-NEXT: beq .LBB18_5
|
|
; CHECK-NEXT: @ %bb.1: @ %do.body.preheader
|
|
; CHECK-NEXT: ldr.w r12, [sp, #20]
|
|
; CHECK-NEXT: .LBB18_2: @ %do.body
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
; CHECK-NEXT: @ Child Loop BB18_3 Depth 2
|
|
; CHECK-NEXT: ldr r4, [r2]
|
|
; CHECK-NEXT: dls lr, r5
|
|
; CHECK-NEXT: vdup.32 q0, r4
|
|
; CHECK-NEXT: .LBB18_3: @ %while.body
|
|
; CHECK-NEXT: @ Parent Loop BB18_2 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
|
|
; CHECK-NEXT: vldrw.u32 q2, [r0], #16
|
|
; CHECK-NEXT: vfms.f32 q2, q1, q0
|
|
; CHECK-NEXT: vstrb.8 q2, [r3], #16
|
|
; CHECK-NEXT: le lr, .LBB18_3
|
|
; CHECK-NEXT: @ %bb.4: @ %while.end
|
|
; CHECK-NEXT: @ in Loop: Header=BB18_2 Depth=1
|
|
; CHECK-NEXT: subs.w r12, r12, #1
|
|
; CHECK-NEXT: add.w r2, r2, #4
|
|
; CHECK-NEXT: bne .LBB18_2
|
|
; CHECK-NEXT: .LBB18_5: @ %do.end
|
|
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
|
entry:
|
|
; %shr is the inner-loop trip count; zero skips straight to %do.end.
%shr = lshr i32 %N, 2
|
|
%cmp15 = icmp eq i32 %shr, 0
|
|
br i1 %cmp15, label %do.end, label %do.body
|
|
|
|
|
|
; Outer-loop header: the pointer phis carry the values left by the previous
; inner loop, so the buffers are not rewound between outer iterations.
do.body: ; preds = %while.end, %entry
|
|
%pDst.addr.0 = phi ptr [ %add.ptr2, %while.end ], [ %pDst, %entry ]
|
|
%M.addr.0 = phi i32 [ %dec3, %while.end ], [ %M, %entry ]
|
|
%pSrc3.addr.0 = phi ptr [ %incdec.ptr, %while.end ], [ %pSrc3, %entry ]
|
|
%pSrc2.addr.0 = phi ptr [ %add.ptr1, %while.end ], [ %pSrc2, %entry ]
|
|
%pSrc1.addr.0 = phi ptr [ %add.ptr, %while.end ], [ %pSrc1, %entry ]
|
|
; Load this iteration's scalar and splat it across all four lanes.
%i = load float, ptr %pSrc3.addr.0, align 4
|
|
%.splatinsert = insertelement <4 x float> undef, float %i, i32 0
|
|
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
|
|
br label %while.body
|
|
|
|
|
|
; Inner loop, %shr iterations: dst = fma(splat, -src2, src1).
while.body: ; preds = %while.body, %do.body
|
|
%pSrc1.addr.119 = phi ptr [ %pSrc1.addr.0, %do.body ], [ %add.ptr, %while.body ]
|
|
%pSrc2.addr.118 = phi ptr [ %pSrc2.addr.0, %do.body ], [ %add.ptr1, %while.body ]
|
|
%blkCnt.017 = phi i32 [ %shr, %do.body ], [ %dec, %while.body ]
|
|
%pDst.addr.116 = phi ptr [ %pDst.addr.0, %do.body ], [ %add.ptr2, %while.body ]
|
|
%i2 = load <4 x float>, ptr %pSrc1.addr.119, align 4
|
|
%i4 = load <4 x float>, ptr %pSrc2.addr.118, align 4
|
|
; The negated multiplicand is what the assertions expect to become vfms.f32.
%i5 = fneg fast <4 x float> %i4
|
|
%i6 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %.splat, <4 x float> %i5, <4 x float> %i2)
|
|
store <4 x float> %i6, ptr %pDst.addr.116, align 4
|
|
%add.ptr = getelementptr inbounds float, ptr %pSrc1.addr.119, i32 4
|
|
%add.ptr1 = getelementptr inbounds float, ptr %pSrc2.addr.118, i32 4
|
|
%add.ptr2 = getelementptr inbounds float, ptr %pDst.addr.116, i32 4
|
|
%dec = add nsw i32 %blkCnt.017, -1
|
|
%cmp = icmp eq i32 %dec, 0
|
|
br i1 %cmp, label %while.end, label %while.body
|
|
|
|
|
|
; Outer-loop latch: step %pSrc3 by one float and count %M down to zero.
while.end: ; preds = %while.body
|
|
%incdec.ptr = getelementptr inbounds float, ptr %pSrc3.addr.0, i32 1
|
|
%dec3 = add i32 %M.addr.0, -1
|
|
%cmp4 = icmp eq i32 %dec3, 0
|
|
br i1 %cmp4, label %do.end, label %do.body
|
|
|
|
|
|
do.end: ; preds = %while.end, %entry
|
|
ret void
|
|
}
|
|
|
|
|
|
; Instance layout read by @arm_biquad_cascade_df1_f32:
;   field 0 (i32) - loaded there via %numStages and used as the stage-loop
;                   trip count.
;   field 1 (ptr) - loaded via %pState1; four floats are read and written per
;                   stage.
;   field 2 (ptr) - loaded via %pCoeffs2; eight <4 x float> vectors are read
;                   per stage.
%struct.arm_biquad_casd_df1_inst_f32 = type { i32, ptr, ptr }
|
|
; Codegen test for a stage loop (%do.body) wrapped around a vectorised sample
; loop (%while.body), followed by a tail that stores the remaining
; %blockSize & 3 results as scalars.
;   %S         - instance struct; field 0 (i32) is the stage count, field 1 the
;                state pointer and field 2 the coefficient pointer.
;   %pSrc      - input of the first stage; every later stage reads from %pDst.
;   %pDst      - output buffer, rewritten from its start by every stage.
;   %blockSize - sample count; %while.body runs %blockSize >> 2 times per stage.
; The stage loop tests its counter after the body, so the body always runs at
; least once.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) {
|
|
; CHECK-LABEL: arm_biquad_cascade_df1_f32:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: .pad #4
|
|
; CHECK-NEXT: sub sp, #4
|
|
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
; CHECK-NEXT: .pad #16
|
|
; CHECK-NEXT: sub sp, #16
|
|
; CHECK-NEXT: ldrd r7, r9, [r0]
|
|
; CHECK-NEXT: and r6, r3, #3
|
|
; CHECK-NEXT: ldr r0, [r0, #8]
|
|
; CHECK-NEXT: lsrs r3, r3, #2
|
|
; CHECK-NEXT: @ implicit-def: $r12
|
|
; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill
|
|
; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
|
|
; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
|
|
; CHECK-NEXT: b .LBB19_3
|
|
; CHECK-NEXT: .LBB19_1: @ in Loop: Header=BB19_3 Depth=1
|
|
; CHECK-NEXT: mov r3, r8
|
|
; CHECK-NEXT: mov r2, r5
|
|
; CHECK-NEXT: mov r4, r11
|
|
; CHECK-NEXT: mov r8, r10
|
|
; CHECK-NEXT: .LBB19_2: @ %if.end69
|
|
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
|
|
; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
|
|
; CHECK-NEXT: adds r0, #128
|
|
; CHECK-NEXT: strd r2, r4, [r9]
|
|
; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
|
|
; CHECK-NEXT: subs r7, #1
|
|
; CHECK-NEXT: strd r3, r8, [r9, #8]
|
|
; CHECK-NEXT: add.w r9, r9, #16
|
|
; CHECK-NEXT: mov r1, r2
|
|
; CHECK-NEXT: beq.w .LBB19_13
|
|
; CHECK-NEXT: .LBB19_3: @ %do.body
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
; CHECK-NEXT: @ Child Loop BB19_5 Depth 2
|
|
; CHECK-NEXT: mov r6, r2
|
|
; CHECK-NEXT: ldrd r5, r11, [r9]
|
|
; CHECK-NEXT: ldrd r8, r10, [r9, #8]
|
|
; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
|
|
; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill
|
|
; CHECK-NEXT: wls lr, r2, .LBB19_6
|
|
; CHECK-NEXT: @ %bb.4: @ %while.body.lr.ph
|
|
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
|
|
; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
|
|
; CHECK-NEXT: mov r4, r11
|
|
; CHECK-NEXT: mov r3, r5
|
|
; CHECK-NEXT: .LBB19_5: @ %while.body
|
|
; CHECK-NEXT: @ Parent Loop BB19_3 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: ldr r5, [r1, #12]
|
|
; CHECK-NEXT: vldrw.u32 q2, [r0]
|
|
; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
|
|
; CHECK-NEXT: ldm.w r1, {r2, r7, r11}
|
|
; CHECK-NEXT: vmul.f32 q2, q2, r5
|
|
; CHECK-NEXT: vldrw.u32 q7, [r0, #32]
|
|
; CHECK-NEXT: vfma.f32 q2, q6, r11
|
|
; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
|
|
; CHECK-NEXT: vfma.f32 q2, q7, r7
|
|
; CHECK-NEXT: vldrw.u32 q5, [r0, #64]
|
|
; CHECK-NEXT: vfma.f32 q2, q4, r2
|
|
; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
|
|
; CHECK-NEXT: vfma.f32 q2, q5, r3
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
|
|
; CHECK-NEXT: vfma.f32 q2, q3, r4
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
|
|
; CHECK-NEXT: vfma.f32 q2, q1, r8
|
|
; CHECK-NEXT: adds r1, #16
|
|
; CHECK-NEXT: vfma.f32 q2, q0, r10
|
|
; CHECK-NEXT: mov r4, r11
|
|
; CHECK-NEXT: vmov r10, r8, d5
|
|
; CHECK-NEXT: vstrb.8 q2, [r6], #16
|
|
; CHECK-NEXT: mov r3, r5
|
|
; CHECK-NEXT: mov r12, r5
|
|
; CHECK-NEXT: le lr, .LBB19_5
|
|
; CHECK-NEXT: .LBB19_6: @ %while.end
|
|
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
|
|
; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
|
|
; CHECK-NEXT: cmp r3, #0
|
|
; CHECK-NEXT: beq .LBB19_1
|
|
; CHECK-NEXT: @ %bb.7: @ %if.then
|
|
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
|
|
; CHECK-NEXT: ldrd lr, r4, [r1]
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0]
|
|
; CHECK-NEXT: ldrd r2, r1, [r1, #8]
|
|
; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
|
|
; CHECK-NEXT: vldrw.u32 q7, [r0, #32]
|
|
; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
|
|
; CHECK-NEXT: vmul.f32 q0, q0, r1
|
|
; CHECK-NEXT: vldrw.u32 q5, [r0, #64]
|
|
; CHECK-NEXT: vfma.f32 q0, q6, r2
|
|
; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
|
|
; CHECK-NEXT: vfma.f32 q0, q7, r4
|
|
; CHECK-NEXT: vldrw.u32 q2, [r0, #96]
|
|
; CHECK-NEXT: vfma.f32 q0, q4, lr
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0, #112]
|
|
; CHECK-NEXT: vfma.f32 q0, q5, r5
|
|
; CHECK-NEXT: cmp r3, #1
|
|
; CHECK-NEXT: vfma.f32 q0, q3, r11
|
|
; CHECK-NEXT: vfma.f32 q0, q2, r8
|
|
; CHECK-NEXT: vfma.f32 q0, q1, r10
|
|
; CHECK-NEXT: vmov r5, s0
|
|
; CHECK-NEXT: bne .LBB19_9
|
|
; CHECK-NEXT: @ %bb.8: @ %if.then58
|
|
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
|
|
; CHECK-NEXT: str r5, [r6]
|
|
; CHECK-NEXT: mov r2, lr
|
|
; CHECK-NEXT: mov r4, r12
|
|
; CHECK-NEXT: mov r3, r5
|
|
; CHECK-NEXT: b .LBB19_12
|
|
; CHECK-NEXT: .LBB19_9: @ %if.else
|
|
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
|
|
; CHECK-NEXT: vmov r8, s1
|
|
; CHECK-NEXT: cmp r3, #2
|
|
; CHECK-NEXT: vstr s1, [r6, #4]
|
|
; CHECK-NEXT: str r5, [r6]
|
|
; CHECK-NEXT: bne .LBB19_11
|
|
; CHECK-NEXT: @ %bb.10: @ in Loop: Header=BB19_3 Depth=1
|
|
; CHECK-NEXT: mov r2, r4
|
|
; CHECK-NEXT: mov r3, r8
|
|
; CHECK-NEXT: mov r4, lr
|
|
; CHECK-NEXT: mov r8, r5
|
|
; CHECK-NEXT: b .LBB19_12
|
|
; CHECK-NEXT: .LBB19_11: @ %if.else64
|
|
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
|
|
; CHECK-NEXT: vmov r3, s2
|
|
; CHECK-NEXT: vstr s2, [r6, #8]
|
|
; CHECK-NEXT: .LBB19_12: @ %if.end69
|
|
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
|
|
; CHECK-NEXT: mov r12, r1
|
|
; CHECK-NEXT: b .LBB19_2
|
|
; CHECK-NEXT: .LBB19_13: @ %do.end
|
|
; CHECK-NEXT: add sp, #16
|
|
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
; CHECK-NEXT: add sp, #4
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
|
entry:
|
|
%pState1 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_f32, ptr %S, i32 0, i32 1
|
|
%i = load ptr, ptr %pState1, align 4
|
|
%pCoeffs2 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_f32, ptr %S, i32 0, i32 2
|
|
%i1 = load ptr, ptr %pCoeffs2, align 4
|
|
%numStages = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_f32, ptr %S, i32 0, i32 0
|
|
%i2 = load i32, ptr %numStages, align 4
|
|
; Loop-invariant controls: %shr is the number of 4-sample groups and %and the
; number of leftover samples (0..3); the three compares on %and select the
; tail variant used after the sample loop.
%shr = lshr i32 %blockSize, 2
|
|
%cmp201 = icmp eq i32 %shr, 0
|
|
%and = and i32 %blockSize, 3
|
|
%tobool = icmp eq i32 %and, 0
|
|
%cmp57 = icmp eq i32 %and, 1
|
|
%cmp60 = icmp eq i32 %and, 2
|
|
br label %do.body
|
|
|
|
|
|
; Stage-loop header: reloads the four state floats (%i3..%i6) of this stage.
do.body: ; preds = %if.end69, %entry
|
|
%pState.0 = phi ptr [ %i, %entry ], [ %incdec.ptr73, %if.end69 ]
|
|
%pCoeffs.0 = phi ptr [ %i1, %entry ], [ %add.ptr74, %if.end69 ]
|
|
%pIn.0 = phi ptr [ %pSrc, %entry ], [ %pDst, %if.end69 ]
|
|
; %X3.0 is undef on entry; it reaches a stored value only through the
; %if.then58 edge of %Xn2.1 (via %X3.1.lcssa).
%X3.0 = phi float [ undef, %entry ], [ %X3.2, %if.end69 ]
|
|
%stage.0 = phi i32 [ %i2, %entry ], [ %dec75, %if.end69 ]
|
|
%i3 = load float, ptr %pState.0, align 4
|
|
%arrayidx3 = getelementptr inbounds float, ptr %pState.0, i32 1
|
|
%i4 = load float, ptr %arrayidx3, align 4
|
|
%arrayidx4 = getelementptr inbounds float, ptr %pState.0, i32 2
|
|
%i5 = load float, ptr %arrayidx4, align 4
|
|
%arrayidx5 = getelementptr inbounds float, ptr %pState.0, i32 3
|
|
%i6 = load float, ptr %arrayidx5, align 4
|
|
br i1 %cmp201, label %while.end, label %while.body.lr.ph
|
|
|
|
|
|
; Addresses of the coefficient vectors at float offsets 4, 8, ..., 28 from
; %pCoeffs.0 (the vector at offset 0 is loaded directly in the loop).
while.body.lr.ph: ; preds = %do.body
|
|
%arrayidx9 = getelementptr inbounds float, ptr %pCoeffs.0, i32 4
|
|
%arrayidx12 = getelementptr inbounds float, ptr %pCoeffs.0, i32 8
|
|
%arrayidx15 = getelementptr inbounds float, ptr %pCoeffs.0, i32 12
|
|
%arrayidx18 = getelementptr inbounds float, ptr %pCoeffs.0, i32 16
|
|
%arrayidx21 = getelementptr inbounds float, ptr %pCoeffs.0, i32 20
|
|
%arrayidx24 = getelementptr inbounds float, ptr %pCoeffs.0, i32 24
|
|
%arrayidx27 = getelementptr inbounds float, ptr %pCoeffs.0, i32 28
|
|
br label %while.body
|
|
|
|
|
|
; Sample loop, %shr iterations: reads four input floats and accumulates the
; eight coefficient vectors, each scaled by a splatted scalar, in this order:
; %i18, %i17, %i16, %i15 (the new inputs, last first), then the carried
; %Xn1, %Xn2, %Yn1 and %Yn2. The <4 x float> result is stored to the output.
while.body: ; preds = %while.body, %while.body.lr.ph
|
|
%sample.0208 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
|
|
%pIn.1207 = phi ptr [ %pIn.0, %while.body.lr.ph ], [ %incdec.ptr8, %while.body ]
|
|
%pOut.1206 = phi ptr [ %pDst, %while.body.lr.ph ], [ %add.ptr, %while.body ]
|
|
%Yn2.0205 = phi float [ %i6, %while.body.lr.ph ], [ %i37, %while.body ]
|
|
%Yn1.0204 = phi float [ %i5, %while.body.lr.ph ], [ %i36, %while.body ]
|
|
%Xn2.0203 = phi float [ %i4, %while.body.lr.ph ], [ %i17, %while.body ]
|
|
%Xn1.0202 = phi float [ %i3, %while.body.lr.ph ], [ %i18, %while.body ]
|
|
%incdec.ptr = getelementptr inbounds float, ptr %pIn.1207, i32 1
|
|
%i15 = load float, ptr %pIn.1207, align 4
|
|
%incdec.ptr6 = getelementptr inbounds float, ptr %pIn.1207, i32 2
|
|
%i16 = load float, ptr %incdec.ptr, align 4
|
|
%incdec.ptr7 = getelementptr inbounds float, ptr %pIn.1207, i32 3
|
|
%i17 = load float, ptr %incdec.ptr6, align 4
|
|
%incdec.ptr8 = getelementptr inbounds float, ptr %pIn.1207, i32 4
|
|
%i18 = load float, ptr %incdec.ptr7, align 4
|
|
%i19 = load <4 x float>, ptr %pCoeffs.0, align 4
|
|
%.splatinsert = insertelement <4 x float> undef, float %i18, i32 0
|
|
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i20 = fmul fast <4 x float> %.splat, %i19
|
|
%i21 = load <4 x float>, ptr %arrayidx9, align 4
|
|
%.splatinsert10 = insertelement <4 x float> undef, float %i17, i32 0
|
|
%.splat11 = shufflevector <4 x float> %.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i22 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i21, <4 x float> %.splat11, <4 x float> %i20)
|
|
%i23 = load <4 x float>, ptr %arrayidx12, align 4
|
|
%.splatinsert13 = insertelement <4 x float> undef, float %i16, i32 0
|
|
%.splat14 = shufflevector <4 x float> %.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i24 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i23, <4 x float> %.splat14, <4 x float> %i22)
|
|
%i25 = load <4 x float>, ptr %arrayidx15, align 4
|
|
%.splatinsert16 = insertelement <4 x float> undef, float %i15, i32 0
|
|
%.splat17 = shufflevector <4 x float> %.splatinsert16, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i26 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i25, <4 x float> %.splat17, <4 x float> %i24)
|
|
%i27 = load <4 x float>, ptr %arrayidx18, align 4
|
|
%.splatinsert19 = insertelement <4 x float> undef, float %Xn1.0202, i32 0
|
|
%.splat20 = shufflevector <4 x float> %.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i28 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i27, <4 x float> %.splat20, <4 x float> %i26)
|
|
%i29 = load <4 x float>, ptr %arrayidx21, align 4
|
|
%.splatinsert22 = insertelement <4 x float> undef, float %Xn2.0203, i32 0
|
|
%.splat23 = shufflevector <4 x float> %.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i30 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i29, <4 x float> %.splat23, <4 x float> %i28)
|
|
%i31 = load <4 x float>, ptr %arrayidx24, align 4
|
|
%.splatinsert25 = insertelement <4 x float> undef, float %Yn1.0204, i32 0
|
|
%.splat26 = shufflevector <4 x float> %.splatinsert25, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i31, <4 x float> %.splat26, <4 x float> %i30)
|
|
%i33 = load <4 x float>, ptr %arrayidx27, align 4
|
|
%.splatinsert28 = insertelement <4 x float> undef, float %Yn2.0205, i32 0
|
|
%.splat29 = shufflevector <4 x float> %.splatinsert28, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i34 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i33, <4 x float> %.splat29, <4 x float> %i32)
|
|
store <4 x float> %i34, ptr %pOut.1206, align 4
|
|
%add.ptr = getelementptr inbounds float, ptr %pOut.1206, i32 4
|
|
; Lanes 3 and 2 of the result are carried into the next iteration as %Yn1 and
; %Yn2 respectively.
%i36 = extractelement <4 x float> %i34, i32 3
|
|
%i37 = extractelement <4 x float> %i34, i32 2
|
|
%dec = add nsw i32 %sample.0208, -1
|
|
%cmp = icmp eq i32 %dec, 0
|
|
br i1 %cmp, label %while.end, label %while.body
|
|
|
|
|
|
; Merge the loop-carried values with the zero-trip values coming straight
; from %do.body, then skip the tail when %and == 0.
while.end: ; preds = %while.body, %do.body
|
|
%Xn1.0.lcssa = phi float [ %i3, %do.body ], [ %i18, %while.body ]
|
|
%Xn2.0.lcssa = phi float [ %i4, %do.body ], [ %i17, %while.body ]
|
|
%Yn1.0.lcssa = phi float [ %i5, %do.body ], [ %i36, %while.body ]
|
|
%Yn2.0.lcssa = phi float [ %i6, %do.body ], [ %i37, %while.body ]
|
|
%pOut.1.lcssa = phi ptr [ %pDst, %do.body ], [ %add.ptr, %while.body ]
|
|
%pIn.1.lcssa = phi ptr [ %pIn.0, %do.body ], [ %incdec.ptr8, %while.body ]
|
|
%X3.1.lcssa = phi float [ %X3.0, %do.body ], [ %i18, %while.body ]
|
|
br i1 %tobool, label %if.end69, label %if.then
|
|
|
|
|
|
; Tail (%and != 0): repeats the eight-term accumulation on the next four input
; floats. All four floats are loaded regardless of %and; only the first %and
; lanes of the result are stored by the blocks below.
if.then: ; preds = %while.end
|
|
%incdec.ptr30 = getelementptr inbounds float, ptr %pIn.1.lcssa, i32 1
|
|
%i38 = load float, ptr %pIn.1.lcssa, align 4
|
|
%incdec.ptr31 = getelementptr inbounds float, ptr %pIn.1.lcssa, i32 2
|
|
%i39 = load float, ptr %incdec.ptr30, align 4
|
|
%incdec.ptr32 = getelementptr inbounds float, ptr %pIn.1.lcssa, i32 3
|
|
%i40 = load float, ptr %incdec.ptr31, align 4
|
|
%i41 = load float, ptr %incdec.ptr32, align 4
|
|
%i43 = load <4 x float>, ptr %pCoeffs.0, align 4
|
|
%.splatinsert34 = insertelement <4 x float> undef, float %i41, i32 0
|
|
%.splat35 = shufflevector <4 x float> %.splatinsert34, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i44 = fmul fast <4 x float> %.splat35, %i43
|
|
%arrayidx36 = getelementptr inbounds float, ptr %pCoeffs.0, i32 4
|
|
%i46 = load <4 x float>, ptr %arrayidx36, align 4
|
|
%.splatinsert37 = insertelement <4 x float> undef, float %i40, i32 0
|
|
%.splat38 = shufflevector <4 x float> %.splatinsert37, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i47 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i46, <4 x float> %.splat38, <4 x float> %i44)
|
|
%arrayidx39 = getelementptr inbounds float, ptr %pCoeffs.0, i32 8
|
|
%i49 = load <4 x float>, ptr %arrayidx39, align 4
|
|
%.splatinsert40 = insertelement <4 x float> undef, float %i39, i32 0
|
|
%.splat41 = shufflevector <4 x float> %.splatinsert40, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i50 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i49, <4 x float> %.splat41, <4 x float> %i47)
|
|
%arrayidx42 = getelementptr inbounds float, ptr %pCoeffs.0, i32 12
|
|
%i52 = load <4 x float>, ptr %arrayidx42, align 4
|
|
%.splatinsert43 = insertelement <4 x float> undef, float %i38, i32 0
|
|
%.splat44 = shufflevector <4 x float> %.splatinsert43, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i53 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i52, <4 x float> %.splat44, <4 x float> %i50)
|
|
%arrayidx45 = getelementptr inbounds float, ptr %pCoeffs.0, i32 16
|
|
%i55 = load <4 x float>, ptr %arrayidx45, align 4
|
|
%.splatinsert46 = insertelement <4 x float> undef, float %Xn1.0.lcssa, i32 0
|
|
%.splat47 = shufflevector <4 x float> %.splatinsert46, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i56 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i55, <4 x float> %.splat47, <4 x float> %i53)
|
|
%arrayidx48 = getelementptr inbounds float, ptr %pCoeffs.0, i32 20
|
|
%i58 = load <4 x float>, ptr %arrayidx48, align 4
|
|
%.splatinsert49 = insertelement <4 x float> undef, float %Xn2.0.lcssa, i32 0
|
|
%.splat50 = shufflevector <4 x float> %.splatinsert49, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i59 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i58, <4 x float> %.splat50, <4 x float> %i56)
|
|
%arrayidx51 = getelementptr inbounds float, ptr %pCoeffs.0, i32 24
|
|
%i61 = load <4 x float>, ptr %arrayidx51, align 4
|
|
%.splatinsert52 = insertelement <4 x float> undef, float %Yn1.0.lcssa, i32 0
|
|
%.splat53 = shufflevector <4 x float> %.splatinsert52, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i62 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i61, <4 x float> %.splat53, <4 x float> %i59)
|
|
%arrayidx54 = getelementptr inbounds float, ptr %pCoeffs.0, i32 28
|
|
%i64 = load <4 x float>, ptr %arrayidx54, align 4
|
|
%.splatinsert55 = insertelement <4 x float> undef, float %Yn2.0.lcssa, i32 0
|
|
%.splat56 = shufflevector <4 x float> %.splatinsert55, <4 x float> undef, <4 x i32> zeroinitializer
|
|
%i65 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i64, <4 x float> %.splat56, <4 x float> %i62)
|
|
%i66 = extractelement <4 x float> %i65, i32 0
|
|
br i1 %cmp57, label %if.then58, label %if.else
|
|
|
|
|
|
; %and == 1: store lane 0 only.
if.then58: ; preds = %if.then
|
|
store float %i66, ptr %pOut.1.lcssa, align 4
|
|
br label %if.end69
|
|
|
|
|
|
; %and is 2 or 3: store lanes 0 and 1; %and == 2 is finished after this block.
if.else: ; preds = %if.then
|
|
%incdec.ptr62 = getelementptr inbounds float, ptr %pOut.1.lcssa, i32 1
|
|
store float %i66, ptr %pOut.1.lcssa, align 4
|
|
%i67 = extractelement <4 x float> %i65, i32 1
|
|
store float %i67, ptr %incdec.ptr62, align 4
|
|
br i1 %cmp60, label %if.end69, label %if.else64
|
|
|
|
|
|
; %and == 3: additionally store lane 2.
if.else64: ; preds = %if.else
|
|
%incdec.ptr63 = getelementptr inbounds float, ptr %pOut.1.lcssa, i32 2
|
|
%i68 = extractelement <4 x float> %i65, i32 2
|
|
store float %i68, ptr %incdec.ptr63, align 4
|
|
br label %if.end69
|
|
|
|
|
|
; Stage-loop latch: the phis pick the new state values according to which tail
; variant ran; the four state floats are written back, then the state pointer
; advances by 4 floats and the coefficient pointer by 32 floats.
if.end69: ; preds = %if.else64, %if.else, %if.then58, %while.end
|
|
%Xn1.1 = phi float [ %i38, %if.then58 ], [ %i40, %if.else64 ], [ %Xn1.0.lcssa, %while.end ], [ %i39, %if.else ]
|
|
%Xn2.1 = phi float [ %X3.1.lcssa, %if.then58 ], [ %i39, %if.else64 ], [ %Xn2.0.lcssa, %while.end ], [ %i38, %if.else ]
|
|
%Yn1.1 = phi float [ %i66, %if.then58 ], [ %i68, %if.else64 ], [ %Yn1.0.lcssa, %while.end ], [ %i67, %if.else ]
|
|
%Yn2.1 = phi float [ %Yn1.0.lcssa, %if.then58 ], [ %i67, %if.else64 ], [ %Yn2.0.lcssa, %while.end ], [ %i66, %if.else ]
|
|
%X3.2 = phi float [ %i41, %if.then58 ], [ %i41, %if.else64 ], [ %X3.1.lcssa, %while.end ], [ %i41, %if.else ]
|
|
store float %Xn1.1, ptr %pState.0, align 4
|
|
store float %Xn2.1, ptr %arrayidx3, align 4
|
|
store float %Yn1.1, ptr %arrayidx4, align 4
|
|
%incdec.ptr73 = getelementptr inbounds float, ptr %pState.0, i32 4
|
|
store float %Yn2.1, ptr %arrayidx5, align 4
|
|
%add.ptr74 = getelementptr inbounds float, ptr %pCoeffs.0, i32 32
|
|
%dec75 = add i32 %stage.0, -1
|
|
%cmp76 = icmp eq i32 %dec75, 0
|
|
br i1 %cmp76, label %do.end, label %do.body
|
|
|
|
|
|
do.end: ; preds = %if.end69
|
|
ret void
|
|
}
|
|
|
|
|
|
; Instance layout read by @arm_biquad_cascade_df2T_f32:
;   field 0 (i8)  - loaded there via %numStages and zero-extended to i32.
;   field 1 (ptr) - loaded via %pState1.
;   field 2 (ptr) - loaded via %pCoeffs.
%struct.arm_biquad_cascade_df2T_instance_f32 = type { i8, ptr, ptr }
|
|
define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) {
|
|
; CHECK-LABEL: arm_biquad_cascade_df2T_f32:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
|
|
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
|
|
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
|
|
; CHECK-NEXT: ldrd r6, r12, [r0, #4]
|
|
; CHECK-NEXT: lsr.w r8, r3, #1
|
|
; CHECK-NEXT: ldrb r0, [r0]
|
|
; CHECK-NEXT: vldr s0, .LCPI20_0
|
|
; CHECK-NEXT: b .LBB20_3
|
|
; CHECK-NEXT: .LBB20_1: @ %if.else
|
|
; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
|
|
; CHECK-NEXT: vmov.f32 s6, s5
|
|
; CHECK-NEXT: vstr s4, [r6]
|
|
; CHECK-NEXT: .LBB20_2: @ %if.end
|
|
; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
|
|
; CHECK-NEXT: vstr s6, [r6, #4]
|
|
; CHECK-NEXT: add.w r12, r12, #20
|
|
; CHECK-NEXT: subs r0, #1
|
|
; CHECK-NEXT: add.w r6, r6, #8
|
|
; CHECK-NEXT: mov r1, r2
|
|
; CHECK-NEXT: beq .LBB20_8
|
|
; CHECK-NEXT: .LBB20_3: @ %do.body
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
; CHECK-NEXT: @ Child Loop BB20_5 Depth 2
|
|
; CHECK-NEXT: vldrw.u32 q3, [r12]
|
|
; CHECK-NEXT: movs r5, #0
|
|
; CHECK-NEXT: vmov q4, q3
|
|
; CHECK-NEXT: vshlc q4, r5, #32
|
|
; CHECK-NEXT: vldrw.u32 q2, [r12, #8]
|
|
; CHECK-NEXT: vmov q5, q2
|
|
; CHECK-NEXT: vshlc q5, r5, #32
|
|
; CHECK-NEXT: vldrw.u32 q1, [r6]
|
|
; CHECK-NEXT: vmov.f32 s6, s0
|
|
; CHECK-NEXT: mov r5, r2
|
|
; CHECK-NEXT: vmov.f32 s7, s0
|
|
; CHECK-NEXT: wls lr, r8, .LBB20_6
|
|
; CHECK-NEXT: @ %bb.4: @ %while.body.preheader
|
|
; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
|
|
; CHECK-NEXT: vmov q6, q1
|
|
; CHECK-NEXT: mov r5, r2
|
|
; CHECK-NEXT: .LBB20_5: @ %while.body
|
|
; CHECK-NEXT: @ Parent Loop BB20_3 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: ldrd r7, r4, [r1], #8
|
|
; CHECK-NEXT: vfma.f32 q6, q3, r7
|
|
; CHECK-NEXT: vmov r7, s24
|
|
; CHECK-NEXT: vmov q1, q6
|
|
; CHECK-NEXT: vfma.f32 q1, q2, r7
|
|
; CHECK-NEXT: vstr s24, [r5]
|
|
; CHECK-NEXT: vmov.f32 s7, s0
|
|
; CHECK-NEXT: vfma.f32 q1, q4, r4
|
|
; CHECK-NEXT: vmov r4, s5
|
|
; CHECK-NEXT: vstr s5, [r5, #4]
|
|
; CHECK-NEXT: vfma.f32 q1, q5, r4
|
|
; CHECK-NEXT: adds r5, #8
|
|
; CHECK-NEXT: vmov.f32 s4, s6
|
|
; CHECK-NEXT: vmov.f32 s5, s7
|
|
; CHECK-NEXT: vmov.f32 s6, s0
|
|
; CHECK-NEXT: vmov q6, q1
|
|
; CHECK-NEXT: le lr, .LBB20_5
|
|
; CHECK-NEXT: .LBB20_6: @ %while.end
|
|
; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
|
|
; CHECK-NEXT: lsls r7, r3, #31
|
|
; CHECK-NEXT: beq .LBB20_1
|
|
; CHECK-NEXT: @ %bb.7: @ %if.then
|
|
; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
|
|
; CHECK-NEXT: ldr r1, [r1]
|
|
; CHECK-NEXT: vfma.f32 q1, q3, r1
|
|
; CHECK-NEXT: vmov r1, s4
|
|
; CHECK-NEXT: vstr s4, [r5]
|
|
; CHECK-NEXT: vfma.f32 q1, q2, r1
|
|
; CHECK-NEXT: vstr s5, [r6]
|
|
; CHECK-NEXT: b .LBB20_2
|
|
; CHECK-NEXT: .LBB20_8: @ %do.end
|
|
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
|
|
; CHECK-NEXT: .p2align 2
|
|
; CHECK-NEXT: @ %bb.9:
|
|
; CHECK-NEXT: .LCPI20_0:
|
|
; CHECK-NEXT: .long 0x00000000 @ float 0
|
|
entry:
|
|
%pState1 = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f32, ptr %S, i32 0, i32 1
|
|
%i = load ptr, ptr %pState1, align 4
|
|
%numStages = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f32, ptr %S, i32 0, i32 0
|
|
%i1 = load i8, ptr %numStages, align 4
|
|
%conv = zext i8 %i1 to i32
|
|
%pCoeffs = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f32, ptr %S, i32 0, i32 2
|
|
%i2 = load ptr, ptr %pCoeffs, align 4
|
|
%div = lshr i32 %blockSize, 1
|
|
%cmp.not90 = icmp eq i32 %div, 0
|
|
%and = and i32 %blockSize, 1
|
|
%tobool.not = icmp eq i32 %and, 0
|
|
br label %do.body
|
|
|
|
do.body: ; preds = %if.end, %entry
|
|
%stage.0 = phi i32 [ %conv, %entry ], [ %dec23, %if.end ]
|
|
%pCurCoeffs.0 = phi ptr [ %i2, %entry ], [ %add.ptr2, %if.end ]
|
|
%pState.0 = phi ptr [ %i, %entry ], [ %pState.1, %if.end ]
|
|
%pIn.0 = phi ptr [ %pSrc, %entry ], [ %pDst, %if.end ]
|
|
%i4 = load <4 x float>, ptr %pCurCoeffs.0, align 4
|
|
%add.ptr = getelementptr inbounds float, ptr %pCurCoeffs.0, i32 2
|
|
%i6 = load <4 x float>, ptr %add.ptr, align 4
|
|
%add.ptr2 = getelementptr inbounds float, ptr %pCurCoeffs.0, i32 5
|
|
%i8 = load <4 x float>, ptr %pState.0, align 8
|
|
%i9 = shufflevector <4 x float> %i8, <4 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
|
|
%i10 = bitcast <4 x float> %i4 to <4 x i32>
|
|
%i11 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %i10, i32 0, i32 32)
|
|
%i12 = extractvalue { i32, <4 x i32> } %i11, 0
|
|
%i13 = extractvalue { i32, <4 x i32> } %i11, 1
|
|
%i14 = bitcast <4 x i32> %i13 to <4 x float>
|
|
%i15 = bitcast <4 x float> %i6 to <4 x i32>
|
|
%i16 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %i15, i32 %i12, i32 32)
|
|
%i17 = extractvalue { i32, <4 x i32> } %i16, 1
|
|
%i18 = bitcast <4 x i32> %i17 to <4 x float>
|
|
br i1 %cmp.not90, label %while.end, label %while.body
|
|
|
|
while.body: ; preds = %while.body, %do.body
|
|
%pIn.194 = phi ptr [ %incdec.ptr4, %while.body ], [ %pIn.0, %do.body ]
|
|
%state.093 = phi <4 x float> [ %i30, %while.body ], [ %i9, %do.body ]
|
|
%pOut.192 = phi ptr [ %incdec.ptr12, %while.body ], [ %pDst, %do.body ]
|
|
%sample.091 = phi i32 [ %dec, %while.body ], [ %div, %do.body ]
|
|
%incdec.ptr = getelementptr inbounds float, ptr %pIn.194, i32 1
|
|
%i19 = load float, ptr %pIn.194, align 4
|
|
%incdec.ptr4 = getelementptr inbounds float, ptr %pIn.194, i32 2
|
|
%i20 = load float, ptr %incdec.ptr, align 4
|
|
%.splatinsert = insertelement <4 x float> poison, float %i19, i32 0
|
|
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
|
|
%i21 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i4, <4 x float> %.splat, <4 x float> %state.093)
|
|
%i22 = extractelement <4 x float> %i21, i32 0
|
|
%.splat6 = shufflevector <4 x float> %i21, <4 x float> poison, <4 x i32> zeroinitializer
|
|
%i23 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i6, <4 x float> %.splat6, <4 x float> %i21)
|
|
%i24 = insertelement <4 x float> %i23, float 0.000000e+00, i32 3
|
|
%.splatinsert7 = insertelement <4 x float> poison, float %i20, i32 0
|
|
%.splat8 = shufflevector <4 x float> %.splatinsert7, <4 x float> poison, <4 x i32> zeroinitializer
|
|
%i25 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i14, <4 x float> %.splat8, <4 x float> %i24)
|
|
%i26 = extractelement <4 x float> %i25, i32 1
|
|
%.splat10 = shufflevector <4 x float> %i25, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
|
|
%i27 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i18, <4 x float> %.splat10, <4 x float> %i25)
|
|
%i28 = shufflevector <4 x float> %i27, <4 x float> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 3>
|
|
%i29 = insertelement <4 x float> %i28, float 0.000000e+00, i32 2
|
|
%i30 = shufflevector <4 x float> %i29, <4 x float> %i27, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
|
|
%incdec.ptr11 = getelementptr inbounds float, ptr %pOut.192, i32 1
|
|
store float %i22, ptr %pOut.192, align 4
|
|
%incdec.ptr12 = getelementptr inbounds float, ptr %pOut.192, i32 2
|
|
store float %i26, ptr %incdec.ptr11, align 4
|
|
%dec = add nsw i32 %sample.091, -1
|
|
%cmp.not = icmp eq i32 %dec, 0
|
|
br i1 %cmp.not, label %while.end, label %while.body
|
|
|
|
while.end: ; preds = %while.body, %do.body
|
|
%pOut.1.lcssa = phi ptr [ %pDst, %do.body ], [ %incdec.ptr12, %while.body ]
|
|
%state.0.lcssa = phi <4 x float> [ %i9, %do.body ], [ %i30, %while.body ]
|
|
%pIn.1.lcssa = phi ptr [ %pIn.0, %do.body ], [ %incdec.ptr4, %while.body ]
|
|
br i1 %tobool.not, label %if.else, label %if.then
|
|
|
|
if.then: ; preds = %while.end
|
|
%i31 = load float, ptr %pIn.1.lcssa, align 4
|
|
%.splatinsert14 = insertelement <4 x float> poison, float %i31, i32 0
|
|
%.splat15 = shufflevector <4 x float> %.splatinsert14, <4 x float> poison, <4 x i32> zeroinitializer
|
|
%i32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i4, <4 x float> %.splat15, <4 x float> %state.0.lcssa)
|
|
%i33 = extractelement <4 x float> %i32, i32 0
|
|
%.splat17 = shufflevector <4 x float> %i32, <4 x float> poison, <4 x i32> zeroinitializer
|
|
%i34 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i6, <4 x float> %.splat17, <4 x float> %i32)
|
|
store float %i33, ptr %pOut.1.lcssa, align 4
|
|
%i35 = extractelement <4 x float> %i34, i32 1
|
|
store float %i35, ptr %pState.0, align 4
|
|
%i36 = extractelement <4 x float> %i34, i32 2
|
|
br label %if.end
|
|
|
|
if.else: ; preds = %while.end
|
|
%i37 = extractelement <4 x float> %state.0.lcssa, i32 0
|
|
store float %i37, ptr %pState.0, align 4
|
|
%i38 = extractelement <4 x float> %state.0.lcssa, i32 1
|
|
br label %if.end
|
|
|
|
if.end: ; preds = %if.else, %if.then
|
|
%.sink = phi float [ %i38, %if.else ], [ %i36, %if.then ]
|
|
%i39 = getelementptr inbounds float, ptr %pState.0, i32 1
|
|
store float %.sink, ptr %i39, align 4
|
|
%pState.1 = getelementptr inbounds float, ptr %pState.0, i32 2
|
|
%dec23 = add i32 %stage.0, -1
|
|
%cmp24.not = icmp eq i32 %dec23, 0
|
|
br i1 %cmp24.not, label %do.end, label %do.body
|
|
|
|
do.end: ; preds = %if.end
|
|
ret void
|
|
}
|
|
|
|
; Horizontal add of a 4-lane float vector: extracts lanes 0..3 of %in and
; accumulates them left to right with `fadd fast`, returning the scalar sum.
; The autogenerated assertions below expect llc to emit three scalar
; vadd.f32 instructions over s0..s3 followed by `bx lr`.
define arm_aapcs_vfpcc float @vecAddAcrossF32Mve(<4 x float> %in) {
|
|
; CHECK-LABEL: vecAddAcrossF32Mve:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s1
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s2
|
|
; CHECK-NEXT: vadd.f32 s0, s0, s3
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
; Running sum: ((in[0] + in[1]) + in[2]) + in[3].
%i = extractelement <4 x float> %in, i32 0
|
|
%i1 = extractelement <4 x float> %in, i32 1
|
|
%add = fadd fast float %i, %i1
|
|
%i2 = extractelement <4 x float> %in, i32 2
|
|
%add1 = fadd fast float %add, %i2
|
|
%i3 = extractelement <4 x float> %in, i32 3
|
|
%add2 = fadd fast float %add1, %i3
|
|
ret float %add2
|
|
}
|
|
|
|
|
|
; Declarations of the LLVM intrinsics referenced by the test functions in
; this file. In the function body preceding @vecAddAcrossF32Mve,
; @llvm.arm.mve.vshlc.v4i32 and @llvm.fma.v4f32 are called directly; the
; remaining intrinsics are presumably used by earlier tests (not verified
; from this part of the file).
declare { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32>, i32, i32) #1
|
|
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
|
|
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
|
|
declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32, i32, i32)
|
|
declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr, <4 x i32>, i32, i32, i32)
|
|
declare void @llvm.assume(i1)
|
|
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
|
|
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
|
|
declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>)
|