We would like to start pushing -mcpu=generic towards enabling the set of
features that improves performance for some CPUs without hurting any
others: a blend of the performance options that is hopefully beneficial
to all CPUs. The largest part of that is enabling in-order scheduling
using the Cortex-A55 schedule model. This is similar to the Arm backend
change from eecb353d0e, which made -mcpu=generic perform in-order
scheduling using the Cortex-A8 schedule model.
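As a rough illustration (hypothetical invocations, not part of this
patch), the effect can be seen by comparing codegen for any small IR
file:

  llc -mtriple=aarch64-none-linux-gnu -mcpu=generic test.ll -o generic.s
  llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 test.ll -o a55.s

With this change the instruction order in the two outputs should largely
match, differing mainly where cortex-a55 enables extra subtarget
features.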
The idea is that in-order CPUs require the most help with instruction
scheduling, whereas out-of-order CPUs can, for the most part, schedule
around different codegen on their own. Our benchmarking suggests that
hypothesis holds. When running on an in-order core this improved
performance by 3.8% geomean on a set of DSP workloads, 2% geomean on
another embedded benchmark suite, and between 1% and 1.8% on a set of
single-core and multi-core workloads, all running on a Cortex-A55
cluster.
On an out-of-order CPU the results are a lot noisier, but show flat
performance or an improvement. On the set of DSP and embedded
benchmarks, run on a Cortex-A78, there was a very noisy 1% speed
improvement. Using the most detailed results I could find, SPEC2006 runs
on a Neoverse N1 show a small increase in instruction count (+0.127%)
but a decrease in cycle count (-0.155% on average). The instruction
counts are very low noise; the cycle counts are noisier, and a 0.15%
decrease is not significant. SPEC2k17 shows a small decrease (-0.2%) in
instruction count, leading to a 0.296% decrease in cycle count. These
results are within noise margins but tend to show a small improvement
in general.
When specifying an Apple target, clang will set "-target-cpu apple-a7"
on the command line, so Apple targets should not be affected by this
change when compiling from clang. This change also does not enable the
extra runtime unrolling that -mcpu=cortex-a55 enables; only the schedule
model used is changed.
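That driver behaviour can be confirmed from the -### output (an
illustrative invocation; the exact CPU chosen can vary with the OS and
deployment target):

  clang --target=arm64-apple-ios -### -c test.c 2>&1 | grep -o '"-target-cpu" "[^"]*"'

which should print something like "-target-cpu" "apple-a7", meaning the
generic schedule model is never consulted for such targets.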
A lot of existing tests have been updated. This is a summary of the
important differences:
- Most changes are the same instructions in a different order.
- Sometimes this leads to very minor inefficiencies, such as requiring
an extra mov to move variables into r0/v0 for the return value of a test
function.
- misched-fusion.ll was no longer fusing the pairs of instructions it
should, as per D110561. I've changed the schedule used in the test
for now.
- neon-mla-mls.ll now uses "mul; sub" as opposed to "neg; mla" due to
the different latencies. This seems fine to me (see the sketch after
this list).
- Some SVE tests do not always remove movprfx where they did before due
to different register allocation giving different destructive forms.
- The tests argument-blocks-array-of-struct.ll and arm64-windows-calls.ll
produce two LDR where they previously produced an LDP due to
store-pair-suppress kicking in.
- arm64-ldp.ll and arm64-neon-copy.ll are missing pre/post-inc on LDP.
- Some tests such as arm64-neon-mul-div.ll and
ragreedy-local-interval-cost.ll have more, less, or simply different
spilling.
- In aarch64_generated_funcs.ll.generated.expected one part of the
function is no longer outlined. Interestingly, if I switch this to use
any other schedule model, even less is outlined.
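As an aside, the neon-mla-mls.ll difference mentioned above looks
roughly like this for an a - b*c pattern (illustrative assembly; the
register choices are made up):

  // before: negate an operand so the multiply can accumulate into a
  neg v2.4s, v2.4s
  mla v0.4s, v1.4s, v2.4s
  // after: multiply first, then subtract the product from a
  mul v1.4s, v1.4s, v2.4s
  sub v0.4s, v0.4s, v1.4s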
Some of these changes are expected, such as differences in outlining or
register spilling. There will be places where they result in worse
codegen and places where they are better, with the SPEC instruction
counts suggesting no overall regression, on average.
Differential Revision: https://reviews.llvm.org/D110830
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK

declare {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32>, <8 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128>, <2 x i128>)

define <1 x i32> @umulo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
; CHECK-LABEL: umulo_v1i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT:    shrn v0.2s, v1.2d, #32
; CHECK-NEXT:    xtn v1.2s, v1.2d
; CHECK-NEXT:    str s1, [x0]
; CHECK-NEXT:    cmtst v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    ret
  %t = call {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, <1 x i32>* %p2
  ret <1 x i32> %res
}

define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; CHECK-LABEL: umulo_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT:    shrn v0.2s, v1.2d, #32
; CHECK-NEXT:    xtn v1.2s, v1.2d
; CHECK-NEXT:    str d1, [x0]
; CHECK-NEXT:    cmtst v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    ret
  %t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, <2 x i32>* %p2
  ret <2 x i32> %res
}

define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
; CHECK-LABEL: umulo_v3i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT:    add x8, x0, #8
; CHECK-NEXT:    umull v3.2d, v0.2s, v1.2s
; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-NEXT:    uzp2 v2.4s, v3.4s, v2.4s
; CHECK-NEXT:    st1 { v1.s }[2], [x8]
; CHECK-NEXT:    str d1, [x0]
; CHECK-NEXT:    cmtst v2.4s, v2.4s, v2.4s
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, <3 x i32>* %p2
  ret <3 x i32> %res
}

define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
; CHECK-LABEL: umulo_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT:    umull v3.2d, v0.2s, v1.2s
; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-NEXT:    uzp2 v2.4s, v3.4s, v2.4s
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    cmtst v2.4s, v2.4s, v2.4s
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %t = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, <4 x i32>* %p2
  ret <4 x i32> %res
}

define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
; CHECK-LABEL: umulo_v6i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov s0, w6
; CHECK-NEXT:    fmov s1, w0
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    add x9, sp, #8
; CHECK-NEXT:    ldr s2, [sp, #16]
; CHECK-NEXT:    fmov s3, w4
; CHECK-NEXT:    mov v0.s[1], w7
; CHECK-NEXT:    mov v1.s[1], w1
; CHECK-NEXT:    mov v3.s[1], w5
; CHECK-NEXT:    ld1 { v0.s }[2], [x8]
; CHECK-NEXT:    add x8, sp, #24
; CHECK-NEXT:    mov v1.s[2], w2
; CHECK-NEXT:    ld1 { v2.s }[1], [x8]
; CHECK-NEXT:    ld1 { v0.s }[3], [x9]
; CHECK-NEXT:    mov v1.s[3], w3
; CHECK-NEXT:    ldr x8, [sp, #32]
; CHECK-NEXT:    umull2 v6.2d, v3.4s, v2.4s
; CHECK-NEXT:    umull v7.2d, v3.2s, v2.2s
; CHECK-NEXT:    umull2 v4.2d, v1.4s, v0.4s
; CHECK-NEXT:    umull v5.2d, v1.2s, v0.2s
; CHECK-NEXT:    mul v2.4s, v3.4s, v2.4s
; CHECK-NEXT:    mul v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    uzp2 v4.4s, v5.4s, v4.4s
; CHECK-NEXT:    uzp2 v5.4s, v7.4s, v6.4s
; CHECK-NEXT:    str d2, [x8, #16]
; CHECK-NEXT:    str q0, [x8]
; CHECK-NEXT:    cmtst v4.4s, v4.4s, v4.4s
; CHECK-NEXT:    cmtst v3.4s, v5.4s, v5.4s
; CHECK-NEXT:    mov w1, v4.s[1]
; CHECK-NEXT:    mov w2, v4.s[2]
; CHECK-NEXT:    mov w3, v4.s[3]
; CHECK-NEXT:    mov w5, v3.s[1]
; CHECK-NEXT:    fmov w0, s4
; CHECK-NEXT:    fmov w4, s3
; CHECK-NEXT:    ret
  %t = call {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, <6 x i32>* %p2
  ret <6 x i32> %res
}

define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
; CHECK-LABEL: umulo_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v4.2d, v1.4s, v3.4s
; CHECK-NEXT:    umull2 v5.2d, v0.4s, v2.4s
; CHECK-NEXT:    umull v6.2d, v0.2s, v2.2s
; CHECK-NEXT:    umull v7.2d, v1.2s, v3.2s
; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
; CHECK-NEXT:    mul v2.4s, v0.4s, v2.4s
; CHECK-NEXT:    uzp2 v5.4s, v6.4s, v5.4s
; CHECK-NEXT:    uzp2 v6.4s, v7.4s, v4.4s
; CHECK-NEXT:    stp q2, q3, [x0]
; CHECK-NEXT:    cmtst v4.4s, v5.4s, v5.4s
; CHECK-NEXT:    cmtst v5.4s, v6.4s, v6.4s
; CHECK-NEXT:    mov v0.16b, v4.16b
; CHECK-NEXT:    mov v1.16b, v5.16b
; CHECK-NEXT:    ret
  %t = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, <8 x i32>* %p2
  ret <8 x i32> %res
}

define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
; CHECK-LABEL: umulo_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT:    umull v3.8h, v0.8b, v1.8b
; CHECK-NEXT:    uzp2 v2.16b, v3.16b, v2.16b
; CHECK-NEXT:    cmtst v2.16b, v2.16b, v2.16b
; CHECK-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
; CHECK-NEXT:    zip1 v4.8b, v2.8b, v0.8b
; CHECK-NEXT:    zip2 v2.8b, v2.8b, v0.8b
; CHECK-NEXT:    zip1 v5.8b, v3.8b, v0.8b
; CHECK-NEXT:    zip2 v3.8b, v3.8b, v0.8b
; CHECK-NEXT:    ushll v4.4s, v4.4h, #0
; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-NEXT:    ushll v5.4s, v5.4h, #0
; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-NEXT:    shl v4.4s, v4.4s, #31
; CHECK-NEXT:    shl v2.4s, v2.4s, #31
; CHECK-NEXT:    shl v6.4s, v5.4s, #31
; CHECK-NEXT:    shl v3.4s, v3.4s, #31
; CHECK-NEXT:    sshr v4.4s, v4.4s, #31
; CHECK-NEXT:    sshr v5.4s, v2.4s, #31
; CHECK-NEXT:    sshr v2.4s, v6.4s, #31
; CHECK-NEXT:    sshr v3.4s, v3.4s, #31
; CHECK-NEXT:    mul v6.16b, v0.16b, v1.16b
; CHECK-NEXT:    mov v0.16b, v4.16b
; CHECK-NEXT:    mov v1.16b, v5.16b
; CHECK-NEXT:    str q6, [x0]
; CHECK-NEXT:    ret
  %t = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i8> %val, <16 x i8>* %p2
  ret <16 x i32> %res
}

define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
; CHECK-LABEL: umulo_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT:    umull v3.4s, v0.4h, v1.4h
; CHECK-NEXT:    uzp2 v2.8h, v3.8h, v2.8h
; CHECK-NEXT:    cmtst v2.8h, v2.8h, v2.8h
; CHECK-NEXT:    xtn v2.8b, v2.8h
; CHECK-NEXT:    zip1 v3.8b, v2.8b, v0.8b
; CHECK-NEXT:    zip2 v2.8b, v2.8b, v0.8b
; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-NEXT:    shl v3.4s, v3.4s, #31
; CHECK-NEXT:    shl v4.4s, v2.4s, #31
; CHECK-NEXT:    sshr v2.4s, v3.4s, #31
; CHECK-NEXT:    sshr v3.4s, v4.4s, #31
; CHECK-NEXT:    mul v4.8h, v0.8h, v1.8h
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    mov v1.16b, v3.16b
; CHECK-NEXT:    str q4, [x0]
; CHECK-NEXT:    ret
  %t = call {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i16> %val, <8 x i16>* %p2
  ret <8 x i32> %res
}

define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
; CHECK-LABEL: umulo_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov x8, v1.d[1]
; CHECK-NEXT:    fmov x10, d1
; CHECK-NEXT:    mov x9, v0.d[1]
; CHECK-NEXT:    fmov x11, d0
; CHECK-NEXT:    umulh x12, x9, x8
; CHECK-NEXT:    umulh x13, x11, x10
; CHECK-NEXT:    cmp xzr, x12
; CHECK-NEXT:    mul x10, x11, x10
; CHECK-NEXT:    csetm x12, ne
; CHECK-NEXT:    cmp xzr, x13
; CHECK-NEXT:    csetm x13, ne
; CHECK-NEXT:    mul x8, x9, x8
; CHECK-NEXT:    fmov d1, x10
; CHECK-NEXT:    fmov d0, x13
; CHECK-NEXT:    mov v1.d[1], x8
; CHECK-NEXT:    mov v0.d[1], x12
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    ret
  %t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i64> %val, <2 x i64>* %p2
  ret <2 x i32> %res
}

define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
; CHECK-LABEL: umulo_v4i24:
; CHECK:       // %bb.0:
; CHECK-NEXT:    bic v1.4s, #255, lsl #24
; CHECK-NEXT:    bic v0.4s, #255, lsl #24
; CHECK-NEXT:    umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT:    umull v3.2d, v0.2s, v1.2s
; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    uzp2 v1.4s, v3.4s, v2.4s
; CHECK-NEXT:    ushr v2.4s, v0.4s, #24
; CHECK-NEXT:    mov w8, v0.s[3]
; CHECK-NEXT:    mov w9, v0.s[2]
; CHECK-NEXT:    mov w10, v0.s[1]
; CHECK-NEXT:    cmeq v1.4s, v1.4s, #0
; CHECK-NEXT:    fmov w11, s0
; CHECK-NEXT:    cmtst v2.4s, v2.4s, v2.4s
; CHECK-NEXT:    sturh w8, [x0, #9]
; CHECK-NEXT:    lsr w8, w8, #16
; CHECK-NEXT:    strh w9, [x0, #6]
; CHECK-NEXT:    lsr w9, w9, #16
; CHECK-NEXT:    sturh w10, [x0, #3]
; CHECK-NEXT:    orn v0.16b, v2.16b, v1.16b
; CHECK-NEXT:    strb w8, [x0, #11]
; CHECK-NEXT:    lsr w8, w10, #16
; CHECK-NEXT:    lsr w10, w11, #16
; CHECK-NEXT:    strh w11, [x0]
; CHECK-NEXT:    strb w9, [x0, #8]
; CHECK-NEXT:    strb w8, [x0, #5]
; CHECK-NEXT:    strb w10, [x0, #2]
; CHECK-NEXT:    ret
  %t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i24> %val, <4 x i24>* %p2
  ret <4 x i32> %res
}

define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
; CHECK-LABEL: umulo_v4i1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d2, d0
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    and v1.8b, v2.8b, v1.8b
; CHECK-NEXT:    umov w8, v1.h[1]
; CHECK-NEXT:    umov w9, v1.h[2]
; CHECK-NEXT:    umov w10, v1.h[0]
; CHECK-NEXT:    umov w11, v1.h[3]
; CHECK-NEXT:    and w8, w8, #0x1
; CHECK-NEXT:    and w9, w9, #0x1
; CHECK-NEXT:    bfi w10, w8, #1, #1
; CHECK-NEXT:    bfi w10, w9, #2, #1
; CHECK-NEXT:    bfi w10, w11, #3, #29
; CHECK-NEXT:    and w8, w10, #0xf
; CHECK-NEXT:    strb w8, [x0]
; CHECK-NEXT:    ret
  %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i1> %val, <4 x i1>* %p2
  ret <4 x i32> %res
}

define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
; CHECK-LABEL: umulo_v2i128:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmp x7, #0
; CHECK-NEXT:    umulh x8, x3, x6
; CHECK-NEXT:    mul x10, x7, x2
; CHECK-NEXT:    cset w9, ne
; CHECK-NEXT:    cmp x3, #0
; CHECK-NEXT:    umulh x11, x7, x2
; CHECK-NEXT:    cset w12, ne
; CHECK-NEXT:    madd x10, x3, x6, x10
; CHECK-NEXT:    cmp xzr, x8
; CHECK-NEXT:    umulh x8, x2, x6
; CHECK-NEXT:    cset w13, ne
; CHECK-NEXT:    cmp xzr, x11
; CHECK-NEXT:    cset w11, ne
; CHECK-NEXT:    adds x8, x8, x10
; CHECK-NEXT:    cset w10, hs
; CHECK-NEXT:    cmp x5, #0
; CHECK-NEXT:    cset w14, ne
; CHECK-NEXT:    cmp x1, #0
; CHECK-NEXT:    umulh x15, x1, x4
; CHECK-NEXT:    cset w16, ne
; CHECK-NEXT:    mul x17, x5, x0
; CHECK-NEXT:    and w14, w16, w14
; CHECK-NEXT:    umulh x16, x5, x0
; CHECK-NEXT:    cmp xzr, x15
; CHECK-NEXT:    madd x15, x1, x4, x17
; CHECK-NEXT:    cset w17, ne
; CHECK-NEXT:    umulh x18, x0, x4
; CHECK-NEXT:    cmp xzr, x16
; CHECK-NEXT:    orr w14, w14, w17
; CHECK-NEXT:    cset w16, ne
; CHECK-NEXT:    adds x15, x18, x15
; CHECK-NEXT:    orr w14, w14, w16
; CHECK-NEXT:    cset w16, hs
; CHECK-NEXT:    and w9, w12, w9
; CHECK-NEXT:    orr w12, w14, w16
; CHECK-NEXT:    orr w9, w9, w13
; CHECK-NEXT:    orr w9, w9, w11
; CHECK-NEXT:    mul x11, x0, x4
; CHECK-NEXT:    orr w9, w9, w10
; CHECK-NEXT:    ldr x10, [sp]
; CHECK-NEXT:    fmov s0, w12
; CHECK-NEXT:    stp x11, x15, [x10]
; CHECK-NEXT:    mov v0.s[1], w9
; CHECK-NEXT:    mul x9, x2, x6
; CHECK-NEXT:    shl v0.2s, v0.2s, #31
; CHECK-NEXT:    stp x9, x8, [x10, #16]
; CHECK-NEXT:    sshr v0.2s, v0.2s, #31
; CHECK-NEXT:    ret
  %t = call {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i128> %val, <2 x i128>* %p2
  ret <2 x i32> %res
}