Files
clang-p2996/llvm/test/CodeGen/AArch64/vecreduce-add.ll
David Green 76ea5feb1f [AArch64] Combine concat(binop, binop) into binop(concat, concat) (#89911)
This generalizes the existing combine for concat(radd, radd) to any
binops. For much the same reason as the existing code, pushing the
concat up through the tree is hopefully quicker than (or the same as) the
existing two half-vector operations, and can help combine away the
concat.
2024-04-25 12:12:49 +01:00

4993 lines
179 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-DOT
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT
; Add reduction of <2 x i32> with no extension. All four configurations share
; one expectation: a pairwise addp followed by fmov w0, s0.
define i32 @addv_v2i32(<2 x i32> %a) {
; CHECK-LABEL: addv_v2i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
ret i32 %arg1
}
; Add reduction of <4 x i16> returning i16. Every configuration expects a
; single addv h0, v0.4h.
define i16 @addv_v4i16(<4 x i16> %a) {
; CHECK-LABEL: addv_v4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv h0, v0.4h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
ret i16 %arg1
}
; Add reduction of <4 x i32> with no extension. Every configuration expects a
; single addv s0, v0.4s.
define i32 @add_v4i32_v4i32(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
ret i32 %z
}
; Add reduction of <8 x i8> returning i8. Every configuration expects a
; single addv b0, v0.8b.
define i8 @addv_v8i8(<8 x i8> %a) {
; CHECK-LABEL: addv_v8i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv b0, v0.8b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%arg1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
ret i8 %arg1
}
; Zero-extend <4 x i32> to <4 x i64>, then add-reduce. Every configuration
; expects the extend and the reduction to fold into one uaddlv.
define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv d0, v0.4s
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Sign-extend <4 x i32> to <4 x i64>, then add-reduce. Every configuration
; expects the extend and the reduction to fold into one saddlv.
define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv d0, v0.4s
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Zero-extend <2 x i32> to <2 x i64>, then add-reduce. No widening reduction
; is expected here: every configuration wants an explicit ushll, then addp.
define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Sign-extend <2 x i32> to <2 x i64>, then add-reduce. No widening reduction
; is expected here: every configuration wants an explicit sshll, then addp.
define i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Zero-extend <8 x i16> to <8 x i32>, then add-reduce. Every configuration
; expects a single uaddlv s0, v0.8h.
define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv s0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Sign-extend <8 x i16> to <8 x i32>, then add-reduce. Every configuration
; expects a single saddlv s0, v0.8h.
define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv s0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Zero-extend <4 x i16> to <4 x i32>, then add-reduce. The non-GlobalISel runs
; expect ushll + addv, while the GlobalISel runs expect a single uaddlv.
define i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i32_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i16_v4i32_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Sign-extend <4 x i16> to <4 x i32>, then add-reduce. The non-GlobalISel runs
; expect sshll + addv, while the GlobalISel runs expect a single saddlv.
define i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i32_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i16_v4i32_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Add reduction of <8 x i16> with a zeroext i16 return. Both selectors use
; addv h0; the GlobalISel runs additionally expect an explicit uxth.
define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
; CHECK-SD-LABEL: add_v8i16_v8i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i16_v8i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv h0, v0.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: uxth w0, w8
; CHECK-GI-NEXT: ret
entry:
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
ret i16 %z
}
; Zero-extend <8 x i16> to <8 x i64>, then add-reduce. The non-GlobalISel runs
; expect a ushll/uaddl/add tree ending in addp; the GlobalISel runs expect
; uaddlv s0 plus a lane move into w0.
define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
; CHECK-SD-LABEL: add_v8i16_v8i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.8h
; CHECK-GI-NEXT: mov w0, v0.s[0]
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Sign-extend <8 x i16> to <8 x i64>, then add-reduce. The non-GlobalISel runs
; expect an sshll/saddl/add tree ending in addp; the GlobalISel runs expect
; saddlv s0 plus an smov of lane 0 into x0.
define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
; CHECK-SD-LABEL: add_v8i16_v8i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.8h
; CHECK-GI-NEXT: smov x0, v0.s[0]
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Zero-extend <4 x i16> to <4 x i64>, then add-reduce. The non-GlobalISel runs
; expect ushll + uaddlv d0; the GlobalISel runs expect uaddlv s0 on the
; original 4h input plus a lane move into w0.
define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: mov w0, v0.s[0]
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Sign-extend <4 x i16> to <4 x i64>, then add-reduce. The non-GlobalISel runs
; expect sshll + saddlv d0; the GlobalISel runs expect saddlv s0 on the
; original 4h input plus an smov of lane 0 into x0.
define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: smov x0, v0.s[0]
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Zero-extend <2 x i16> to <2 x i64>, then add-reduce. Both selectors mask,
; widen with ushll and finish with addp; they differ in whether the mask is
; applied before the ushll (non-GlobalISel) or after it (GlobalISel).
define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
; CHECK-SD-LABEL: add_v2i16_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v2i16_v2i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v1.2d, #0x0000000000ffff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: fmov x0, d0
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Sign-extend <2 x i16> to <2 x i64>, then add-reduce. Every configuration
; expects ushll, a shl/sshr pair by 48, and a final addp.
define i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #48
; CHECK-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Zero-extend <16 x i8> to <16 x i32>, then add-reduce. With -mattr=+dotprod
; both selectors expect udot against a vector of ones followed by addv.
; Without it, the non-GlobalISel run expects a ushll/uaddl/add tree and the
; GlobalISel run expects uaddlv h0 plus a 16-bit mask of the result.
define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.16b, #1
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v2.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.16b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addv s0, v2.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
ret i32 %z
}
; Sign-extend <16 x i8> to <16 x i32>, then add-reduce. With -mattr=+dotprod
; both selectors expect sdot against a vector of ones followed by addv.
; Without it, the non-GlobalISel run expects an sshll/saddl/add tree and the
; GlobalISel run expects saddlv h0 plus sxth of the result.
define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.16b, #1
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v2.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.16b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addv s0, v2.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
ret i32 %z
}
; Zero-extend <8 x i8> to <8 x i32>, then add-reduce. With -mattr=+dotprod
; both selectors expect a 64-bit udot against a vector of ones followed by
; addp. Without it, the non-GlobalISel run expects ushll + uaddlv s0 and the
; GlobalISel run expects uaddlv h0 plus a 16-bit mask of the result.
define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: udot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Sign-extend <8 x i8> to <8 x i32>, then add-reduce. With -mattr=+dotprod
; both selectors expect a 64-bit sdot against a vector of ones followed by
; addp. Without it, the non-GlobalISel run expects sshll + saddlv s0 and the
; GlobalISel run expects saddlv h0 plus sxth of the result.
define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: sdot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Zero-extend <4 x i8> to <4 x i32>, then add-reduce. The non-GlobalISel runs
; expect bic on the 4h lanes, ushll and addv; the GlobalISel runs expect an
; and with a 0xff00ff00ff00ff mask, uaddlv s0 and a 16-bit mask of the result.
define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i32_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i32_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Sign-extend <4 x i8> to <4 x i32>, then add-reduce. The non-GlobalISel runs
; expect ushll, a shl/sshr pair by 24 and addv; the GlobalISel runs expect a
; shl/sshr pair by 8 on the 4h lanes, saddlv s0 and sxth of the result.
define i32 @add_v4i8_v4i32_sext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i32_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: shl v0.4s, v0.4s, #24
; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i32_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: sxth w0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Zero-extend <16 x i8> to <16 x i16>, then add-reduce, with a zeroext i16
; return. The non-GlobalISel runs expect uaddlp + addv; the GlobalISel runs
; expect uaddlv h0 plus a 16-bit mask of the result.
define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i16_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddlp v0.8h, v0.16b
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i16_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
ret i16 %z
}
; Sign-extend <16 x i8> to <16 x i16>, then add-reduce, with a signext i16
; return. The non-GlobalISel runs expect saddlp + addv + smov; the GlobalISel
; runs expect saddlv h0 plus sxth of the result.
define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i16_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddlp v0.8h, v0.16b
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: smov w0, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i16_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: sxth w0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
ret i16 %z
}
; Zero-extend <8 x i8> to <8 x i16>, then add-reduce, with a zeroext i16
; return. The non-GlobalISel runs expect ushll + addv; the GlobalISel runs
; expect uaddlv h0 plus a 16-bit mask of the result.
define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i16_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i16_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
ret i16 %z
}
; Sign-extend <8 x i8> to <8 x i16>, then add-reduce, with a signext i16
; return. The non-GlobalISel runs expect sshll + addv + smov; the GlobalISel
; runs expect saddlv h0 plus sxth of the result.
define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i16_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: smov w0, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i16_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: sxth w0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
ret i16 %z
}
; Add reduction of <16 x i8> with a zeroext i8 return. Both selectors use
; addv b0; the GlobalISel runs additionally expect an explicit uxtb.
define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: addv b0, v0.16b
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv b0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: uxtb w0, w8
; CHECK-GI-NEXT: ret
entry:
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
ret i8 %z
}
; Zero-extend <16 x i8> to <16 x i64>, then add-reduce. The non-GlobalISel
; runs expect two levels of ushll/ushll2, uaddl/uaddl2, vector adds and a
; final addp; the GlobalISel runs expect uaddlv h0 plus a 64-bit and with
; 0xffff.
define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll2 v2.4s, v1.8h, #0
; CHECK-SD-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-NEXT: uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: and x0, x8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
ret i64 %z
}
; Sign-extend <16 x i8> to <16 x i64>, then add-reduce. The non-GlobalISel
; runs expect two levels of sshll/sshll2, saddl/saddl2, vector adds and a
; final addp; the GlobalISel runs expect saddlv h0 plus sxth into x0.
define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: sshll2 v2.4s, v1.8h, #0
; CHECK-SD-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-NEXT: saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-NEXT: saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: sxth x0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
ret i64 %z
}
; Zero-extend <8 x i8> to <8 x i64>, then add-reduce. The non-GlobalISel runs
; expect ushll/ushll2, uaddl/uaddl2, add and addp; the GlobalISel runs expect
; uaddlv h0 plus a 64-bit and with 0xffff.
define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: and x0, x8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Sign-extend <8 x i8> to <8 x i64>, then add-reduce. The non-GlobalISel runs
; expect sshll/sshll2, saddl/saddl2, add and addp; the GlobalISel runs expect
; saddlv h0 plus sxth into x0.
define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: sxth x0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Zero-extend <4 x i8> to <4 x i64>, then add-reduce. The non-GlobalISel runs
; expect bic on the 4h lanes, ushll and uaddlv d0; the GlobalISel runs expect
; an and with a 0xff00ff00ff00ff mask, uaddlv s0 and a 64-bit and with 0xffff.
define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: and x0, x8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Sign-extend <4 x i8> to <4 x i64>, then add-reduce. The non-GlobalISel runs
; expect ushll/ushll2 widening to 2d, shl by 56, then sshr/ssra by 56 and
; addp; the GlobalISel runs expect a shl/sshr pair by 8 on the 4h lanes,
; saddlv s0 and sxth into x0.
define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-SD-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: sxth x0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Zero-extend <2 x i8> to <2 x i64>, then add-reduce. Both selectors mask,
; widen with ushll and finish with addp; they differ in whether the mask is
; applied before the ushll (non-GlobalISel) or after it (GlobalISel).
define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK-SD-LABEL: add_v2i8_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v2i8_v2i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: fmov x0, d0
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Sign-extend <2 x i8> to <2 x i64>, then add-reduce. Every configuration
; expects ushll, a shl/sshr pair by 56, and a final addp.
define i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #56
; CHECK-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Add reduction of <2 x i64> with no extension. Every configuration expects a
; single addp d0, v0.2d.
define i64 @add_v2i64_v2i64(<2 x i64> %x) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
ret i64 %z
}
; Add reduction of <4 x i32> whose result is added to the scalar %a. Every
; configuration expects addv, a move to w8, and a scalar add with w0.
define i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extend <4 x i32> to <4 x i64>, add-reduce, then add the scalar %a.
; Every configuration expects uaddlv d0 followed by a scalar add with x0.
define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv d0, v0.4s
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extend <4 x i32> to <4 x i64>, add-reduce, then add the scalar %a.
; Every configuration expects saddlv d0 followed by a scalar add with x0.
define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv d0, v0.4s
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extend <2 x i32> to <2 x i64>, add-reduce, then add the scalar %a.
; Every configuration expects ushll, addp, and a scalar add with x0.
define i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extend <2 x i32> to <2 x i64>, add-reduce, then add the scalar %a.
; Every configuration expects sshll, addp, and a scalar add with x0.
define i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extend <8 x i16> to <8 x i32>, add-reduce, then add the scalar %a.
; Every configuration expects uaddlv s0 followed by a scalar add with w0.
define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv s0, v0.8h
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extend <8 x i16> to <8 x i32>, add-reduce, then add the scalar %a.
; Every configuration expects saddlv s0 followed by a scalar add with w0.
define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv s0, v0.8h
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extend <4 x i16> to <4 x i32>, add-reduce, then add the scalar %a.
; The non-GlobalISel runs expect ushll + addv before the scalar add; the
; GlobalISel runs expect a single uaddlv before it.
define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
; CHECK-SD-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w0, w8, w0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w8, w0
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extend <4 x i16> to <4 x i32>, add-reduce, then add the scalar %a.
; The non-GlobalISel runs expect sshll + addv before the scalar add; the
; GlobalISel runs expect a single saddlv before it.
define i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
; CHECK-SD-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w0, w8, w0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w8, w0
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Add reduction of <8 x i16> plus the scalar %a, with a zeroext i16 return.
; Both selectors expect addv h0 and a final and with 0xffff; the GlobalISel
; runs fold a uxth extension of the reduction into the scalar add.
define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
; CHECK-SD-LABEL: add_v8i16_v8i16_acc:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w8, w8, w0
; CHECK-SD-NEXT: and w0, w8, #0xffff
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i16_v8i16_acc:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv h0, v0.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w8, w0, w8, uxth
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
%r = add i16 %z, %a
ret i16 %r
}
; Zero-extend <8 x i16> to <8 x i64>, add-reduce, then add the scalar %a.
; The non-GlobalISel runs expect a ushll/uaddl/add tree, addp and a scalar
; add; the GlobalISel runs expect uaddlv s0 and a scalar add that applies
; uxtw to the reduction.
define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, uxtw
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extend <8 x i16> to <8 x i64>, add-reduce, then add the scalar %a.
; The non-GlobalISel runs expect an sshll/saddl/add tree, addp and a scalar
; add; the GlobalISel runs expect saddlv s0 and a scalar add that applies
; sxtw to the reduction.
define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, sxtw
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extend <4 x i16> to <4 x i64>, add-reduce, then add the scalar %a.
; The non-GlobalISel runs expect ushll + uaddlv d0 and a scalar add; the
; GlobalISel runs expect uaddlv s0 and a scalar add that applies uxtw.
define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, uxtw
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extend <4 x i16> to <4 x i64>, add-reduce, then add the scalar %a.
; The non-GlobalISel runs expect sshll + saddlv d0 and a scalar add; the
; GlobalISel runs expect saddlv s0 and a scalar add that applies sxtw.
define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, sxtw
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends <2 x i16> to <2 x i64>, reduces and adds the accumulator %a.
; Both selectors mask each element down to 16 bits and finish with addp. The
; default runs mask the 64-bit input before ushll, while the -global-isel runs
; widen first and mask the full 128-bit register afterwards.
define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v1.2d, #0x0000000000ffff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: add x0, x8, x0
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends <2 x i16> to <2 x i64>, reduces and adds the accumulator %a.
; All four run configurations share one expected sequence, which widens with
; ushll, sign-extends in register through a shl/sshr pair by 48 bits, and then
; combines the two lanes with addp before the scalar add.
define i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #48
; CHECK-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends <16 x i8> to <16 x i32>, reduces and adds the accumulator %a.
; Expected code differs per configuration.
;  - default llc without dotprod uses staged ushll/uaddl widening, then addv
;  - both selectors with +dotprod use udot against a vector of ones into a
;    zeroed accumulator, then addv
;  - -global-isel without dotprod uses one uaddlv to a 16-bit sum that is
;    zero-extended inside the add (uxth)
define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.16b, #1
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v2.4s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w0, w8, uxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.16b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addv s0, v2.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Signed counterpart of the <16 x i8> to <16 x i32> accumulate test.
;  - default llc without dotprod uses staged sshll/saddl widening, then addv
;  - both selectors with +dotprod use sdot against a vector of ones into a
;    zeroed accumulator, then addv
;  - -global-isel without dotprod uses one saddlv to a 16-bit sum that is
;    sign-extended inside the add (sxth)
define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.16b, #1
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v2.4s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w0, w8, sxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.16b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addv s0, v2.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends <8 x i8> to <8 x i32>, reduces and adds the accumulator %a.
;  - default llc without dotprod widens with ushll and reduces with uaddlv
;  - both selectors with +dotprod use a 64-bit udot against ones and fold the
;    two 32-bit lanes with addp (the two selectors differ only in register
;    assignment and the order of the movi instructions)
;  - -global-isel without dotprod uses uaddlv on the .8b input and uxth
define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: udot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w0, w8, uxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Signed counterpart of the <8 x i8> to <8 x i32> accumulate test.
;  - default llc without dotprod widens with sshll and reduces with saddlv
;  - both selectors with +dotprod use a 64-bit sdot against ones and fold the
;    two 32-bit lanes with addp
;  - -global-isel without dotprod uses saddlv on the .8b input and sxth
define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: sdot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w0, w8, sxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends <4 x i8> to <4 x i32>, reduces and adds the accumulator %a.
; The default runs clear the high byte of each 16-bit lane with bic, widen
; with ushll and reduce with addv. The -global-isel runs mask with an and,
; reduce the .4h lanes with uaddlv and zero-extend the sum in the add (uxth).
define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
; CHECK-SD-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w0, w8, w0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w0, w8, uxth
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends <4 x i8> to <4 x i32>, reduces and adds the accumulator %a.
; The default runs widen to .4s and sign-extend with a shl/sshr pair by 24
; before addv. The -global-isel runs sign-extend within the 16-bit lanes
; (shl/sshr by 8), reduce with saddlv and use sxth in the final add.
define i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) {
; CHECK-SD-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: shl v0.4s, v0.4s, #24
; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w0, w8, w0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w0, w8, sxth
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends <16 x i8> to <16 x i16>, reduces and adds the i16 accumulator
; %a. The return value is declared zeroext. All run configurations expect the
; same code, a single uaddlv followed by the scalar add and a mask with
; 0xffff.
define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv h0, v0.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w8, w8, w0
; CHECK-NEXT: and w0, w8, #0xffff
; CHECK-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Sign-extends <16 x i8> to <16 x i16>, reduces and adds the i16 accumulator
; %a. The return value is declared signext. All run configurations expect a
; single saddlv followed by the scalar add and an sxth of the result.
define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv h0, v0.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w8, w8, w0
; CHECK-NEXT: sxth w0, w8
; CHECK-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Zero-extends <8 x i8> to <8 x i16>, reduces and adds the i16 accumulator %a
; with a zeroext return. The default runs widen with ushll and reduce with
; addv, while the -global-isel runs use uaddlv on the .8b input. Both then add
; the accumulator and mask the result with 0xffff.
define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
; CHECK-SD-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w8, w8, w0
; CHECK-SD-NEXT: and w0, w8, #0xffff
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w8, w8, w0
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Sign-extends <8 x i8> to <8 x i16>, reduces and adds the i16 accumulator %a
; with a signext return. The default runs widen with sshll and reduce with
; addv, while the -global-isel runs use saddlv on the .8b input. Both then add
; the accumulator and apply sxth to the result.
define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
; CHECK-SD-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w8, w8, w0
; CHECK-SD-NEXT: sxth w0, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w8, w8, w0
; CHECK-GI-NEXT: sxth w0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Reduces <16 x i8> with no extension and adds the i8 accumulator %a, with a
; zeroext return. Both selectors reduce with addv to a byte and mask the final
; value with 0xff. They differ in the scalar add, where the -global-isel runs
; additionally apply uxtb to the reduced value.
define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
; CHECK-SD-LABEL: add_v16i8_v16i8_acc:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: addv b0, v0.16b
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w8, w8, w0
; CHECK-SD-NEXT: and w0, w8, #0xff
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i8_acc:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv b0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w8, w0, w8, uxtb
; CHECK-GI-NEXT: and w0, w8, #0xff
; CHECK-GI-NEXT: ret
entry:
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
%r = add i8 %z, %a
ret i8 %r
}
; Zero-extends <16 x i8> to <16 x i64>, reduces and adds the accumulator %a.
; The default runs expect two levels of ushll/ushll2 widening, four
; uaddl/uaddl2 instructions, a tree of vector adds and a final addp. The
; -global-isel runs expect one uaddlv to a 16-bit sum with uxth in the add.
define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll2 v2.4s, v1.8h, #0
; CHECK-SD-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-NEXT: uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, uxth
; CHECK-GI-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Signed counterpart of the <16 x i8> to <16 x i64> accumulate test. The
; default runs expect two levels of sshll/sshll2 widening, four saddl/saddl2
; instructions, a tree of vector adds and a final addp. The -global-isel runs
; expect one saddlv to a 16-bit sum with sxth in the add.
define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: sshll2 v2.4s, v1.8h, #0
; CHECK-SD-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-NEXT: saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-NEXT: saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, sxth
; CHECK-GI-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends <8 x i8> to <8 x i64>, reduces and adds the accumulator %a.
; The default runs widen to .8h and then .4s with ushll/ushll2, combine with
; uaddl/uaddl2 plus a vector add, and finish with addp. The -global-isel runs
; expect one uaddlv on the .8b input with uxth in the add.
define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, uxth
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Signed counterpart of the <8 x i8> to <8 x i64> accumulate test. The default
; runs widen with sshll/sshll2, combine with saddl/saddl2 plus a vector add,
; and finish with addp. The -global-isel runs expect one saddlv on the .8b
; input with sxth in the add.
define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, sxth
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends <4 x i8> to <4 x i64>, reduces and adds the accumulator %a.
; The default runs clear the high byte of each 16-bit lane with bic, widen
; with ushll and reduce with a 64-bit uaddlv. The -global-isel runs mask with
; an and, reduce the .4h lanes with uaddlv and use uxth in the final add.
define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, uxth
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends <4 x i8> to <4 x i64>, reduces and adds the accumulator %a.
; The default runs widen both halves to .2d, shift each left by 56, then use
; sshr on one half and ssra to shift and accumulate the other before addp.
; The -global-isel runs sign-extend within the 16-bit lanes (shl/sshr by 8),
; reduce with saddlv and use sxth in the final add.
define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-SD-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, sxth
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends <2 x i8> to <2 x i64>, reduces and adds the accumulator %a.
; Both selectors mask each element down to 8 bits and finish with addp. The
; default runs mask the 64-bit input before ushll, while the -global-isel runs
; widen first and mask the full 128-bit register afterwards.
define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: add x0, x8, x0
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends <2 x i8> to <2 x i64>, reduces and adds the accumulator %a.
; All run configurations share one expected sequence, which widens with
; ushll, sign-extends in register through a shl/sshr pair by 56 bits, and then
; combines the two lanes with addp before the scalar add.
define i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #56
; CHECK-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Reduces <2 x i64> with no extension and adds the accumulator %a. All run
; configurations expect addp to combine the two lanes, a move to a general
; register and a scalar add.
define i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
%r = add i64 %z, %a
ret i64 %r
}
; Adds the results of two independent <4 x i32> reductions. The default runs
; expect the two inputs to be added as vectors first so that only one addv is
; needed. The -global-isel runs expect two separate addv reductions followed
; by a scalar add.
define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-SD-LABEL: add_pair_v4i32_v4i32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i32_v4i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
%z = add i32 %z1, %z2
ret i32 %z
}
; Adds two reductions of <4 x i32> values zero-extended to <4 x i64>. The
; default runs expect uaddlp on %y, uadalp to accumulate %x into it, and one
; addp. The -global-isel runs expect a uaddlv per input and a scalar add.
define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-SD-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv d0, v0.4s
; CHECK-GI-NEXT: uaddlv d1, v1.4s
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = zext <4 x i32> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Adds two reductions of <4 x i32> values sign-extended to <4 x i64>. The
; default runs expect saddlp on %y, sadalp to accumulate %x into it, and one
; addp. The -global-isel runs expect a saddlv per input and a scalar add.
define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-SD-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddlp v1.2d, v1.4s
; CHECK-SD-NEXT: sadalp v1.2d, v0.4s
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv d0, v0.4s
; CHECK-GI-NEXT: saddlv d1, v1.4s
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = sext <4 x i32> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Adds two reductions of <2 x i32> values zero-extended to <2 x i64>. The
; default runs expect the two 64-bit inputs to be packed into one 128-bit
; register (mov of the d lane) and reduced by a single uaddlv. The
; -global-isel runs expect each input to be widened with ushll and reduced
; with addp, followed by a scalar add.
define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-SD-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = zext <2 x i32> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Adds two reductions of <2 x i32> values sign-extended to <2 x i64>. The
; default runs expect a single saddl of the two inputs followed by addp. The
; -global-isel runs expect each input to be widened with sshll and reduced
; with addp, followed by a scalar add.
define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-SD-LABEL: add_pair_v2i32_v2i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i32_v2i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = sext <2 x i32> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Adds two reductions of <8 x i16> values zero-extended to <8 x i32>. The
; default runs expect uaddlp on %y, uadalp to accumulate %x into it, and one
; addv. The -global-isel runs expect a uaddlv per input and a scalar add.
define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-NEXT: addv s0, v1.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.8h
; CHECK-GI-NEXT: uaddlv s1, v1.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = zext <8 x i16> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Adds two reductions of <8 x i16> values sign-extended to <8 x i32>. The
; default runs expect saddlp on %y, sadalp to accumulate %x into it, and one
; addv. The -global-isel runs expect a saddlv per input and a scalar add.
define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddlp v1.4s, v1.8h
; CHECK-SD-NEXT: sadalp v1.4s, v0.8h
; CHECK-SD-NEXT: addv s0, v1.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.8h
; CHECK-GI-NEXT: saddlv s1, v1.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = sext <8 x i16> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Adds two reductions of <4 x i16> values zero-extended to <4 x i32>. The
; default runs expect the two 64-bit inputs to be packed into one 128-bit
; register (mov of the d lane) and reduced by a single uaddlv over .8h. The
; -global-isel runs expect a uaddlv per .4h input and a scalar add.
define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: uaddlv s0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: uaddlv s1, v1.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = zext <4 x i16> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Adds two reductions of <4 x i16> values sign-extended to <4 x i32>. The
; default runs expect a single saddl of the two inputs followed by addv. The
; -global-isel runs expect a saddlv per .4h input and a scalar add.
define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v4i16_v4i32_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i16_v4i32_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: saddlv s1, v1.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = sext <4 x i16> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Unsigned dot-product pattern on <8 x i8> inputs. Both operands are
; zero-extended to <8 x i32>, multiplied with nuw nsw, and the products are
; reduced with vector.reduce.add. Without dotprod, both selectors expect the
; same ushll widening followed by umull/umlal2 and addv. With +dotprod, both
; expect a 64-bit udot into a zeroed accumulator followed by addp.
define i32 @test_udot_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_udot_v8i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: umull v2.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v2.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v8i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v0.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_udot_v8i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: umull v2.4s, v1.4h, v0.4h
; CHECK-GI-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v2.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v8i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.2s, v1.8b, v0.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%0 = zext <8 x i8> %a to <8 x i32>
%1 = zext <8 x i8> %b to <8 x i32>
%2 = mul nuw nsw <8 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
ret i32 %3
}
; Unsigned dot-product pattern on <16 x i8> inputs. Both operands are
; zero-extended to <16 x i32>, multiplied with nuw nsw, and the products are
; reduced with vector.reduce.add. With +dotprod, both selectors expect a
; 128-bit udot into a zeroed accumulator followed by addv. Without dotprod,
; both widen with ushll/ushll2 and use two accumulators, but they pair the
; multiplies differently (umull/umull2 with umlal/umlal2 for default llc,
; umull/umull with umlal2/umlal2 for -global-isel).
define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_udot_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: umull v4.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: umlal v4.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v4.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v16i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v2.4s, v1.16b, v0.16b
; CHECK-SD-DOT-NEXT: addv s0, v2.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_udot_v16i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: umull v4.4s, v3.4h, v2.4h
; CHECK-GI-BASE-NEXT: umull v5.4s, v1.4h, v0.4h
; CHECK-GI-BASE-NEXT: umlal2 v4.4s, v3.8h, v2.8h
; CHECK-GI-BASE-NEXT: umlal2 v5.4s, v1.8h, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v4.4s, v5.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v16i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.4s, v1.16b, v0.16b
; CHECK-GI-DOT-NEXT: addv s0, v2.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%0 = zext <16 x i8> %a to <16 x i32>
%1 = zext <16 x i8> %b to <16 x i32>
%2 = mul nuw nsw <16 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
ret i32 %3
}
; Unsigned dot-product pattern on <24 x i8> vectors loaded from %p1 and %p2.
; In every configuration each 24-byte vector is loaded as a 16-byte q register
; plus an 8-byte d register at offset 16.
;  - default llc without dotprod widens with ushll/ushll2 and accumulates
;    with umull/umlal into two vectors that are added before one addv
;  - default llc with +dotprod issues a 128-bit and a 64-bit udot, reduces
;    them separately (addv and addp) and adds the two scalars
;  - -global-isel without dotprod produces six umull/umull2 partial products,
;    reduces each with addv and sums them with scalar adds
;  - -global-isel with +dotprod issues two 128-bit udots (the d-loaded tail is
;    used as a .16b operand), adds the accumulators and reduces with one addv
define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
; CHECK-SD-BASE-LABEL: test_udot_v24i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ldr q0, [x0]
; CHECK-SD-BASE-NEXT: ldr q1, [x1]
; CHECK-SD-BASE-NEXT: ldr d4, [x0, #16]
; CHECK-SD-BASE-NEXT: ldr d5, [x1, #16]
; CHECK-SD-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: umull v6.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: ushll v3.8h, v4.8b, #0
; CHECK-SD-BASE-NEXT: ushll v4.8h, v5.8b, #0
; CHECK-SD-BASE-NEXT: umlal2 v2.4s, v4.8h, v3.8h
; CHECK-SD-BASE-NEXT: umlal v6.4s, v4.4h, v3.4h
; CHECK-SD-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: umlal v6.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v6.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v24i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q2, [x0]
; CHECK-SD-DOT-NEXT: ldr q3, [x1]
; CHECK-SD-DOT-NEXT: ldr d4, [x0, #16]
; CHECK-SD-DOT-NEXT: ldr d5, [x1, #16]
; CHECK-SD-DOT-NEXT: udot v1.2s, v5.8b, v4.8b
; CHECK-SD-DOT-NEXT: udot v0.4s, v3.16b, v2.16b
; CHECK-SD-DOT-NEXT: addp v1.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w8, s1
; CHECK-SD-DOT-NEXT: fmov w9, s0
; CHECK-SD-DOT-NEXT: add w0, w9, w8
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_udot_v24i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ldr q0, [x0]
; CHECK-GI-BASE-NEXT: ldr q1, [x1]
; CHECK-GI-BASE-NEXT: ldr d2, [x0, #16]
; CHECK-GI-BASE-NEXT: ldr d3, [x1, #16]
; CHECK-GI-BASE-NEXT: ushll v4.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v5.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: ushll v3.8h, v3.8b, #0
; CHECK-GI-BASE-NEXT: umull v6.4s, v5.4h, v4.4h
; CHECK-GI-BASE-NEXT: umull2 v4.4s, v5.8h, v4.8h
; CHECK-GI-BASE-NEXT: umull2 v5.4s, v1.8h, v0.8h
; CHECK-GI-BASE-NEXT: umull v7.4s, v3.4h, v2.4h
; CHECK-GI-BASE-NEXT: umull v0.4s, v1.4h, v0.4h
; CHECK-GI-BASE-NEXT: umull2 v1.4s, v3.8h, v2.8h
; CHECK-GI-BASE-NEXT: addv s2, v6.4s
; CHECK-GI-BASE-NEXT: addv s3, v4.4s
; CHECK-GI-BASE-NEXT: addv s4, v5.4s
; CHECK-GI-BASE-NEXT: addv s5, v7.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s2
; CHECK-GI-BASE-NEXT: fmov w9, s3
; CHECK-GI-BASE-NEXT: fmov w10, s4
; CHECK-GI-BASE-NEXT: fmov w11, s5
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: add w10, w10, w11
; CHECK-GI-BASE-NEXT: fmov w11, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: add w9, w10, w11
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v24i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q2, [x0]
; CHECK-GI-DOT-NEXT: ldr d3, [x0, #16]
; CHECK-GI-DOT-NEXT: ldr q4, [x1]
; CHECK-GI-DOT-NEXT: ldr d5, [x1, #16]
; CHECK-GI-DOT-NEXT: udot v1.4s, v4.16b, v2.16b
; CHECK-GI-DOT-NEXT: udot v0.4s, v5.16b, v3.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%a = load <24 x i8>, ptr %p1
%b = load <24 x i8>, ptr %p2
%0 = zext <24 x i8> %a to <24 x i32>
%1 = zext <24 x i8> %b to <24 x i32>
%2 = mul nuw nsw <24 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2)
ret i32 %3
}
; Unsigned dot-product reduction over two <48 x i8> vectors loaded from %p1 and
; %p2 (three 16-byte registers per input): zext to i32, element-wise multiply,
; llvm.vector.reduce.add.
; Expected lowering, as pinned by the assertions:
;  - SelectionDAG, no dotprod: four umull/umull2 accumulators extended with
;    umlal/umlal2, three vector adds, one addv.
;  - SelectionDAG, +dotprod: three udots chained into a single accumulator (v0),
;    one addv.
;  - GlobalISel, no dotprod: twelve umull/umull2 products, each reduced by its
;    own addv and combined with scalar adds.
;  - GlobalISel, +dotprod: three udots into three separate accumulators, each
;    reduced with addv and summed with scalar adds.
define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
; CHECK-SD-BASE-LABEL: test_udot_v48i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ldp q0, q4, [x1]
; CHECK-SD-BASE-NEXT: ldr q2, [x0, #32]
; CHECK-SD-BASE-NEXT: ldp q1, q3, [x0]
; CHECK-SD-BASE-NEXT: ldr q7, [x1, #32]
; CHECK-SD-BASE-NEXT: ushll2 v16.8h, v2.16b, #0
; CHECK-SD-BASE-NEXT: ushll2 v6.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v17.8h, v7.16b, #0
; CHECK-SD-BASE-NEXT: ushll2 v5.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: umull2 v18.4s, v6.8h, v5.8h
; CHECK-SD-BASE-NEXT: umull v19.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: umull v5.4s, v6.4h, v5.4h
; CHECK-SD-BASE-NEXT: umull2 v0.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: ushll v1.8h, v2.8b, #0
; CHECK-SD-BASE-NEXT: ushll v2.8h, v7.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v6.8h, v3.16b, #0
; CHECK-SD-BASE-NEXT: ushll2 v7.8h, v4.16b, #0
; CHECK-SD-BASE-NEXT: umlal2 v18.4s, v17.8h, v16.8h
; CHECK-SD-BASE-NEXT: umlal v5.4s, v17.4h, v16.4h
; CHECK-SD-BASE-NEXT: umlal v19.4s, v2.4h, v1.4h
; CHECK-SD-BASE-NEXT: umlal2 v0.4s, v2.8h, v1.8h
; CHECK-SD-BASE-NEXT: ushll v1.8h, v3.8b, #0
; CHECK-SD-BASE-NEXT: ushll v2.8h, v4.8b, #0
; CHECK-SD-BASE-NEXT: umlal2 v18.4s, v7.8h, v6.8h
; CHECK-SD-BASE-NEXT: umlal v5.4s, v7.4h, v6.4h
; CHECK-SD-BASE-NEXT: umlal v19.4s, v2.4h, v1.4h
; CHECK-SD-BASE-NEXT: umlal2 v0.4s, v2.8h, v1.8h
; CHECK-SD-BASE-NEXT: add v1.4s, v19.4s, v5.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v18.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v48i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q1, [x0, #32]
; CHECK-SD-DOT-NEXT: ldr q2, [x1, #32]
; CHECK-SD-DOT-NEXT: udot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: ldp q3, q1, [x0]
; CHECK-SD-DOT-NEXT: ldp q4, q2, [x1]
; CHECK-SD-DOT-NEXT: udot v0.4s, v4.16b, v3.16b
; CHECK-SD-DOT-NEXT: udot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_udot_v48i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1]
; CHECK-GI-BASE-NEXT: ldr q6, [x0, #32]
; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0]
; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32]
; CHECK-GI-BASE-NEXT: ushll v20.8h, v6.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v6.16b, #0
; CHECK-GI-BASE-NEXT: ushll v4.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v16.8h, v3.8b, #0
; CHECK-GI-BASE-NEXT: ushll v5.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: ushll v17.8h, v2.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v3.8h, v3.16b, #0
; CHECK-GI-BASE-NEXT: ushll2 v2.8h, v2.16b, #0
; CHECK-GI-BASE-NEXT: umull v18.4s, v4.4h, v5.4h
; CHECK-GI-BASE-NEXT: umull2 v4.4s, v4.8h, v5.8h
; CHECK-GI-BASE-NEXT: umull v5.4s, v0.4h, v1.4h
; CHECK-GI-BASE-NEXT: umull2 v0.4s, v0.8h, v1.8h
; CHECK-GI-BASE-NEXT: umull v19.4s, v16.4h, v17.4h
; CHECK-GI-BASE-NEXT: ushll v1.8h, v7.8b, #0
; CHECK-GI-BASE-NEXT: umull2 v16.4s, v16.8h, v17.8h
; CHECK-GI-BASE-NEXT: umull v17.4s, v3.4h, v2.4h
; CHECK-GI-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h
; CHECK-GI-BASE-NEXT: ushll2 v7.8h, v7.16b, #0
; CHECK-GI-BASE-NEXT: addv s18, v18.4s
; CHECK-GI-BASE-NEXT: addv s4, v4.4s
; CHECK-GI-BASE-NEXT: addv s5, v5.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s19, v19.4s
; CHECK-GI-BASE-NEXT: umull v3.4s, v1.4h, v20.4h
; CHECK-GI-BASE-NEXT: addv s2, v2.4s
; CHECK-GI-BASE-NEXT: umull2 v1.4s, v1.8h, v20.8h
; CHECK-GI-BASE-NEXT: umull v20.4s, v7.4h, v6.4h
; CHECK-GI-BASE-NEXT: fmov w8, s18
; CHECK-GI-BASE-NEXT: fmov w9, s4
; CHECK-GI-BASE-NEXT: fmov w10, s5
; CHECK-GI-BASE-NEXT: fmov w11, s0
; CHECK-GI-BASE-NEXT: fmov w12, s19
; CHECK-GI-BASE-NEXT: addv s4, v16.4s
; CHECK-GI-BASE-NEXT: addv s5, v17.4s
; CHECK-GI-BASE-NEXT: addv s3, v3.4s
; CHECK-GI-BASE-NEXT: umull2 v0.4s, v7.8h, v6.8h
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: add w9, w11, w12
; CHECK-GI-BASE-NEXT: add w8, w8, w10
; CHECK-GI-BASE-NEXT: fmov w10, s4
; CHECK-GI-BASE-NEXT: fmov w11, s5
; CHECK-GI-BASE-NEXT: fmov w12, s2
; CHECK-GI-BASE-NEXT: addv s4, v20.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: add w9, w9, w10
; CHECK-GI-BASE-NEXT: add w10, w11, w12
; CHECK-GI-BASE-NEXT: fmov w11, s3
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: add w9, w10, w11
; CHECK-GI-BASE-NEXT: fmov w10, s1
; CHECK-GI-BASE-NEXT: fmov w11, s0
; CHECK-GI-BASE-NEXT: add w9, w9, w10
; CHECK-GI-BASE-NEXT: fmov w10, s4
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: add w9, w10, w11
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v48i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32]
; CHECK-GI-DOT-NEXT: ldp q3, q4, [x0]
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldp q5, q6, [x1]
; CHECK-GI-DOT-NEXT: ldr q16, [x1, #32]
; CHECK-GI-DOT-NEXT: udot v0.4s, v5.16b, v3.16b
; CHECK-GI-DOT-NEXT: udot v1.4s, v6.16b, v4.16b
; CHECK-GI-DOT-NEXT: udot v2.4s, v16.16b, v7.16b
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: addv s2, v2.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w8, w9
; CHECK-GI-DOT-NEXT: fmov w9, s2
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Load both byte vectors, widen them unsigned, multiply and add-reduce.
%a = load <48 x i8>, ptr %p1
%b = load <48 x i8>, ptr %p2
%0 = zext <48 x i8> %a to <48 x i32>
%1 = zext <48 x i8> %b to <48 x i32>
%2 = mul nuw nsw <48 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over two <8 x i8> register arguments: sext to
; i32, element-wise multiply, llvm.vector.reduce.add.
; Expected lowering, as pinned by the assertions (SelectionDAG and GlobalISel
; produce the same sequence in each configuration):
;  - no dotprod: sshll widening, smull + smlal2, addv.
;  - +dotprod: zeroed accumulator, one 8-byte sdot (.2s), addp.
; NOTE(review): the mul carries nuw although its operands are sign-extended;
; presumably inherited from the unsigned variants of this test - confirm it is
; intended.
define i32 @test_sdot_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_sdot_v8i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: smull v2.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v2.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_sdot_v8i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v2.2s, v1.8b, v0.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_sdot_v8i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: smull v2.4s, v1.4h, v0.4h
; CHECK-GI-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v2.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v8i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.2s, v1.8b, v0.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
; Widen both inputs signed, multiply and add-reduce.
%0 = sext <8 x i8> %a to <8 x i32>
%1 = sext <8 x i8> %b to <8 x i32>
%2 = mul nuw nsw <8 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over two <16 x i8> register arguments: sext to
; i32, element-wise multiply, llvm.vector.reduce.add.
; Expected lowering, as pinned by the assertions:
;  - SelectionDAG, no dotprod: sshll/sshll2 widening, smull/smull2 then
;    smlal/smlal2 into two accumulators, vector add, addv.
;  - GlobalISel, no dotprod: same widening, but each half is formed as
;    smull + smlal2 before the vector add and addv.
;  - +dotprod (both selectors): zeroed accumulator, one 16-byte sdot (.4s), addv.
; NOTE(review): the mul carries nuw although its operands are sign-extended;
; presumably inherited from the unsigned variants of this test - confirm it is
; intended.
define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_sdot_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: smull v4.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: smlal v4.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v4.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_sdot_v16i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v2.4s, v1.16b, v0.16b
; CHECK-SD-DOT-NEXT: addv s0, v2.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_sdot_v16i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: smull v4.4s, v3.4h, v2.4h
; CHECK-GI-BASE-NEXT: smull v5.4s, v1.4h, v0.4h
; CHECK-GI-BASE-NEXT: smlal2 v4.4s, v3.8h, v2.8h
; CHECK-GI-BASE-NEXT: smlal2 v5.4s, v1.8h, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v4.4s, v5.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v16i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.4s, v1.16b, v0.16b
; CHECK-GI-DOT-NEXT: addv s0, v2.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
; Widen both inputs signed, multiply and add-reduce.
%0 = sext <16 x i8> %a to <16 x i32>
%1 = sext <16 x i8> %b to <16 x i32>
%2 = mul nuw nsw <16 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over two <24 x i8> vectors loaded from %p1 and
; %p2: sext to i32, element-wise multiply, llvm.vector.reduce.add. In every
; configuration below the 24 bytes are loaded as a 16-byte head (ldr q) plus an
; 8-byte tail (ldr d).
; Expected lowering, as pinned by the assertions:
;  - SelectionDAG, no dotprod: sshll widening, smull/smlal accumulation into two
;    vectors, one vector add, one addv.
;  - SelectionDAG, +dotprod: a 16-byte sdot and an 8-byte sdot, reduced
;    separately (addv / addp) and summed with a scalar add.
;  - GlobalISel, no dotprod: six smull/smull2 products, each reduced by its own
;    addv, then a scalar add tree.
;  - GlobalISel, +dotprod: two .4s sdots (the d-register tail is used as a 16b
;    operand), one vector add, one addv.
; NOTE(review): the mul carries nuw although its operands are sign-extended;
; presumably inherited from the unsigned variants of this test - confirm it is
; intended.
define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
; CHECK-SD-BASE-LABEL: test_sdot_v24i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ldr q0, [x0]
; CHECK-SD-BASE-NEXT: ldr q1, [x1]
; CHECK-SD-BASE-NEXT: ldr d4, [x0, #16]
; CHECK-SD-BASE-NEXT: ldr d5, [x1, #16]
; CHECK-SD-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: smull v6.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: sshll v3.8h, v4.8b, #0
; CHECK-SD-BASE-NEXT: sshll v4.8h, v5.8b, #0
; CHECK-SD-BASE-NEXT: smlal2 v2.4s, v4.8h, v3.8h
; CHECK-SD-BASE-NEXT: smlal v6.4s, v4.4h, v3.4h
; CHECK-SD-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: smlal v6.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v6.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_sdot_v24i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q2, [x0]
; CHECK-SD-DOT-NEXT: ldr q3, [x1]
; CHECK-SD-DOT-NEXT: ldr d4, [x0, #16]
; CHECK-SD-DOT-NEXT: ldr d5, [x1, #16]
; CHECK-SD-DOT-NEXT: sdot v1.2s, v5.8b, v4.8b
; CHECK-SD-DOT-NEXT: sdot v0.4s, v3.16b, v2.16b
; CHECK-SD-DOT-NEXT: addp v1.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w8, s1
; CHECK-SD-DOT-NEXT: fmov w9, s0
; CHECK-SD-DOT-NEXT: add w0, w9, w8
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_sdot_v24i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ldr q0, [x0]
; CHECK-GI-BASE-NEXT: ldr q1, [x1]
; CHECK-GI-BASE-NEXT: ldr d2, [x0, #16]
; CHECK-GI-BASE-NEXT: ldr d3, [x1, #16]
; CHECK-GI-BASE-NEXT: sshll v4.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v5.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: sshll v3.8h, v3.8b, #0
; CHECK-GI-BASE-NEXT: smull v6.4s, v5.4h, v4.4h
; CHECK-GI-BASE-NEXT: smull2 v4.4s, v5.8h, v4.8h
; CHECK-GI-BASE-NEXT: smull2 v5.4s, v1.8h, v0.8h
; CHECK-GI-BASE-NEXT: smull v7.4s, v3.4h, v2.4h
; CHECK-GI-BASE-NEXT: smull v0.4s, v1.4h, v0.4h
; CHECK-GI-BASE-NEXT: smull2 v1.4s, v3.8h, v2.8h
; CHECK-GI-BASE-NEXT: addv s2, v6.4s
; CHECK-GI-BASE-NEXT: addv s3, v4.4s
; CHECK-GI-BASE-NEXT: addv s4, v5.4s
; CHECK-GI-BASE-NEXT: addv s5, v7.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s2
; CHECK-GI-BASE-NEXT: fmov w9, s3
; CHECK-GI-BASE-NEXT: fmov w10, s4
; CHECK-GI-BASE-NEXT: fmov w11, s5
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: add w10, w10, w11
; CHECK-GI-BASE-NEXT: fmov w11, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: add w9, w10, w11
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q2, [x0]
; CHECK-GI-DOT-NEXT: ldr d3, [x0, #16]
; CHECK-GI-DOT-NEXT: ldr q4, [x1]
; CHECK-GI-DOT-NEXT: ldr d5, [x1, #16]
; CHECK-GI-DOT-NEXT: sdot v1.4s, v4.16b, v2.16b
; CHECK-GI-DOT-NEXT: sdot v0.4s, v5.16b, v3.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
; Load both byte vectors, widen them signed, multiply and add-reduce.
%a = load <24 x i8>, ptr %p1
%b = load <24 x i8>, ptr %p2
%0 = sext <24 x i8> %a to <24 x i32>
%1 = sext <24 x i8> %b to <24 x i32>
%2 = mul nuw nsw <24 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over two <48 x i8> vectors loaded from %p1 and
; %p2 (three 16-byte registers per input): sext to i32, element-wise multiply,
; llvm.vector.reduce.add.
; Expected lowering, as pinned by the assertions:
;  - SelectionDAG, no dotprod: four smull/smull2 accumulators extended with
;    smlal/smlal2, three vector adds, one addv.
;  - SelectionDAG, +dotprod: three sdots chained into a single accumulator (v0),
;    one addv.
;  - GlobalISel, no dotprod: twelve smull/smull2 products, each reduced by its
;    own addv and combined with scalar adds.
;  - GlobalISel, +dotprod: three sdots into three separate accumulators, each
;    reduced with addv and summed with scalar adds.
; NOTE(review): the mul carries nuw although its operands are sign-extended;
; presumably inherited from the unsigned variants of this test - confirm it is
; intended.
define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
; CHECK-SD-BASE-LABEL: test_sdot_v48i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ldp q0, q4, [x1]
; CHECK-SD-BASE-NEXT: ldr q2, [x0, #32]
; CHECK-SD-BASE-NEXT: ldp q1, q3, [x0]
; CHECK-SD-BASE-NEXT: ldr q7, [x1, #32]
; CHECK-SD-BASE-NEXT: sshll2 v16.8h, v2.16b, #0
; CHECK-SD-BASE-NEXT: sshll2 v6.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v17.8h, v7.16b, #0
; CHECK-SD-BASE-NEXT: sshll2 v5.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: smull2 v18.4s, v6.8h, v5.8h
; CHECK-SD-BASE-NEXT: smull v19.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: smull v5.4s, v6.4h, v5.4h
; CHECK-SD-BASE-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: sshll v1.8h, v2.8b, #0
; CHECK-SD-BASE-NEXT: sshll v2.8h, v7.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v6.8h, v3.16b, #0
; CHECK-SD-BASE-NEXT: sshll2 v7.8h, v4.16b, #0
; CHECK-SD-BASE-NEXT: smlal2 v18.4s, v17.8h, v16.8h
; CHECK-SD-BASE-NEXT: smlal v5.4s, v17.4h, v16.4h
; CHECK-SD-BASE-NEXT: smlal v19.4s, v2.4h, v1.4h
; CHECK-SD-BASE-NEXT: smlal2 v0.4s, v2.8h, v1.8h
; CHECK-SD-BASE-NEXT: sshll v1.8h, v3.8b, #0
; CHECK-SD-BASE-NEXT: sshll v2.8h, v4.8b, #0
; CHECK-SD-BASE-NEXT: smlal2 v18.4s, v7.8h, v6.8h
; CHECK-SD-BASE-NEXT: smlal v5.4s, v7.4h, v6.4h
; CHECK-SD-BASE-NEXT: smlal v19.4s, v2.4h, v1.4h
; CHECK-SD-BASE-NEXT: smlal2 v0.4s, v2.8h, v1.8h
; CHECK-SD-BASE-NEXT: add v1.4s, v19.4s, v5.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v18.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_sdot_v48i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q1, [x0, #32]
; CHECK-SD-DOT-NEXT: ldr q2, [x1, #32]
; CHECK-SD-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: ldp q3, q1, [x0]
; CHECK-SD-DOT-NEXT: ldp q4, q2, [x1]
; CHECK-SD-DOT-NEXT: sdot v0.4s, v4.16b, v3.16b
; CHECK-SD-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_sdot_v48i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1]
; CHECK-GI-BASE-NEXT: ldr q6, [x0, #32]
; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0]
; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32]
; CHECK-GI-BASE-NEXT: sshll v20.8h, v6.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v6.16b, #0
; CHECK-GI-BASE-NEXT: sshll v4.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v16.8h, v3.8b, #0
; CHECK-GI-BASE-NEXT: sshll v5.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: sshll v17.8h, v2.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v3.8h, v3.16b, #0
; CHECK-GI-BASE-NEXT: sshll2 v2.8h, v2.16b, #0
; CHECK-GI-BASE-NEXT: smull v18.4s, v4.4h, v5.4h
; CHECK-GI-BASE-NEXT: smull2 v4.4s, v4.8h, v5.8h
; CHECK-GI-BASE-NEXT: smull v5.4s, v0.4h, v1.4h
; CHECK-GI-BASE-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-GI-BASE-NEXT: smull v19.4s, v16.4h, v17.4h
; CHECK-GI-BASE-NEXT: sshll v1.8h, v7.8b, #0
; CHECK-GI-BASE-NEXT: smull2 v16.4s, v16.8h, v17.8h
; CHECK-GI-BASE-NEXT: smull v17.4s, v3.4h, v2.4h
; CHECK-GI-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h
; CHECK-GI-BASE-NEXT: sshll2 v7.8h, v7.16b, #0
; CHECK-GI-BASE-NEXT: addv s18, v18.4s
; CHECK-GI-BASE-NEXT: addv s4, v4.4s
; CHECK-GI-BASE-NEXT: addv s5, v5.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s19, v19.4s
; CHECK-GI-BASE-NEXT: smull v3.4s, v1.4h, v20.4h
; CHECK-GI-BASE-NEXT: addv s2, v2.4s
; CHECK-GI-BASE-NEXT: smull2 v1.4s, v1.8h, v20.8h
; CHECK-GI-BASE-NEXT: smull v20.4s, v7.4h, v6.4h
; CHECK-GI-BASE-NEXT: fmov w8, s18
; CHECK-GI-BASE-NEXT: fmov w9, s4
; CHECK-GI-BASE-NEXT: fmov w10, s5
; CHECK-GI-BASE-NEXT: fmov w11, s0
; CHECK-GI-BASE-NEXT: fmov w12, s19
; CHECK-GI-BASE-NEXT: addv s4, v16.4s
; CHECK-GI-BASE-NEXT: addv s5, v17.4s
; CHECK-GI-BASE-NEXT: addv s3, v3.4s
; CHECK-GI-BASE-NEXT: smull2 v0.4s, v7.8h, v6.8h
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: add w9, w11, w12
; CHECK-GI-BASE-NEXT: add w8, w8, w10
; CHECK-GI-BASE-NEXT: fmov w10, s4
; CHECK-GI-BASE-NEXT: fmov w11, s5
; CHECK-GI-BASE-NEXT: fmov w12, s2
; CHECK-GI-BASE-NEXT: addv s4, v20.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: add w9, w9, w10
; CHECK-GI-BASE-NEXT: add w10, w11, w12
; CHECK-GI-BASE-NEXT: fmov w11, s3
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: add w9, w10, w11
; CHECK-GI-BASE-NEXT: fmov w10, s1
; CHECK-GI-BASE-NEXT: fmov w11, s0
; CHECK-GI-BASE-NEXT: add w9, w9, w10
; CHECK-GI-BASE-NEXT: fmov w10, s4
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: add w9, w10, w11
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32]
; CHECK-GI-DOT-NEXT: ldp q3, q4, [x0]
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldp q5, q6, [x1]
; CHECK-GI-DOT-NEXT: ldr q16, [x1, #32]
; CHECK-GI-DOT-NEXT: sdot v0.4s, v5.16b, v3.16b
; CHECK-GI-DOT-NEXT: sdot v1.4s, v6.16b, v4.16b
; CHECK-GI-DOT-NEXT: sdot v2.4s, v16.16b, v7.16b
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: addv s2, v2.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w8, w9
; CHECK-GI-DOT-NEXT: fmov w9, s2
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Load both byte vectors, widen them signed, multiply and add-reduce.
%a = load <48 x i8>, ptr %p1
%b = load <48 x i8>, ptr %p2
%0 = sext <48 x i8> %a to <48 x i32>
%1 = sext <48 x i8> %b to <48 x i32>
%2 = mul nuw nsw <48 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2)
ret i32 %3
}
; The product %2 has two uses here: the add-reduction and an extractelement of
; lane 0 whose value is added to the reduced sum. GlobalISel must not combine a
; G_MUL that has more than one use into UDOT, so its expected output is the same
; umull/umlal2 + addv sequence with and without +dotprod (one shared GlobalISel
; prefix below). SelectionDAG with +dotprod still emits udot for the reduction
; and keeps a separate umull to supply the extracted lane.
define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_udot_v8i8_multi_use:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: umull v2.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: mov v3.16b, v2.16b
; CHECK-SD-BASE-NEXT: fmov w8, s2
; CHECK-SD-BASE-NEXT: umlal2 v3.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v3.4s
; CHECK-SD-BASE-NEXT: fmov w9, s0
; CHECK-SD-BASE-NEXT: add w0, w9, w8
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ushll v3.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll v4.8h, v1.8b, #0
; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v0.8b
; CHECK-SD-DOT-NEXT: umull v0.4s, v4.4h, v3.4h
; CHECK-SD-DOT-NEXT: addp v1.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w9, s0
; CHECK-SD-DOT-NEXT: fmov w8, s1
; CHECK-SD-DOT-NEXT: add w0, w8, w9
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-LABEL: test_udot_v8i8_multi_use:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-NEXT: umull v2.4s, v1.4h, v0.4h
; CHECK-GI-NEXT: mov v3.16b, v2.16b
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: umlal2 v3.4s, v1.8h, v0.8h
; CHECK-GI-NEXT: addv s0, v3.4s
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: add w0, w9, w8
; CHECK-GI-NEXT: ret
entry:
%0 = zext <8 x i8> %a to <8 x i32>
%1 = zext <8 x i8> %b to <8 x i32>
%2 = mul nuw nsw <8 x i32> %1, %0
; First use of the product: the add-reduction.
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
; Second use of the product: lane 0 is read directly.
%4 = extractelement <8 x i32> %2, i32 0
%5 = add nuw nsw i32 %3, %4
ret i32 %5
}
; Sum of two independent <8 x i16> add-reductions, returned as a zeroext i16.
; Expected lowering, as pinned by the assertions:
;  - SelectionDAG: the two reductions are merged into one vector add followed by
;    a single addv.
;  - GlobalISel: each input gets its own addv, the scalars are added, and the
;    result is masked to 16 bits (and #0xffff).
define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v8i16_v8i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: add v0.8h, v0.8h, v1.8h
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v8i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv h0, v0.8h
; CHECK-GI-NEXT: addv h1, v1.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w9, w8, uxth
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
%z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two add-reductions, each over an <8 x i16> input zero-extended to
; <8 x i64>.
; Expected lowering, as pinned by the assertions:
;  - SelectionDAG: ushll/ushll2 to 32-bit lanes, uaddl/uaddl2 to 64-bit lanes,
;    vector adds combining both inputs, one final addp.
;  - GlobalISel: one uaddlv per input, then a scalar add with a uxtw extend.
define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v8i16_v8i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v8i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s1, v1.8h
; CHECK-GI-NEXT: uaddlv s0, v0.8h
; CHECK-GI-NEXT: mov w8, v1.s[0]
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: add x0, x8, w9, uxtw
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = zext <8 x i16> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions, each over an <8 x i16> input sign-extended to
; <8 x i64>.
; Expected lowering, as pinned by the assertions:
;  - SelectionDAG: sshll/sshll2 to 32-bit lanes, saddl/saddl2 to 64-bit lanes,
;    vector adds combining both inputs, one final addp.
;  - GlobalISel: one saddlv per input, smov to sign-extend the first scalar,
;    then a scalar add with an sxtw extend.
define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v8i16_v8i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll2 v2.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-NEXT: saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v8i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s1, v1.8h
; CHECK-GI-NEXT: saddlv s0, v0.8h
; CHECK-GI-NEXT: smov x8, v1.s[0]
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: add x0, x8, w9, sxtw
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = sext <8 x i16> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions, each over a <4 x i16> input zero-extended to
; <4 x i64>.
; Expected lowering, as pinned by the assertions:
;  - SelectionDAG: ushll to 32-bit lanes, uaddlp for the first input and uadalp
;    to accumulate the second into the same register, one final addp.
;  - GlobalISel: one uaddlv per input, then a scalar add with a uxtw extend.
define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s1, v1.4h
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: mov w8, v1.s[0]
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: add x0, x8, w9, uxtw
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = zext <4 x i16> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions, each over a <4 x i16> input sign-extended to
; <4 x i64>.
; Expected lowering, as pinned by the assertions:
;  - SelectionDAG: sshll to 32-bit lanes, saddlp for the first input and sadalp
;    to accumulate the second into the same register, one final addp.
;  - GlobalISel: one saddlv per input, smov to sign-extend the first scalar,
;    then a scalar add with an sxtw extend.
define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddlp v1.2d, v1.4s
; CHECK-SD-NEXT: sadalp v1.2d, v0.4s
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s1, v1.4h
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: smov x8, v1.s[0]
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: add x0, x8, w9, sxtw
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = sext <4 x i16> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions, each over a <2 x i16> input zero-extended to
; <2 x i64>. The generated code reads each <2 x i16> argument from the two
; 32-bit lanes of its d register (.2s).
; Expected lowering, as pinned by the assertions:
;  - SelectionDAG: both inputs are packed into one q register
;    (mov v0.d[1], v1.d[0]), every 32-bit lane is masked to its low 16 bits,
;    and a single uaddlv produces the 64-bit sum.
;  - GlobalISel: each input is widened with ushll, masked with 0xffff per
;    64-bit lane and reduced with addp; the two scalars are then added.
define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: movi v2.2d, #0x00ffff0000ffff
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v2.2d, #0x0000000000ffff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = zext <2 x i16> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions, each over a <2 x i16> input sign-extended to
; <2 x i64>. The generated code reads each <2 x i16> argument from the two
; 32-bit lanes of its d register (.2s) and sign-extends in place with a
; shl #48 / arithmetic-shift-right #48 pair.
; Expected lowering, as pinned by the assertions:
;  - SelectionDAG: the second input's shift-right is folded into an
;    accumulating ssra, so one addp reduces both inputs.
;  - GlobalISel: each input gets its own sshr and addp, then a scalar add.
define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v2i16_v2i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-NEXT: shl v0.2d, v0.2d, #48
; CHECK-SD-NEXT: shl v1.2d, v1.2d, #48
; CHECK-SD-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-SD-NEXT: ssra v0.2d, v1.2d, #48
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i16_v2i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: shl v0.2d, v0.2d, #48
; CHECK-GI-NEXT: shl v1.2d, v1.2d, #48
; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #48
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = sext <2 x i16> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions, each over a <16 x i8> input zero-extended to
; <16 x i32>. There is no multiply in the IR; with +dotprod the expected code
; still uses udot, against a vector of byte ones (movi v2.16b, #1).
; Expected lowering, as pinned by the assertions:
;  - SelectionDAG, no dotprod: ushll/ushll2 to 16-bit lanes, uaddl/uaddl2 to
;    32-bit lanes, vector adds combining both inputs, one addv.
;  - SelectionDAG, +dotprod: both inputs are udot-accumulated into a single
;    register, followed by one addv.
;  - GlobalISel, no dotprod: one uaddlv (16-bit result) per input, then a scalar
;    add with 16-bit zero-extension of both operands.
;  - GlobalISel, +dotprod: one udot and one addv per input with separate
;    accumulators, then a scalar add.
define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v0.8h, v2.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v2.4h
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v1.8h, v3.8h
; CHECK-SD-BASE-NEXT: uaddl v1.4s, v1.4h, v3.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT: udot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h1, v1.16b
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s1
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: and w8, w8, #0xffff
; CHECK-GI-BASE-NEXT: add w0, w8, w9, uxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.16b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addv s0, v4.4s
; CHECK-GI-DOT-NEXT: addv s1, v3.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%yy = zext <16 x i8> %y to <16 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Signed counterpart of add_pair_v16i8_v16i32_zext: both <16 x i8> inputs are
; sign-extended to <16 x i32>, reduced with vector.reduce.add, and the two
; scalar sums are added. The expected code mirrors the unsigned variant with
; signed instructions (sshll/saddl, sdot, saddlv, sxth).
define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v0.8h, v2.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v2.4h
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v1.8h, v3.8h
; CHECK-SD-BASE-NEXT: saddl v1.4s, v1.4h, v3.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT: sdot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h1, v1.16b
; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s1
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: sxth w8, w8
; CHECK-GI-BASE-NEXT: add w0, w8, w9, sxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.16b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addv s0, v4.4s
; CHECK-GI-DOT-NEXT: addv s1, v3.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Sign-extend and reduce each operand independently, then add the scalars.
%xx = sext <16 x i8> %x to <16 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%yy = sext <16 x i8> %y to <16 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two add-reductions over 64-bit <8 x i8> inputs zero-extended to
; <8 x i32>. Without +dotprod the default run is expected to use a pairwise
; widening add (uaddlp) followed by an accumulating one (uadalp); with
; +dotprod it uses the 64-bit udot form (.2s/.8b) and a final addp.
define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v3.8b, #1
; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v3.8b
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: fmov w8, s1
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: and w8, w8, #0xffff
; CHECK-GI-BASE-NEXT: add w0, w8, w9, uxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v4.2s, v0.8b, v2.8b
; CHECK-GI-DOT-NEXT: udot v3.2s, v1.8b, v2.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v4.2s, v4.2s
; CHECK-GI-DOT-NEXT: addp v1.2s, v3.2s, v3.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Zero-extend and reduce each operand independently, then add the scalars.
%xx = zext <8 x i8> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = zext <8 x i8> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Signed counterpart of add_pair_v8i8_v8i32_zext: both <8 x i8> inputs are
; sign-extended to <8 x i32> before the two add-reductions are summed. The
; expected code uses the signed instruction forms (sshll, saddlp/sadalp,
; sdot, saddlv, sxth).
define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: sadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v3.8b, #1
; CHECK-SD-DOT-NEXT: sdot v2.2s, v1.8b, v3.8b
; CHECK-SD-DOT-NEXT: sdot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b
; CHECK-GI-BASE-NEXT: saddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: fmov w8, s1
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: sxth w8, w8
; CHECK-GI-BASE-NEXT: add w0, w8, w9, sxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v4.2s, v0.8b, v2.8b
; CHECK-GI-DOT-NEXT: sdot v3.2s, v1.8b, v2.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v4.2s, v4.2s
; CHECK-GI-DOT-NEXT: addp v1.2s, v3.2s, v3.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Sign-extend and reduce each operand independently, then add the scalars.
%xx = sext <8 x i8> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = sext <8 x i8> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two add-reductions over <4 x i8> inputs zero-extended to <4 x i32>.
; In the expected code the arguments are handled as 16-bit lanes and masked
; down to their low byte (bic/and). The default run then packs both inputs
; into one 128-bit register and issues a single uaddlv; the -global-isel run
; reduces each input separately and adds the scalars.
define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: uaddlv s0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: uaddlv s1, v1.4h
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: and w8, w8, #0xffff
; CHECK-GI-NEXT: add w0, w8, w9, uxth
; CHECK-GI-NEXT: ret
entry:
; Zero-extend and reduce each operand independently, then add the scalars.
%xx = zext <4 x i8> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = zext <4 x i8> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two add-reductions over <4 x i8> inputs sign-extended to <4 x i32>.
; The expected code performs the sign extension in-register with a shift-left
; / arithmetic-shift-right pair (by 24 on 32-bit lanes in the default run, by
; 8 on 16-bit lanes under -global-isel). In the default run the second shift
; is folded into an accumulating ssra.
define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: shl v0.4s, v0.4s, #24
; CHECK-SD-NEXT: shl v1.4s, v1.4s, #24
; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-SD-NEXT: ssra v0.4s, v1.4s, #24
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: saddlv s1, v1.4h
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: sxth w8, w8
; CHECK-GI-NEXT: add w0, w8, w9, sxth
; CHECK-GI-NEXT: ret
entry:
; Sign-extend and reduce each operand independently, then add the scalars.
%xx = sext <4 x i8> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = sext <4 x i8> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two add-reductions over <16 x i8> inputs zero-extended to
; <16 x i16>; the i16 result is returned with the zeroext attribute. The
; default run is expected to combine both inputs with uaddlp + uadalp before
; one addv, whereas the -global-isel run issues two uaddlv reductions, adds
; the scalars and masks the result to 16 bits.
define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddlp v1.8h, v1.16b
; CHECK-SD-NEXT: uadalp v1.8h, v0.16b
; CHECK-SD-NEXT: addv h0, v1.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: uaddlv h1, v1.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
; Zero-extend and reduce each operand independently, then add the scalars.
%xx = zext <16 x i8> %x to <16 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%yy = zext <16 x i8> %y to <16 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Signed counterpart of add_pair_v16i8_v16i16_zext: both <16 x i8> inputs are
; sign-extended to <16 x i16> and the i16 sum is returned with the signext
; attribute. The expected code ends with a sign-extending move (smov in the
; default run, sxth under -global-isel).
define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddlp v1.8h, v1.16b
; CHECK-SD-NEXT: sadalp v1.8h, v0.16b
; CHECK-SD-NEXT: addv h0, v1.8h
; CHECK-SD-NEXT: smov w0, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: saddlv h1, v1.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: sxth w0, w8
; CHECK-GI-NEXT: ret
entry:
; Sign-extend and reduce each operand independently, then add the scalars.
%xx = sext <16 x i8> %x to <16 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%yy = sext <16 x i8> %y to <16 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two add-reductions over 64-bit <8 x i8> inputs zero-extended to
; <8 x i16>, returned as a zeroext i16. The default run is expected to
; concatenate the two 64-bit inputs into one 128-bit register
; (mov v0.d[1], v1.d[0]) and reduce all 16 bytes with a single uaddlv; the
; -global-isel run reduces each input on its own.
define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: uaddlv h0, v0.16b
; CHECK-SD-NEXT: umov w0, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.8b
; CHECK-GI-NEXT: uaddlv h1, v1.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
; Zero-extend and reduce each operand independently, then add the scalars.
%xx = zext <8 x i8> %x to <8 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%yy = zext <8 x i8> %y to <8 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two add-reductions over <8 x i8> inputs sign-extended to <8 x i16>,
; returned as a signext i16. The default run is expected to fold both
; extensions and the pair-add into one widening saddl followed by addv; the
; -global-isel run uses two saddlv reductions and a scalar add.
define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: smov w0, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.8b
; CHECK-GI-NEXT: saddlv h1, v1.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: sxth w0, w8
; CHECK-GI-NEXT: ret
entry:
; Sign-extend and reduce each operand independently, then add the scalars.
%xx = sext <8 x i8> %x to <8 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%yy = sext <8 x i8> %y to <8 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two i8 add-reductions over <16 x i8> inputs with no widening; the
; result is returned as a zeroext i8. The default run is expected to add the
; two vectors first and reduce once (add + addv); the -global-isel run
; reduces each vector, adds the scalars and masks the result to 8 bits.
define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: add v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: addv b0, v0.16b
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv b0, v0.16b
; CHECK-GI-NEXT: addv b1, v1.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w9, w8, uxtb
; CHECK-GI-NEXT: and w0, w8, #0xff
; CHECK-GI-NEXT: ret
entry:
; Reduce each operand at its native i8 width, then add the two scalars.
%z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
%z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y)
%z = add i8 %z1, %z2
ret i8 %z
}
; Sum of two add-reductions over <16 x i8> inputs zero-extended all the way
; to <16 x i64>. The default run is expected to widen step by step
; (ushll to 16 and 32 bits, then uaddl to 64 bits) and finish with a tree of
; vector adds and addp. The -global-isel run instead reduces each input with
; uaddlv on the bytes and zero-extends the 16-bit scalar results.
define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll2 v2.8h, v0.16b, #0
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll2 v3.8h, v1.16b, #0
; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-SD-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-SD-NEXT: ushll2 v5.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll2 v6.4s, v3.8h, #0
; CHECK-SD-NEXT: ushll2 v7.4s, v1.8h, #0
; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: uaddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-NEXT: uaddl v2.2d, v5.2s, v2.2s
; CHECK-SD-NEXT: uaddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v4.2s
; CHECK-SD-NEXT: uaddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-NEXT: uaddl v6.2d, v7.2s, v6.2s
; CHECK-SD-NEXT: uaddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT: add v3.2d, v5.2d, v16.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: add v2.2d, v7.2d, v4.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h1, v1.16b
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: and x8, x8, #0xffff
; CHECK-GI-NEXT: add x0, x8, w9, uxth
; CHECK-GI-NEXT: ret
entry:
; Zero-extend and reduce each operand independently, then add the scalars.
%xx = zext <16 x i8> %x to <16 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%yy = zext <16 x i8> %y to <16 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Signed counterpart of add_pair_v16i8_v16i64_zext: both <16 x i8> inputs are
; sign-extended to <16 x i64> before the two add-reductions are summed. The
; expected code has the same shape as the unsigned variant, using sshll,
; saddl, saddlv and sxth.
define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll2 v2.8h, v0.16b, #0
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: sshll2 v3.8h, v1.16b, #0
; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-SD-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-SD-NEXT: sshll2 v5.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: sshll2 v6.4s, v3.8h, #0
; CHECK-SD-NEXT: sshll2 v7.4s, v1.8h, #0
; CHECK-SD-NEXT: sshll v3.4s, v3.4h, #0
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: saddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-NEXT: saddl v2.2d, v5.2s, v2.2s
; CHECK-SD-NEXT: saddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v4.2s
; CHECK-SD-NEXT: saddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-NEXT: saddl v6.2d, v7.2s, v6.2s
; CHECK-SD-NEXT: saddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT: add v3.2d, v5.2d, v16.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: add v2.2d, v7.2d, v4.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h1, v1.16b
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: sxth x8, w8
; CHECK-GI-NEXT: add x0, x8, w9, sxth
; CHECK-GI-NEXT: ret
entry:
; Sign-extend and reduce each operand independently, then add the scalars.
%xx = sext <16 x i8> %x to <16 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%yy = sext <16 x i8> %y to <16 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions over <8 x i8> inputs zero-extended to <8 x i64>.
; The default run is expected to widen with ushll/uaddl and combine the
; partial sums with vector adds before a final addp; the -global-isel run
; reduces the bytes with uaddlv and extends the 16-bit scalars.
define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h1, v1.8b
; CHECK-GI-NEXT: uaddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: and x8, x8, #0xffff
; CHECK-GI-NEXT: add x0, x8, w9, uxth
; CHECK-GI-NEXT: ret
entry:
; Zero-extend and reduce each operand independently, then add the scalars.
%xx = zext <8 x i8> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = zext <8 x i8> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Signed counterpart of add_pair_v8i8_v8i64_zext: both <8 x i8> inputs are
; sign-extended to <8 x i64> before the two add-reductions are summed. The
; expected code uses sshll/saddl in the default run and saddlv + sxth under
; -global-isel.
define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: sshll2 v2.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-NEXT: saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h1, v1.8b
; CHECK-GI-NEXT: saddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: sxth x8, w8
; CHECK-GI-NEXT: add x0, x8, w9, sxth
; CHECK-GI-NEXT: ret
entry:
; Sign-extend and reduce each operand independently, then add the scalars.
%xx = sext <8 x i8> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = sext <8 x i8> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions over <4 x i8> inputs zero-extended to <4 x i64>.
; In the expected code the inputs are treated as 16-bit lanes and masked to
; their low byte (bic/and). The default run then widens to 32 bits and uses
; uaddlp + uadalp to reach 64-bit lanes; the -global-isel run uses uaddlv on
; the 16-bit lanes and extends the scalar results.
define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: uaddlv s1, v1.4h
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: and x8, x8, #0xffff
; CHECK-GI-NEXT: add x0, x8, w9, uxth
; CHECK-GI-NEXT: ret
entry:
; Zero-extend and reduce each operand independently, then add the scalars.
%xx = zext <4 x i8> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = zext <4 x i8> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions over <4 x i8> inputs sign-extended to <4 x i64>.
; The default run is expected to widen the lanes to 64 bits and sign-extend
; them in-register with shl/sshr by 56, folding half of the right shifts
; into accumulating ssra instructions. The -global-isel run sign-extends on
; 16-bit lanes (shift by 8) and reduces each input with saddlv.
define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-SD-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-NEXT: ushll2 v1.2d, v1.4s, #0
; CHECK-SD-NEXT: shl v3.2d, v3.2d, #56
; CHECK-SD-NEXT: shl v2.2d, v2.2d, #56
; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT: sshr v3.2d, v3.2d, #56
; CHECK-SD-NEXT: sshr v2.2d, v2.2d, #56
; CHECK-SD-NEXT: ssra v3.2d, v0.2d, #56
; CHECK-SD-NEXT: ssra v2.2d, v1.2d, #56
; CHECK-SD-NEXT: add v0.2d, v3.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: saddlv s1, v1.4h
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: sxth x8, w8
; CHECK-GI-NEXT: add x0, x8, w9, sxth
; CHECK-GI-NEXT: ret
entry:
; Sign-extend and reduce each operand independently, then add the scalars.
%xx = sext <4 x i8> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = sext <4 x i8> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions over <2 x i8> inputs zero-extended to <2 x i64>.
; The default run is expected to pack both inputs into one 128-bit register,
; mask every 32-bit lane to its low byte and reduce with a single uaddlv.
; The -global-isel run widens each input to 64-bit lanes, masks, reduces
; each with addp and adds the two scalars.
define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: movi v2.2d, #0x0000ff000000ff
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v2.2d, #0x000000000000ff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
; Zero-extend and reduce each operand independently, then add the scalars.
%xx = zext <2 x i8> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = zext <2 x i8> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions over <2 x i8> inputs sign-extended to <2 x i64>.
; Both runs are expected to widen to 64-bit lanes and sign-extend with
; shl/sshr by 56. The default run folds the second right shift and the
; vector add into one ssra before a single addp, while the -global-isel run
; reduces each input with its own addp and adds the scalars.
define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-SD-NEXT: ssra v0.2d, v1.2d, #56
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
; Sign-extend and reduce each operand independently, then add the scalars.
%xx = sext <2 x i8> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = sext <2 x i8> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Two reduction pairs in one function: %ax and %ay are zero-extended, %bx and
; %by are sign-extended, all from <8 x i8> to <8 x i32>. Each input is
; reduced with vector.reduce.add, the sums are added pairwise (unsigned pair,
; signed pair), and the two pair sums are added together. In the default run
; the expected code keeps one accumulator per signedness (uaddlp/uadalp and
; saddlp/sadalp, or udot and sdot with +dotprod) and joins them with a single
; vector add before the final reduction.
define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v3.8h, v3.8b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: saddlp v3.4s, v3.8h
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: sadalp v3.4s, v2.8h
; CHECK-SD-BASE-NEXT: add v0.4s, v3.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v5.8b, #1
; CHECK-SD-DOT-NEXT: movi v6.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v6.2s, v1.8b, v5.8b
; CHECK-SD-DOT-NEXT: sdot v4.2s, v3.8b, v5.8b
; CHECK-SD-DOT-NEXT: udot v6.2s, v0.8b, v5.8b
; CHECK-SD-DOT-NEXT: sdot v4.2s, v2.8b, v5.8b
; CHECK-SD-DOT-NEXT: add v0.2s, v6.2s, v4.2s
; CHECK-SD-DOT-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h3, v3.8b
; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: saddlv h2, v2.8b
; CHECK-GI-BASE-NEXT: fmov w8, s3
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: fmov w10, s0
; CHECK-GI-BASE-NEXT: fmov w11, s2
; CHECK-GI-BASE-NEXT: sxth w8, w8
; CHECK-GI-BASE-NEXT: and w9, w9, #0xffff
; CHECK-GI-BASE-NEXT: add w9, w9, w10, uxth
; CHECK-GI-BASE-NEXT: add w8, w8, w11, sxth
; CHECK-GI-BASE-NEXT: add w0, w9, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v4.8b, #1
; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v6.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v7.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v16.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v5.2s, v0.8b, v4.8b
; CHECK-GI-DOT-NEXT: sdot v6.2s, v3.8b, v4.8b
; CHECK-GI-DOT-NEXT: udot v7.2s, v1.8b, v4.8b
; CHECK-GI-DOT-NEXT: sdot v16.2s, v2.8b, v4.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v5.2s, v5.2s
; CHECK-GI-DOT-NEXT: addp v3.2s, v6.2s, v6.2s
; CHECK-GI-DOT-NEXT: addp v1.2s, v7.2s, v7.2s
; CHECK-GI-DOT-NEXT: addp v2.2s, v16.2s, v16.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w11, s3
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: fmov w10, s2
; CHECK-GI-DOT-NEXT: add w8, w8, w9
; CHECK-GI-DOT-NEXT: add w9, w10, w11
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Unsigned pair: zero-extend %ax and %ay, reduce each, add the scalars.
%axx = zext <8 x i8> %ax to <8 x i32>
%az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx)
%ayy = zext <8 x i8> %ay to <8 x i32>
%az2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %ayy)
%az = add i32 %az1, %az2
; Signed pair: sign-extend %bx and %by, reduce each, add the scalars.
%bxx = sext <8 x i8> %bx to <8 x i32>
%bz1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %bxx)
%byy = sext <8 x i8> %by to <8 x i32>
%bz2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %byy)
%bz = add i32 %bz1, %bz2
; Final result is the sum of the unsigned and signed pair sums.
%z = add i32 %az, %bz
ret i32 %z
}
; Four <8 x i16> inputs are each widened to <8 x i32>, split with
; shufflevector into lanes 0-3 and lanes 4-7, and the two halves are added.
; The four <4 x i32> partial sums are combined with vector adds and reduced
; by a single vector.reduce.add at the end.
; Note that despite "sext_zext" in the function name, all four extensions in
; this body are zext, which matches the unsigned uaddlp/uadalp/ushll forms in
; the expected output.
define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
; CHECK-SD-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-NEXT: uaddlp v3.4s, v3.8h
; CHECK-SD-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-NEXT: uadalp v3.4s, v2.8h
; CHECK-SD-NEXT: add v0.4s, v3.4s, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0
; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT: ushll v5.4s, v1.4h, #0
; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT: ushll v6.4s, v2.4h, #0
; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-GI-NEXT: ushll v7.4s, v3.4h, #0
; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-GI-NEXT: add v0.4s, v4.4s, v0.4s
; CHECK-GI-NEXT: add v1.4s, v5.4s, v1.4s
; CHECK-GI-NEXT: add v2.4s, v6.4s, v2.4s
; CHECK-GI-NEXT: add v3.4s, v7.4s, v3.4s
; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
; %ax: widen, split into lanes 0-3 and 4-7, add the halves.
%axx = zext <8 x i16> %ax to <8 x i32>
%s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s1l = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%axs = add <4 x i32> %s1h, %s1l
; %ay: same treatment, then combine with the %ax partial sum.
%ayy = zext <8 x i16> %ay to <8 x i32>
%s2h = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s2l = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%ays = add <4 x i32> %s2h, %s2l
%az = add <4 x i32> %axs, %ays
; %bx: widen, split and add the halves.
%bxx = zext <8 x i16> %bx to <8 x i32>
%s3h = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s3l = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%bxs = add <4 x i32> %s3h, %s3l
; %by: same treatment, then combine with the %bx partial sum.
%byy = zext <8 x i16> %by to <8 x i32>
%s4h = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s4l = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%bys = add <4 x i32> %s4h, %s4l
%bz = add <4 x i32> %bxs, %bys
; Combine both pair sums as vectors and perform the only scalar reduction.
%z = add <4 x i32> %az, %bz
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z)
ret i32 %z2
}
; Sum of two i64 add-reductions over <2 x i64> inputs with no extension
; involved. The default run is expected to add the vectors first and reduce
; once (add + addp); the -global-isel run reduces each vector with addp and
; adds the two scalars in general-purpose registers.
define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-SD-LABEL: add_pair_v2i64_v2i64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i64_v2i64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
; Reduce each operand at its native i64 width, then add the two scalars.
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
%z = add i64 %z1, %z2
ret i64 %z
}
; Irregularly sized vectors
; Zero-extends <24 x i8> to <24 x i16> and add-reduces it to one i16.
; In the expected code the 24 lanes arrive as scalars: the first eight in
; w0-w7 and the remaining sixteen in stack slots [sp] .. [sp, #120], spaced
; 8 bytes apart.
; Expected code for the llc runs without -global-isel: three 8-byte vectors are
; assembled lane by lane, widened and summed with uaddl + uaddw, then reduced
; with one addv.
; Expected code for the -global-isel runs: a 16-byte and an 8-byte vector are
; assembled (via uzp1/xtn narrowing), each is reduced with uaddlv, and the two
; scalar results are added.
define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) {
; CHECK-SD-LABEL: add_v24i8_v24i16_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: ldr b1, [sp, #64]
; CHECK-SD-NEXT: add x8, sp, #72
; CHECK-SD-NEXT: ldr b2, [sp]
; CHECK-SD-NEXT: add x9, sp, #80
; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8]
; CHECK-SD-NEXT: add x8, sp, #8
; CHECK-SD-NEXT: mov v0.b[1], w1
; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8]
; CHECK-SD-NEXT: add x8, sp, #16
; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-SD-NEXT: add x9, sp, #88
; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8]
; CHECK-SD-NEXT: add x8, sp, #24
; CHECK-SD-NEXT: mov v0.b[2], w2
; CHECK-SD-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-SD-NEXT: add x9, sp, #96
; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8]
; CHECK-SD-NEXT: add x8, sp, #32
; CHECK-SD-NEXT: mov v0.b[3], w3
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: add x9, sp, #104
; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8]
; CHECK-SD-NEXT: add x8, sp, #40
; CHECK-SD-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-SD-NEXT: add x9, sp, #112
; CHECK-SD-NEXT: mov v0.b[4], w4
; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8]
; CHECK-SD-NEXT: add x8, sp, #48
; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-SD-NEXT: add x9, sp, #120
; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8]
; CHECK-SD-NEXT: add x8, sp, #56
; CHECK-SD-NEXT: mov v0.b[5], w5
; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8]
; CHECK-SD-NEXT: mov v0.b[6], w6
; CHECK-SD-NEXT: mov v0.b[7], w7
; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v2.8b
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v24i8_v24i16_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov s4, w0
; CHECK-GI-NEXT: fmov s5, w4
; CHECK-GI-NEXT: ldr s0, [sp]
; CHECK-GI-NEXT: ldr s6, [sp, #8]
; CHECK-GI-NEXT: ldr s1, [sp, #32]
; CHECK-GI-NEXT: ldr s7, [sp, #40]
; CHECK-GI-NEXT: ldr s2, [sp, #64]
; CHECK-GI-NEXT: ldr s16, [sp, #72]
; CHECK-GI-NEXT: ldr s3, [sp, #96]
; CHECK-GI-NEXT: ldr s17, [sp, #104]
; CHECK-GI-NEXT: mov v4.s[1], w1
; CHECK-GI-NEXT: mov v5.s[1], w5
; CHECK-GI-NEXT: mov v0.s[1], v6.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v7.s[0]
; CHECK-GI-NEXT: mov v2.s[1], v16.s[0]
; CHECK-GI-NEXT: mov v3.s[1], v17.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #16]
; CHECK-GI-NEXT: ldr s7, [sp, #48]
; CHECK-GI-NEXT: ldr s16, [sp, #80]
; CHECK-GI-NEXT: ldr s17, [sp, #112]
; CHECK-GI-NEXT: mov v4.s[2], w2
; CHECK-GI-NEXT: mov v5.s[2], w6
; CHECK-GI-NEXT: mov v0.s[2], v6.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v7.s[0]
; CHECK-GI-NEXT: mov v2.s[2], v16.s[0]
; CHECK-GI-NEXT: mov v3.s[2], v17.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #24]
; CHECK-GI-NEXT: ldr s7, [sp, #56]
; CHECK-GI-NEXT: ldr s16, [sp, #88]
; CHECK-GI-NEXT: ldr s17, [sp, #120]
; CHECK-GI-NEXT: mov v4.s[3], w3
; CHECK-GI-NEXT: mov v5.s[3], w7
; CHECK-GI-NEXT: mov v0.s[3], v6.s[0]
; CHECK-GI-NEXT: mov v1.s[3], v7.s[0]
; CHECK-GI-NEXT: mov v2.s[3], v16.s[0]
; CHECK-GI-NEXT: mov v3.s[3], v17.s[0]
; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v5.8h
; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v3.8h
; CHECK-GI-NEXT: uzp1 v0.16b, v4.16b, v0.16b
; CHECK-GI-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: uaddlv h1, v1.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
; Widen every lane to 16 bits (unsigned) before summing.
%xx = zext <24 x i8> %x to <24 x i16>
%z = call i16 @llvm.vector.reduce.add.v24i16(<24 x i16> %xx)
ret i16 %z
}
; Zero-extends <32 x i8> to <32 x i16> and add-reduces it to one i16.
; In the expected code the input occupies two 16-byte registers, v0 and v1.
; Expected code for the llc runs without -global-isel: the low and high halves
; of v0 and v1 are widened and summed with uaddl/uaddl2, the partial sums are
; added, and one addv produces the result.
; Expected code for the -global-isel runs: each register is reduced with
; uaddlv and the two scalar results are added.
define i16 @add_v32i8_v32i16_zext(<32 x i8> %x) {
; CHECK-SD-LABEL: add_v32i8_v32i16_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddl2 v2.8h, v0.16b, v1.16b
; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v32i8_v32i16_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: uaddlv h1, v1.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
; Widen every lane to 16 bits (unsigned) before summing.
%xx = zext <32 x i8> %x to <32 x i16>
%z = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %xx)
ret i16 %z
}
; Sign-extends <24 x i8> to <24 x i16> and add-reduces it to one i16.
; In the expected code the 24 lanes arrive as scalars: the first eight in
; w0-w7 and the remaining sixteen in stack slots [sp] .. [sp, #120], spaced
; 8 bytes apart.
; Expected code for the llc runs without -global-isel: three 8-byte vectors are
; assembled lane by lane, widened and summed with saddl + saddw, then reduced
; with one addv.
; Expected code for the -global-isel runs: a 16-byte and an 8-byte vector are
; assembled (via uzp1/xtn narrowing), each is reduced with saddlv, and the two
; scalar results are added.
define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) {
; CHECK-SD-LABEL: add_v24i8_v24i16_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: ldr b1, [sp, #64]
; CHECK-SD-NEXT: add x8, sp, #72
; CHECK-SD-NEXT: ldr b2, [sp]
; CHECK-SD-NEXT: add x9, sp, #80
; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8]
; CHECK-SD-NEXT: add x8, sp, #8
; CHECK-SD-NEXT: mov v0.b[1], w1
; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8]
; CHECK-SD-NEXT: add x8, sp, #16
; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-SD-NEXT: add x9, sp, #88
; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8]
; CHECK-SD-NEXT: add x8, sp, #24
; CHECK-SD-NEXT: mov v0.b[2], w2
; CHECK-SD-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-SD-NEXT: add x9, sp, #96
; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8]
; CHECK-SD-NEXT: add x8, sp, #32
; CHECK-SD-NEXT: mov v0.b[3], w3
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: add x9, sp, #104
; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8]
; CHECK-SD-NEXT: add x8, sp, #40
; CHECK-SD-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-SD-NEXT: add x9, sp, #112
; CHECK-SD-NEXT: mov v0.b[4], w4
; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8]
; CHECK-SD-NEXT: add x8, sp, #48
; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-SD-NEXT: add x9, sp, #120
; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8]
; CHECK-SD-NEXT: add x8, sp, #56
; CHECK-SD-NEXT: mov v0.b[5], w5
; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8]
; CHECK-SD-NEXT: mov v0.b[6], w6
; CHECK-SD-NEXT: mov v0.b[7], w7
; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: saddw v0.8h, v0.8h, v2.8b
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v24i8_v24i16_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov s4, w0
; CHECK-GI-NEXT: fmov s5, w4
; CHECK-GI-NEXT: ldr s0, [sp]
; CHECK-GI-NEXT: ldr s6, [sp, #8]
; CHECK-GI-NEXT: ldr s1, [sp, #32]
; CHECK-GI-NEXT: ldr s7, [sp, #40]
; CHECK-GI-NEXT: ldr s2, [sp, #64]
; CHECK-GI-NEXT: ldr s16, [sp, #72]
; CHECK-GI-NEXT: ldr s3, [sp, #96]
; CHECK-GI-NEXT: ldr s17, [sp, #104]
; CHECK-GI-NEXT: mov v4.s[1], w1
; CHECK-GI-NEXT: mov v5.s[1], w5
; CHECK-GI-NEXT: mov v0.s[1], v6.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v7.s[0]
; CHECK-GI-NEXT: mov v2.s[1], v16.s[0]
; CHECK-GI-NEXT: mov v3.s[1], v17.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #16]
; CHECK-GI-NEXT: ldr s7, [sp, #48]
; CHECK-GI-NEXT: ldr s16, [sp, #80]
; CHECK-GI-NEXT: ldr s17, [sp, #112]
; CHECK-GI-NEXT: mov v4.s[2], w2
; CHECK-GI-NEXT: mov v5.s[2], w6
; CHECK-GI-NEXT: mov v0.s[2], v6.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v7.s[0]
; CHECK-GI-NEXT: mov v2.s[2], v16.s[0]
; CHECK-GI-NEXT: mov v3.s[2], v17.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #24]
; CHECK-GI-NEXT: ldr s7, [sp, #56]
; CHECK-GI-NEXT: ldr s16, [sp, #88]
; CHECK-GI-NEXT: ldr s17, [sp, #120]
; CHECK-GI-NEXT: mov v4.s[3], w3
; CHECK-GI-NEXT: mov v5.s[3], w7
; CHECK-GI-NEXT: mov v0.s[3], v6.s[0]
; CHECK-GI-NEXT: mov v1.s[3], v7.s[0]
; CHECK-GI-NEXT: mov v2.s[3], v16.s[0]
; CHECK-GI-NEXT: mov v3.s[3], v17.s[0]
; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v5.8h
; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v3.8h
; CHECK-GI-NEXT: uzp1 v0.16b, v4.16b, v0.16b
; CHECK-GI-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: saddlv h1, v1.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
; Widen every lane to 16 bits (signed) before summing.
%xx = sext <24 x i8> %x to <24 x i16>
%z = call i16 @llvm.vector.reduce.add.v24i16(<24 x i16> %xx)
ret i16 %z
}
; Sign-extends <32 x i8> to <32 x i16> and add-reduces it to one i16.
; In the expected code the input occupies two 16-byte registers, v0 and v1.
; Expected code for the llc runs without -global-isel: the low and high halves
; of v0 and v1 are widened and summed with saddl/saddl2, the partial sums are
; added, and one addv produces the result.
; Expected code for the -global-isel runs: each register is reduced with
; saddlv and the two scalar results are added.
define i16 @add_v32i8_v32i16_sext(<32 x i8> %x) {
; CHECK-SD-LABEL: add_v32i8_v32i16_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddl2 v2.8h, v0.16b, v1.16b
; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v32i8_v32i16_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: saddlv h1, v1.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
; Widen every lane to 16 bits (signed) before summing.
%xx = sext <32 x i8> %x to <32 x i16>
%z = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %xx)
ret i16 %z
}
; Irregularly sized vectors and larger extends
; Zero-extends <24 x i8> to <24 x i32> and add-reduces it to one i32.
; In the expected code the 24 lanes arrive as scalars: the first eight in
; w0-w7 and the remaining sixteen in stack slots [sp] .. [sp, #120], spaced
; 8 bytes apart.
; Four expected sequences follow, one per llc configuration:
;  * default selector, no dotprod: bytes are widened with ushll, summed into
;    32-bit lanes with uaddl/uaddl2/uaddw/uaddw2, then reduced with addv.
;  * default selector, +dotprod: a 16-byte and an 8-byte vector are each fed
;    to udot against an all-ones vector; the two accumulators are reduced
;    (addv / addp) and the scalars added.
;  * -global-isel, no dotprod: two uaddlv reductions into 16-bit results, a
;    scalar add, then an and with 0xffff to produce the 32-bit value.
;  * -global-isel, +dotprod: two udot operations against ones vectors, a
;    vector add of the accumulators, then addv.
define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v24i8_v24i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: fmov s0, w0
; CHECK-SD-BASE-NEXT: ldr b1, [sp, #64]
; CHECK-SD-BASE-NEXT: add x8, sp, #72
; CHECK-SD-BASE-NEXT: ldr b2, [sp]
; CHECK-SD-BASE-NEXT: add x9, sp, #80
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[1], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #8
; CHECK-SD-BASE-NEXT: mov v0.b[1], w1
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #16
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #88
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #24
; CHECK-SD-BASE-NEXT: mov v0.b[2], w2
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #96
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #32
; CHECK-SD-BASE-NEXT: mov v0.b[3], w3
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #104
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #40
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #112
; CHECK-SD-BASE-NEXT: mov v0.b[4], w4
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #48
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #120
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #56
; CHECK-SD-BASE-NEXT: mov v0.b[5], w5
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[7], [x8]
; CHECK-SD-BASE-NEXT: mov v0.b[6], w6
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-SD-BASE-NEXT: mov v0.b[7], w7
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v3.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: uaddw2 v1.4s, v3.4s, v2.8h
; CHECK-SD-BASE-NEXT: uaddw v0.4s, v0.4s, v2.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v24i8_v24i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: fmov s0, w0
; CHECK-SD-DOT-NEXT: mov x8, sp
; CHECK-SD-DOT-NEXT: ldr b1, [sp, #64]
; CHECK-SD-DOT-NEXT: add x9, sp, #72
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[1], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #80
; CHECK-SD-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: mov v0.b[1], w1
; CHECK-SD-DOT-NEXT: movi v5.8b, #1
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #88
; CHECK-SD-DOT-NEXT: mov v0.b[2], w2
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #96
; CHECK-SD-DOT-NEXT: mov v0.b[3], w3
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #104
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #112
; CHECK-SD-DOT-NEXT: mov v0.b[4], w4
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #120
; CHECK-SD-DOT-NEXT: mov v0.b[5], w5
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-SD-DOT-NEXT: mov v0.b[6], w6
; CHECK-SD-DOT-NEXT: udot v4.2s, v1.8b, v5.8b
; CHECK-SD-DOT-NEXT: mov v0.b[7], w7
; CHECK-SD-DOT-NEXT: addp v1.2s, v4.2s, v4.2s
; CHECK-SD-DOT-NEXT: fmov w9, s1
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[8], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #8
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[9], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #16
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[10], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #24
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[11], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #32
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[12], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #40
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[13], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #48
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[14], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #56
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[15], [x8]
; CHECK-SD-DOT-NEXT: udot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w9
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: fmov s4, w0
; CHECK-GI-BASE-NEXT: fmov s5, w4
; CHECK-GI-BASE-NEXT: ldr s0, [sp]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #8]
; CHECK-GI-BASE-NEXT: ldr s1, [sp, #32]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #40]
; CHECK-GI-BASE-NEXT: ldr s2, [sp, #64]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #72]
; CHECK-GI-BASE-NEXT: ldr s3, [sp, #96]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #104]
; CHECK-GI-BASE-NEXT: mov v4.s[1], w1
; CHECK-GI-BASE-NEXT: mov v5.s[1], w5
; CHECK-GI-BASE-NEXT: mov v0.s[1], v6.s[0]
; CHECK-GI-BASE-NEXT: mov v1.s[1], v7.s[0]
; CHECK-GI-BASE-NEXT: mov v2.s[1], v16.s[0]
; CHECK-GI-BASE-NEXT: mov v3.s[1], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #16]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #48]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #80]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #112]
; CHECK-GI-BASE-NEXT: mov v4.s[2], w2
; CHECK-GI-BASE-NEXT: mov v5.s[2], w6
; CHECK-GI-BASE-NEXT: mov v0.s[2], v6.s[0]
; CHECK-GI-BASE-NEXT: mov v1.s[2], v7.s[0]
; CHECK-GI-BASE-NEXT: mov v2.s[2], v16.s[0]
; CHECK-GI-BASE-NEXT: mov v3.s[2], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #24]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #56]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #88]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #120]
; CHECK-GI-BASE-NEXT: mov v4.s[3], w3
; CHECK-GI-BASE-NEXT: mov v5.s[3], w7
; CHECK-GI-BASE-NEXT: mov v0.s[3], v6.s[0]
; CHECK-GI-BASE-NEXT: mov v1.s[3], v7.s[0]
; CHECK-GI-BASE-NEXT: mov v2.s[3], v16.s[0]
; CHECK-GI-BASE-NEXT: mov v3.s[3], v17.s[0]
; CHECK-GI-BASE-NEXT: uzp1 v4.8h, v4.8h, v5.8h
; CHECK-GI-BASE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-BASE-NEXT: uzp1 v1.8h, v2.8h, v3.8h
; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v4.16b, v0.16b
; CHECK-GI-BASE-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: fmov s4, w0
; CHECK-GI-DOT-NEXT: fmov s5, w4
; CHECK-GI-DOT-NEXT: ldr s0, [sp]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #8]
; CHECK-GI-DOT-NEXT: ldr s1, [sp, #32]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #40]
; CHECK-GI-DOT-NEXT: ldr s2, [sp, #64]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #72]
; CHECK-GI-DOT-NEXT: ldr s3, [sp, #96]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #104]
; CHECK-GI-DOT-NEXT: mov v4.s[1], w1
; CHECK-GI-DOT-NEXT: mov v5.s[1], w5
; CHECK-GI-DOT-NEXT: mov v0.s[1], v6.s[0]
; CHECK-GI-DOT-NEXT: mov v1.s[1], v7.s[0]
; CHECK-GI-DOT-NEXT: mov v2.s[1], v16.s[0]
; CHECK-GI-DOT-NEXT: mov v3.s[1], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #16]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #48]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #80]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #112]
; CHECK-GI-DOT-NEXT: mov v4.s[2], w2
; CHECK-GI-DOT-NEXT: mov v5.s[2], w6
; CHECK-GI-DOT-NEXT: mov v0.s[2], v6.s[0]
; CHECK-GI-DOT-NEXT: mov v1.s[2], v7.s[0]
; CHECK-GI-DOT-NEXT: mov v2.s[2], v16.s[0]
; CHECK-GI-DOT-NEXT: mov v3.s[2], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #24]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #56]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #88]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #120]
; CHECK-GI-DOT-NEXT: mov v4.s[3], w3
; CHECK-GI-DOT-NEXT: mov v5.s[3], w7
; CHECK-GI-DOT-NEXT: mov v0.s[3], v6.s[0]
; CHECK-GI-DOT-NEXT: mov v1.s[3], v7.s[0]
; CHECK-GI-DOT-NEXT: mov v2.s[3], v16.s[0]
; CHECK-GI-DOT-NEXT: mov v3.s[3], v17.s[0]
; CHECK-GI-DOT-NEXT: uzp1 v4.8h, v4.8h, v5.8h
; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v2.8h, v3.8h
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
; CHECK-GI-DOT-NEXT: movi v3.8b, #1
; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v4.16b, v0.16b
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0]
; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v3.16b
; CHECK-GI-DOT-NEXT: udot v4.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
; Widen every lane to 32 bits (unsigned) before summing.
%xx = zext <24 x i8> %x to <24 x i32>
%z = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %xx)
ret i32 %z
}
; Zero-extends <32 x i8> to <32 x i32> and add-reduces it to one i32.
; In the expected code the input occupies two 16-byte registers, v0 and v1.
; Four expected sequences follow, one per llc configuration:
;  * default selector, no dotprod: ushll/ushll2 widen to 16 bits,
;    uaddl/uaddl2 sum into 32-bit lanes, vector adds combine the partial sums,
;    and addv reduces.
;  * default selector, +dotprod: two udot operations against an all-ones vector
;    accumulate into the same register, followed by addv.
;  * -global-isel, no dotprod: two uaddlv reductions into 16-bit results, a
;    scalar add, then an and with 0xffff to produce the 32-bit value.
;  * -global-isel, +dotprod: two udot operations into separate zeroed
;    accumulators, a vector add, then addv.
define i32 @add_v32i8_v32i32_zext(<32 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v2.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: uaddl v2.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: uaddl2 v5.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v1.4s, v5.4s, v4.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v32i8_v32i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT: udot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v32i8_v32i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: uaddlv h1, v1.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v32i8_v32i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.16b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v4.4s, v3.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
; Widen every lane to 32 bits (unsigned) before summing.
%xx = zext <32 x i8> %x to <32 x i32>
%z = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %xx)
ret i32 %z
}
; Sign-extends <24 x i8> to <24 x i32> and add-reduces it to one i32.
; In the expected code the 24 lanes arrive as scalars: the first eight in
; w0-w7 and the remaining sixteen in stack slots [sp] .. [sp, #120], spaced
; 8 bytes apart.
; Four expected sequences follow, one per llc configuration:
;  * default selector, no dotprod: bytes are widened with sshll, summed into
;    32-bit lanes with saddl/saddl2/saddw/saddw2, then reduced with addv.
;  * default selector, +dotprod: a 16-byte and an 8-byte vector are each fed
;    to sdot against an all-ones vector; the two accumulators are reduced
;    (addv / addp) and the scalars added.
;  * -global-isel, no dotprod: two saddlv reductions into 16-bit results, a
;    scalar add, then sxth to produce the 32-bit value.
;  * -global-isel, +dotprod: two sdot operations against ones vectors, a
;    vector add of the accumulators, then addv.
define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v24i8_v24i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: fmov s0, w0
; CHECK-SD-BASE-NEXT: ldr b1, [sp, #64]
; CHECK-SD-BASE-NEXT: add x8, sp, #72
; CHECK-SD-BASE-NEXT: ldr b2, [sp]
; CHECK-SD-BASE-NEXT: add x9, sp, #80
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[1], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #8
; CHECK-SD-BASE-NEXT: mov v0.b[1], w1
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #16
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #88
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #24
; CHECK-SD-BASE-NEXT: mov v0.b[2], w2
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #96
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #32
; CHECK-SD-BASE-NEXT: mov v0.b[3], w3
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #104
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #40
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #112
; CHECK-SD-BASE-NEXT: mov v0.b[4], w4
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #48
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #120
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #56
; CHECK-SD-BASE-NEXT: mov v0.b[5], w5
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[7], [x8]
; CHECK-SD-BASE-NEXT: mov v0.b[6], w6
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-SD-BASE-NEXT: mov v0.b[7], w7
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v3.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: saddw2 v1.4s, v3.4s, v2.8h
; CHECK-SD-BASE-NEXT: saddw v0.4s, v0.4s, v2.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v24i8_v24i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: fmov s0, w0
; CHECK-SD-DOT-NEXT: mov x8, sp
; CHECK-SD-DOT-NEXT: ldr b1, [sp, #64]
; CHECK-SD-DOT-NEXT: add x9, sp, #72
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[1], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #80
; CHECK-SD-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: mov v0.b[1], w1
; CHECK-SD-DOT-NEXT: movi v5.8b, #1
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #88
; CHECK-SD-DOT-NEXT: mov v0.b[2], w2
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #96
; CHECK-SD-DOT-NEXT: mov v0.b[3], w3
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #104
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #112
; CHECK-SD-DOT-NEXT: mov v0.b[4], w4
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #120
; CHECK-SD-DOT-NEXT: mov v0.b[5], w5
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-SD-DOT-NEXT: mov v0.b[6], w6
; CHECK-SD-DOT-NEXT: sdot v4.2s, v1.8b, v5.8b
; CHECK-SD-DOT-NEXT: mov v0.b[7], w7
; CHECK-SD-DOT-NEXT: addp v1.2s, v4.2s, v4.2s
; CHECK-SD-DOT-NEXT: fmov w9, s1
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[8], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #8
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[9], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #16
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[10], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #24
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[11], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #32
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[12], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #40
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[13], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #48
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[14], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #56
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[15], [x8]
; CHECK-SD-DOT-NEXT: sdot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w9
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: fmov s4, w0
; CHECK-GI-BASE-NEXT: fmov s5, w4
; CHECK-GI-BASE-NEXT: ldr s0, [sp]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #8]
; CHECK-GI-BASE-NEXT: ldr s1, [sp, #32]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #40]
; CHECK-GI-BASE-NEXT: ldr s2, [sp, #64]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #72]
; CHECK-GI-BASE-NEXT: ldr s3, [sp, #96]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #104]
; CHECK-GI-BASE-NEXT: mov v4.s[1], w1
; CHECK-GI-BASE-NEXT: mov v5.s[1], w5
; CHECK-GI-BASE-NEXT: mov v0.s[1], v6.s[0]
; CHECK-GI-BASE-NEXT: mov v1.s[1], v7.s[0]
; CHECK-GI-BASE-NEXT: mov v2.s[1], v16.s[0]
; CHECK-GI-BASE-NEXT: mov v3.s[1], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #16]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #48]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #80]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #112]
; CHECK-GI-BASE-NEXT: mov v4.s[2], w2
; CHECK-GI-BASE-NEXT: mov v5.s[2], w6
; CHECK-GI-BASE-NEXT: mov v0.s[2], v6.s[0]
; CHECK-GI-BASE-NEXT: mov v1.s[2], v7.s[0]
; CHECK-GI-BASE-NEXT: mov v2.s[2], v16.s[0]
; CHECK-GI-BASE-NEXT: mov v3.s[2], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #24]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #56]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #88]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #120]
; CHECK-GI-BASE-NEXT: mov v4.s[3], w3
; CHECK-GI-BASE-NEXT: mov v5.s[3], w7
; CHECK-GI-BASE-NEXT: mov v0.s[3], v6.s[0]
; CHECK-GI-BASE-NEXT: mov v1.s[3], v7.s[0]
; CHECK-GI-BASE-NEXT: mov v2.s[3], v16.s[0]
; CHECK-GI-BASE-NEXT: mov v3.s[3], v17.s[0]
; CHECK-GI-BASE-NEXT: uzp1 v4.8h, v4.8h, v5.8h
; CHECK-GI-BASE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-BASE-NEXT: uzp1 v1.8h, v2.8h, v3.8h
; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v4.16b, v0.16b
; CHECK-GI-BASE-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: fmov s4, w0
; CHECK-GI-DOT-NEXT: fmov s5, w4
; CHECK-GI-DOT-NEXT: ldr s0, [sp]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #8]
; CHECK-GI-DOT-NEXT: ldr s1, [sp, #32]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #40]
; CHECK-GI-DOT-NEXT: ldr s2, [sp, #64]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #72]
; CHECK-GI-DOT-NEXT: ldr s3, [sp, #96]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #104]
; CHECK-GI-DOT-NEXT: mov v4.s[1], w1
; CHECK-GI-DOT-NEXT: mov v5.s[1], w5
; CHECK-GI-DOT-NEXT: mov v0.s[1], v6.s[0]
; CHECK-GI-DOT-NEXT: mov v1.s[1], v7.s[0]
; CHECK-GI-DOT-NEXT: mov v2.s[1], v16.s[0]
; CHECK-GI-DOT-NEXT: mov v3.s[1], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #16]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #48]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #80]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #112]
; CHECK-GI-DOT-NEXT: mov v4.s[2], w2
; CHECK-GI-DOT-NEXT: mov v5.s[2], w6
; CHECK-GI-DOT-NEXT: mov v0.s[2], v6.s[0]
; CHECK-GI-DOT-NEXT: mov v1.s[2], v7.s[0]
; CHECK-GI-DOT-NEXT: mov v2.s[2], v16.s[0]
; CHECK-GI-DOT-NEXT: mov v3.s[2], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #24]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #56]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #88]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #120]
; CHECK-GI-DOT-NEXT: mov v4.s[3], w3
; CHECK-GI-DOT-NEXT: mov v5.s[3], w7
; CHECK-GI-DOT-NEXT: mov v0.s[3], v6.s[0]
; CHECK-GI-DOT-NEXT: mov v1.s[3], v7.s[0]
; CHECK-GI-DOT-NEXT: mov v2.s[3], v16.s[0]
; CHECK-GI-DOT-NEXT: mov v3.s[3], v17.s[0]
; CHECK-GI-DOT-NEXT: uzp1 v4.8h, v4.8h, v5.8h
; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v2.8h, v3.8h
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
; CHECK-GI-DOT-NEXT: movi v3.8b, #1
; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v4.16b, v0.16b
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0]
; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v3.16b
; CHECK-GI-DOT-NEXT: sdot v4.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
; Widen every lane to 32 bits (signed) before summing.
%xx = sext <24 x i8> %x to <24 x i32>
%z = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %xx)
ret i32 %z
}
; Sign-extends <32 x i8> to <32 x i32> and add-reduces it to one i32.
; In the expected code the input occupies two 16-byte registers, v0 and v1.
; Four expected sequences follow, one per llc configuration:
;  * default selector, no dotprod: sshll/sshll2 widen to 16 bits,
;    saddl/saddl2 sum into 32-bit lanes, vector adds combine the partial sums,
;    and addv reduces.
;  * default selector, +dotprod: two sdot operations against an all-ones vector
;    accumulate into the same register, followed by addv.
;  * -global-isel, no dotprod: two saddlv reductions into 16-bit results, a
;    scalar add, then sxth to produce the 32-bit value.
;  * -global-isel, +dotprod: two sdot operations into separate zeroed
;    accumulators, a vector add, then addv.
define i32 @add_v32i8_v32i32_sext(<32 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v2.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: saddl v2.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: saddl2 v5.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v1.4s, v5.4s, v4.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v32i8_v32i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT: sdot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v32i8_v32i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: saddlv h1, v1.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v32i8_v32i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.16b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v4.4s, v3.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
; Widen every lane to 32 bits (signed) before summing.
%xx = sext <32 x i8> %x to <32 x i32>
%z = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %xx)
ret i32 %z
}
define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
; CHECK-SD-BASE-LABEL: full:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-SD-BASE-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-BASE-NEXT: sxtw x8, w3
; CHECK-SD-BASE-NEXT: sxtw x9, w1
; CHECK-SD-BASE-NEXT: ldr d0, [x0]
; CHECK-SD-BASE-NEXT: ldr d1, [x2]
; CHECK-SD-BASE-NEXT: add x10, x0, x9
; CHECK-SD-BASE-NEXT: add x11, x2, x8
; CHECK-SD-BASE-NEXT: uabdl v0.8h, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: ldr d1, [x10]
; CHECK-SD-BASE-NEXT: ldr d2, [x11]
; CHECK-SD-BASE-NEXT: add x10, x10, x9
; CHECK-SD-BASE-NEXT: add x11, x11, x8
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: ldr d2, [x11]
; CHECK-SD-BASE-NEXT: add x11, x11, x8
; CHECK-SD-BASE-NEXT: uaddlp v0.4s, v0.8h
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: ldr d1, [x10]
; CHECK-SD-BASE-NEXT: add x10, x10, x9
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: ldr d2, [x11]
; CHECK-SD-BASE-NEXT: add x11, x11, x8
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: ldr d1, [x10]
; CHECK-SD-BASE-NEXT: add x10, x10, x9
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: ldr d2, [x11]
; CHECK-SD-BASE-NEXT: add x11, x11, x8
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: ldr d1, [x10]
; CHECK-SD-BASE-NEXT: add x10, x10, x9
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: ldr d2, [x11]
; CHECK-SD-BASE-NEXT: add x11, x11, x8
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: ldr d1, [x10]
; CHECK-SD-BASE-NEXT: add x10, x10, x9
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: ldr d2, [x11]
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: ldr d1, [x10]
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: ldr d2, [x11, x8]
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: ldr d1, [x10, x9]
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: full:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ldr d0, [x0]
; CHECK-SD-DOT-NEXT: ldr d1, [x2]
; CHECK-SD-DOT-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-SD-DOT-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-DOT-NEXT: sxtw x8, w3
; CHECK-SD-DOT-NEXT: sxtw x9, w1
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v3.8b, #1
; CHECK-SD-DOT-NEXT: uabd v0.8b, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: add x11, x2, x8
; CHECK-SD-DOT-NEXT: add x10, x0, x9
; CHECK-SD-DOT-NEXT: ldr d4, [x11]
; CHECK-SD-DOT-NEXT: add x11, x11, x8
; CHECK-SD-DOT-NEXT: ldr d1, [x10]
; CHECK-SD-DOT-NEXT: add x10, x10, x9
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: ldr d1, [x10]
; CHECK-SD-DOT-NEXT: ldr d4, [x11]
; CHECK-SD-DOT-NEXT: add x10, x10, x9
; CHECK-SD-DOT-NEXT: add x11, x11, x8
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: ldr d1, [x10]
; CHECK-SD-DOT-NEXT: ldr d4, [x11]
; CHECK-SD-DOT-NEXT: add x10, x10, x9
; CHECK-SD-DOT-NEXT: add x11, x11, x8
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: ldr d1, [x10]
; CHECK-SD-DOT-NEXT: ldr d4, [x11]
; CHECK-SD-DOT-NEXT: add x10, x10, x9
; CHECK-SD-DOT-NEXT: add x11, x11, x8
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: ldr d1, [x10]
; CHECK-SD-DOT-NEXT: ldr d4, [x11]
; CHECK-SD-DOT-NEXT: add x10, x10, x9
; CHECK-SD-DOT-NEXT: add x11, x11, x8
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: ldr d1, [x10]
; CHECK-SD-DOT-NEXT: ldr d4, [x11]
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: ldr d1, [x10, x9]
; CHECK-SD-DOT-NEXT: ldr d4, [x11, x8]
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-LABEL: full:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-GI-NEXT: sxtw x8, w3
; CHECK-GI-NEXT: sxtw x9, w1
; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: ldr d1, [x2]
; CHECK-GI-NEXT: add x10, x0, x9
; CHECK-GI-NEXT: add x11, x2, x8
; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT: ldr d2, [x10]
; CHECK-GI-NEXT: add x10, x10, x9
; CHECK-GI-NEXT: add x12, x11, x8
; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-NEXT: ldr d3, [x11]
; CHECK-GI-NEXT: ldr d4, [x10]
; CHECK-GI-NEXT: ldr d5, [x12]
; CHECK-GI-NEXT: add x10, x10, x9
; CHECK-GI-NEXT: add x11, x12, x8
; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0
; CHECK-GI-NEXT: ushll v4.8h, v4.8b, #0
; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0
; CHECK-GI-NEXT: uabdl v6.4s, v0.4h, v1.4h
; CHECK-GI-NEXT: uabdl2 v0.4s, v0.8h, v1.8h
; CHECK-GI-NEXT: ldr d1, [x10]
; CHECK-GI-NEXT: ldr d7, [x11]
; CHECK-GI-NEXT: add x10, x10, x9
; CHECK-GI-NEXT: add x11, x11, x8
; CHECK-GI-NEXT: uabdl v16.4s, v2.4h, v3.4h
; CHECK-GI-NEXT: uabdl2 v2.4s, v2.8h, v3.8h
; CHECK-GI-NEXT: uabdl v3.4s, v4.4h, v5.4h
; CHECK-GI-NEXT: uabdl2 v4.4s, v4.8h, v5.8h
; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0
; CHECK-GI-NEXT: ldr d5, [x10]
; CHECK-GI-NEXT: ldr d17, [x11]
; CHECK-GI-NEXT: add x10, x10, x9
; CHECK-GI-NEXT: add x11, x11, x8
; CHECK-GI-NEXT: add v0.4s, v6.4s, v0.4s
; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0
; CHECK-GI-NEXT: ushll v17.8h, v17.8b, #0
; CHECK-GI-NEXT: add v2.4s, v16.4s, v2.4s
; CHECK-GI-NEXT: add v3.4s, v3.4s, v4.4s
; CHECK-GI-NEXT: uabdl v4.4s, v1.4h, v7.4h
; CHECK-GI-NEXT: uabdl2 v1.4s, v1.8h, v7.8h
; CHECK-GI-NEXT: ldr d7, [x10]
; CHECK-GI-NEXT: ldr d16, [x11]
; CHECK-GI-NEXT: add x10, x10, x9
; CHECK-GI-NEXT: add x11, x11, x8
; CHECK-GI-NEXT: ldr d18, [x10]
; CHECK-GI-NEXT: ldr d20, [x10, x9]
; CHECK-GI-NEXT: ldr d19, [x11]
; CHECK-GI-NEXT: ldr d21, [x11, x8]
; CHECK-GI-NEXT: uabdl v6.4s, v5.4h, v17.4h
; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0
; CHECK-GI-NEXT: ushll v16.8h, v16.8b, #0
; CHECK-GI-NEXT: uabdl2 v5.4s, v5.8h, v17.8h
; CHECK-GI-NEXT: ushll v17.8h, v18.8b, #0
; CHECK-GI-NEXT: ushll v18.8h, v19.8b, #0
; CHECK-GI-NEXT: add v1.4s, v4.4s, v1.4s
; CHECK-GI-NEXT: ushll v4.8h, v20.8b, #0
; CHECK-GI-NEXT: ushll v19.8h, v21.8b, #0
; CHECK-GI-NEXT: addv s2, v2.4s
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: addv s3, v3.4s
; CHECK-GI-NEXT: uabdl v20.4s, v7.4h, v16.4h
; CHECK-GI-NEXT: uabdl2 v7.4s, v7.8h, v16.8h
; CHECK-GI-NEXT: add v5.4s, v6.4s, v5.4s
; CHECK-GI-NEXT: uabdl v6.4s, v17.4h, v18.4h
; CHECK-GI-NEXT: uabdl2 v16.4s, v17.8h, v18.8h
; CHECK-GI-NEXT: uabdl v17.4s, v4.4h, v19.4h
; CHECK-GI-NEXT: uabdl2 v4.4s, v4.8h, v19.8h
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: fmov w10, s3
; CHECK-GI-NEXT: add v7.4s, v20.4s, v7.4s
; CHECK-GI-NEXT: add v0.4s, v17.4s, v4.4s
; CHECK-GI-NEXT: addv s4, v5.4s
; CHECK-GI-NEXT: add v2.4s, v6.4s, v16.4s
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w10, w8
; CHECK-GI-NEXT: addv s3, v7.4s
; CHECK-GI-NEXT: addv s1, v2.4s
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: add w8, w9, w8
; CHECK-GI-NEXT: fmov w9, s4
; CHECK-GI-NEXT: add w8, w9, w8
; CHECK-GI-NEXT: fmov w9, s3
; CHECK-GI-NEXT: add w8, w9, w8
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w9, w8
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: add w0, w9, w8
; CHECK-GI-NEXT: ret
entry:
%idx.ext8 = sext i32 %s2 to i64
%idx.ext = sext i32 %s1 to i64
%0 = load <8 x i8>, ptr %p1, align 1
%1 = zext <8 x i8> %0 to <8 x i32>
%2 = load <8 x i8>, ptr %p2, align 1
%3 = zext <8 x i8> %2 to <8 x i32>
%4 = sub nsw <8 x i32> %1, %3
%5 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %4, i1 true)
%6 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
%add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext
%add.ptr9 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext8
%7 = load <8 x i8>, ptr %add.ptr, align 1
%8 = zext <8 x i8> %7 to <8 x i32>
%9 = load <8 x i8>, ptr %add.ptr9, align 1
%10 = zext <8 x i8> %9 to <8 x i32>
%11 = sub nsw <8 x i32> %8, %10
%12 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %11, i1 true)
%13 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12)
%op.rdx.1 = add i32 %13, %6
%add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
%add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
%14 = load <8 x i8>, ptr %add.ptr.1, align 1
%15 = zext <8 x i8> %14 to <8 x i32>
%16 = load <8 x i8>, ptr %add.ptr9.1, align 1
%17 = zext <8 x i8> %16 to <8 x i32>
%18 = sub nsw <8 x i32> %15, %17
%19 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %18, i1 true)
%20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
%op.rdx.2 = add i32 %20, %op.rdx.1
%add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
%add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
%21 = load <8 x i8>, ptr %add.ptr.2, align 1
%22 = zext <8 x i8> %21 to <8 x i32>
%23 = load <8 x i8>, ptr %add.ptr9.2, align 1
%24 = zext <8 x i8> %23 to <8 x i32>
%25 = sub nsw <8 x i32> %22, %24
%26 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %25, i1 true)
%27 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %26)
%op.rdx.3 = add i32 %27, %op.rdx.2
%add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %idx.ext
%add.ptr9.3 = getelementptr inbounds i8, ptr %add.ptr9.2, i64 %idx.ext8
%28 = load <8 x i8>, ptr %add.ptr.3, align 1
%29 = zext <8 x i8> %28 to <8 x i32>
%30 = load <8 x i8>, ptr %add.ptr9.3, align 1
%31 = zext <8 x i8> %30 to <8 x i32>
%32 = sub nsw <8 x i32> %29, %31
%33 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %32, i1 true)
%34 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %33)
%op.rdx.4 = add i32 %34, %op.rdx.3
%add.ptr.4 = getelementptr inbounds i8, ptr %add.ptr.3, i64 %idx.ext
%add.ptr9.4 = getelementptr inbounds i8, ptr %add.ptr9.3, i64 %idx.ext8
%35 = load <8 x i8>, ptr %add.ptr.4, align 1
%36 = zext <8 x i8> %35 to <8 x i32>
%37 = load <8 x i8>, ptr %add.ptr9.4, align 1
%38 = zext <8 x i8> %37 to <8 x i32>
%39 = sub nsw <8 x i32> %36, %38
%40 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %39, i1 true)
%41 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40)
%op.rdx.5 = add i32 %41, %op.rdx.4
%add.ptr.5 = getelementptr inbounds i8, ptr %add.ptr.4, i64 %idx.ext
%add.ptr9.5 = getelementptr inbounds i8, ptr %add.ptr9.4, i64 %idx.ext8
%42 = load <8 x i8>, ptr %add.ptr.5, align 1
%43 = zext <8 x i8> %42 to <8 x i32>
%44 = load <8 x i8>, ptr %add.ptr9.5, align 1
%45 = zext <8 x i8> %44 to <8 x i32>
%46 = sub nsw <8 x i32> %43, %45
%47 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %46, i1 true)
%48 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
%op.rdx.6 = add i32 %48, %op.rdx.5
%add.ptr.6 = getelementptr inbounds i8, ptr %add.ptr.5, i64 %idx.ext
%add.ptr9.6 = getelementptr inbounds i8, ptr %add.ptr9.5, i64 %idx.ext8
%49 = load <8 x i8>, ptr %add.ptr.6, align 1
%50 = zext <8 x i8> %49 to <8 x i32>
%51 = load <8 x i8>, ptr %add.ptr9.6, align 1
%52 = zext <8 x i8> %51 to <8 x i32>
%53 = sub nsw <8 x i32> %50, %52
%54 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %53, i1 true)
%55 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %54)
%op.rdx.7 = add i32 %55, %op.rdx.6
ret i32 %op.rdx.7
}
; extract_hi_lo: add-reduction of zext(low half of %a) + zext(high half of %a).
; Both <4 x i16> halves come from the same <8 x i16> argument, so the result
; is the sum of all eight i16 lanes after zero-extension to i32.
; Expected code per the assertions below:
;   - SelectionDAG runs: a single uaddlv across the whole v0.8h register.
;   - GlobalISel runs: ushll widens the low half, uaddw2 adds the high half,
;     then addv reduces the four i32 lanes.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define i32 @extract_hi_lo(<8 x i16> %a) {
; CHECK-SD-LABEL: extract_hi_lo:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddlv s0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: extract_hi_lo:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
; Mask <0,1,2,3> picks lanes 0-3 of %a and mask <4,5,6,7> picks lanes 4-7;
; every index is below 8, so the undef second operand is never read.
%e1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%e2 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%z1 = zext <4 x i16> %e1 to <4 x i32>
%z2 = zext <4 x i16> %e2 to <4 x i32>
%z4 = add <4 x i32> %z1, %z2
%z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4)
ret i32 %z5
}
; extract_hi_hi: add-reduction of zext(high half of %a) added to itself.
; Only lanes 4-7 of the <8 x i16> argument contribute, and each is counted
; twice.
; Expected code per the assertions below:
;   - SelectionDAG runs: ext #8 moves the high half into the low 64 bits,
;     mov v0.d[1], v0.d[0] duplicates it into the upper 64 bits, and uaddlv
;     then sums all eight i16 lanes (three vector instructions).
;   - GlobalISel runs: uaddl2 widens and adds the high half to itself, then
;     addv reduces (two vector instructions).
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define i32 @extract_hi_hi(<8 x i16> %a) {
; CHECK-SD-LABEL: extract_hi_hi:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mov v0.d[1], v0.d[0]
; CHECK-SD-NEXT: uaddlv s0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: extract_hi_hi:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddl2 v0.4s, v0.8h, v0.8h
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
; Mask <4,5,6,7> picks lanes 4-7 of %a; the undef second operand is never read.
%e2 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%z2 = zext <4 x i16> %e2 to <4 x i32>
; Both add operands are the same value, so each widened lane is doubled.
%z4 = add <4 x i32> %z2, %z2
%z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4)
ret i32 %z5
}
; extract_lo_lo: add-reduction of zext(low half of %a) added to itself.
; Only lanes 0-3 of the <8 x i16> argument contribute, and each is counted
; twice.
; Expected code per the assertions below:
;   - SelectionDAG runs: mov v0.d[1], v0.d[0] copies the low 64 bits into the
;     upper 64 bits, then uaddlv sums all eight i16 lanes.
;   - GlobalISel runs: uaddl widens and adds the low half to itself, then
;     addv reduces.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define i32 @extract_lo_lo(<8 x i16> %a) {
; CHECK-SD-LABEL: extract_lo_lo:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov v0.d[1], v0.d[0]
; CHECK-SD-NEXT: uaddlv s0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: extract_lo_lo:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddl v0.4s, v0.4h, v0.4h
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
; Mask <0,1,2,3> picks lanes 0-3 of %a; the undef second operand is never read.
%e1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%z1 = zext <4 x i16> %e1 to <4 x i32>
; Both add operands are the same value, so each widened lane is doubled.
%z4 = add <4 x i32> %z1, %z1
%z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4)
ret i32 %z5
}
; Intrinsic declarations: llvm.abs on <8 x i32> and the
; llvm.vector.reduce.add overloads for the i8/i16/i32/i64 vector types
; listed below.
; NOTE(review): the llvm.abs declaration references attribute group #1, but
; no "attributes #1 = ..." definition is visible in this part of the file;
; confirm whether the reference is intentional or a leftover.
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v24i16(<24 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)