Files
clang-p2996/llvm/test/CodeGen/AArch64/vecreduce-add.ll
Zhaoxuan Jiang 147c5d6686 [AArch64] Allow LDR merge with same destination register by renaming (#71908)
The patch is based on a reverted patch:
https://reviews.llvm.org/D103597. It was trying to rename registers
before alias check, which is not safe and causes miscompiles. This patch
does 2 things:

1. Do the renaming only after the necessary checks, including the alias check, have passed.
2. Rename the register for the instructions between the pairs and
combine the second load into the first. By doing so we can just check
the renamability between the pairs and avoid scanning an unknown number of
instructions before/after the pairs.

Necessary refactoring has been made in order to reuse as much code as
possible with STR renaming.
2023-11-23 08:21:27 +00:00

6643 lines
254 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BASE,CHECK-SD-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-SD-DOT
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BASE,CHECK-GI-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-GI-DOT
; CHECK-GI-BASE: warning: Instruction selection used fallback path for full
; Add-reduce a <2 x i32> vector to a scalar i32. Every llc configuration is
; expected to emit a pairwise addp on the .2s lanes followed by fmov to w0.
define i32 @addv_v2i32(<2 x i32> %a) {
; CHECK-LABEL: addv_v2i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
ret i32 %arg1
}
; Add-reduce a <4 x i16> vector to a scalar i16. Every llc configuration is
; expected to emit a single addv into h0 followed by fmov to w0.
define i16 @addv_v4i16(<4 x i16> %a) {
; CHECK-LABEL: addv_v4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv h0, v0.4h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
ret i16 %arg1
}
; Add-reduce a <4 x i32> vector to a scalar i32 with no extension. Every llc
; configuration is expected to emit addv into s0 followed by fmov to w0.
define i32 @add_v4i32_v4i32(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
ret i32 %z
}
; Add-reduce a <8 x i8> vector to a scalar i8. Every llc configuration is
; expected to emit addv into b0 followed by fmov to w0.
define i8 @addv_v8i8(<8 x i8> %a) {
; CHECK-LABEL: addv_v8i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv b0, v0.8b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%arg1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
ret i8 %arg1
}
; Zero-extend <4 x i32> to <4 x i64>, then add-reduce to i64.
; Runs without -global-isel expect the extend and reduce to fold into a single
; uaddlv; the -global-isel runs expect ushll + uaddw2 followed by addp.
; The +dotprod attribute does not change the expected output for either selector.
define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Sign-extend <4 x i32> to <4 x i64>, then add-reduce to i64.
; Runs without -global-isel expect the extend and reduce to fold into a single
; saddlv; the -global-isel runs expect sshll + saddw2 followed by addp.
; The +dotprod attribute does not change the expected output for either selector.
define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Zero-extend <2 x i32> to <2 x i64>, then add-reduce to i64. Every llc
; configuration is expected to emit ushll to widen, then a scalar addp.
define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Sign-extend <2 x i32> to <2 x i64>, then add-reduce to i64. Every llc
; configuration is expected to emit sshll to widen, then a scalar addp.
define i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Zero-extend <8 x i16> to <8 x i32>, then add-reduce to i32.
; Runs without -global-isel expect a single uaddlv on the .8h source; the
; -global-isel runs expect ushll + uaddw2 followed by addv.
; The +dotprod attribute does not change the expected output for either selector.
define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlv s0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Sign-extend <8 x i16> to <8 x i32>, then add-reduce to i32.
; Runs without -global-isel expect a single saddlv on the .8h source; the
; -global-isel runs expect sshll + saddw2 followed by addv.
; The +dotprod attribute does not change the expected output for either selector.
define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlv s0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Zero-extend <4 x i16> to <4 x i32>, then add-reduce to i32. Every llc
; configuration is expected to emit ushll to widen, then addv.
define i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Sign-extend <4 x i16> to <4 x i32>, then add-reduce to i32. Every llc
; configuration is expected to emit sshll to widen, then addv.
define i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Add-reduce <8 x i16> to an i16 that is returned with the zeroext attribute.
; Runs without -global-isel expect addv + fmov directly into w0; the
; -global-isel runs expect an additional explicit uxth on the moved result.
define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i16:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i16:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i16:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: uxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i16:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: uxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
ret i16 %z
}
; Zero-extend <8 x i16> to <8 x i64> (a two-step widening), then add-reduce
; to i64. Runs without -global-isel expect ushll2/ushll to .4s, then
; uaddl2/uaddl to .2d, one add, and a scalar addp. The -global-isel runs
; expect ushll/ushll2 to .4s, ushll to .2d plus uaddw2 for each half, one add,
; and addp. The +dotprod attribute does not change the expected output.
define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Sign-extend <8 x i16> to <8 x i64> (a two-step widening), then add-reduce
; to i64. Runs without -global-isel expect sshll2/sshll to .4s, then
; saddl2/saddl to .2d, one add, and a scalar addp. The -global-isel runs
; expect sshll/sshll2 to .4s, sshll to .2d plus saddw2 for each half, one add,
; and addp. The +dotprod attribute does not change the expected output.
define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Zero-extend <4 x i16> to <4 x i64>, then add-reduce to i64.
; Runs without -global-isel expect ushll to .4s followed by uaddlv; the
; -global-isel runs expect ushll to .4s, ushll to .2d, uaddw2, then addp.
; The +dotprod attribute does not change the expected output for either selector.
define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Sign-extend <4 x i16> to <4 x i64>, then add-reduce to i64.
; Runs without -global-isel expect sshll to .4s followed by saddlv; the
; -global-isel runs expect sshll to .4s, sshll to .2d, saddw2, then addp.
; The +dotprod attribute does not change the expected output for either selector.
define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Zero-extend <2 x i16> to <2 x i64>, then add-reduce to i64.
; Runs without -global-isel expect the input to be masked first (movi + and on
; the 64-bit vector) and then widened with ushll; the -global-isel runs expect
; the ushll first and the mask (movi + and) applied on the 128-bit vector.
; Both orders finish with a scalar addp.
define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v2i16_v2i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-SD-BASE-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v2i16_v2i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-SD-DOT-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v2i16_v2i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x0000000000ffff
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v2i16_v2i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x0000000000ffff
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Sign-extend <2 x i16> to <2 x i64>, then add-reduce to i64. Every llc
; configuration is expected to widen with ushll, sign-extend in-register with
; a shl #48 / sshr #48 pair, and finish with a scalar addp.
define i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #48
; CHECK-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Zero-extend <16 x i8> to <16 x i32>, then add-reduce to i32.
; With -mattr=+dotprod both selectors share one expectation: udot of the input
; against a vector of ones (movi #1) into a zeroed accumulator, then addv.
; Without +dotprod, the non -global-isel run expects ushll2/ushll +
; uaddl2/uaddl + add + addv, and the -global-isel run expects ushll/ushll2 to
; .8h, ushll to .4s plus uaddw2 for each half, add, then addv.
define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: add_v16i8_v16i32_zext:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v1.16b, #1
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
; CHECK-DOT-NEXT: addv s0, v2.4s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v1.4s, v2.4s, v1.8h
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v3.4s, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
ret i32 %z
}
; Sign-extend <16 x i8> to <16 x i32>, then add-reduce to i32.
; With -mattr=+dotprod both selectors share one expectation: sdot of the input
; against a vector of ones (movi #1) into a zeroed accumulator, then addv.
; Without +dotprod, the non -global-isel run expects sshll2/sshll +
; saddl2/saddl + add + addv, and the -global-isel run expects sshll/sshll2 to
; .8h, sshll to .4s plus saddw2 for each half, add, then addv.
define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: add_v16i8_v16i32_sext:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v1.16b, #1
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
; CHECK-DOT-NEXT: addv s0, v2.4s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v1.4s, v2.4s, v1.8h
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v3.4s, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
ret i32 %z
}
; Zero-extend <8 x i8> to <8 x i32>, then add-reduce to i32.
; With -mattr=+dotprod both selectors expect a 64-bit udot against a vector of
; ones into a zeroed accumulator, then a pairwise addp; the two selectors only
; differ in which registers hold the ones vector and the accumulator.
; Without +dotprod, the non -global-isel run expects ushll + uaddlv and the
; -global-isel run expects ushll, ushll, uaddw2, then addv.
define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: udot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Sign-extend <8 x i8> to <8 x i32>, then add-reduce to i32.
; With -mattr=+dotprod both selectors expect a 64-bit sdot against a vector of
; ones into a zeroed accumulator, then a pairwise addp; the two selectors only
; differ in which registers hold the ones vector and the accumulator.
; Without +dotprod, the non -global-isel run expects sshll + saddlv and the
; -global-isel run expects sshll, sshll, saddw2, then addv.
define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: sdot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Zero-extend <4 x i8> to <4 x i32>, then add-reduce to i32.
; Runs without -global-isel expect a bic on the .4h lanes (immediate #255
; shifted left by 8) before ushll + addv; the -global-isel runs expect ushll
; first, then a movi/and mask on the widened vector, then addv.
; The +dotprod attribute does not change the expected output for either selector.
define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v4i8_v4i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i8_v4i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i8_v4i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x0000ff000000ff
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i8_v4i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x0000ff000000ff
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Sign-extend <4 x i8> to <4 x i32>, then add-reduce to i32. Every llc
; configuration is expected to widen with ushll, sign-extend in-register with
; a shl #24 / sshr #24 pair, and finish with addv.
define i32 @add_v4i8_v4i32_sext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: shl v0.4s, v0.4s, #24
; CHECK-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Zero-extend <16 x i8> to <16 x i16>, then add-reduce to an i16 returned with
; the zeroext attribute. Runs without -global-isel expect uaddlp + addv and an
; fmov directly into w0; the -global-isel runs expect ushll + uaddw2 + addv,
; with the result moved to w8 and then explicitly zero-extended by uxth.
define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlp v0.8h, v0.16b
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlp v0.8h, v0.16b
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: uxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: uxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
ret i16 %z
}
; Sign-extend <16 x i8> to <16 x i16>, then add-reduce to an i16 returned with
; the signext attribute. Runs without -global-isel expect saddlp + addv and a
; sign-extending smov from lane h[0]; the -global-isel runs expect
; sshll + saddw2 + addv, with the result moved to w8 and then sign-extended
; by sxth.
define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlp v0.8h, v0.16b
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: smov w0, v0.h[0]
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlp v0.8h, v0.16b
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: smov w0, v0.h[0]
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: sxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
ret i16 %z
}
; Zero-extend <8 x i8> to <8 x i16>, then add-reduce to an i16 returned with
; the zeroext attribute. All runs expect ushll + addv; runs without
; -global-isel move the result straight into w0, while the -global-isel runs
; move it to w8 and add an explicit uxth.
define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: uxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: uxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
ret i16 %z
}
; Sign-extend <8 x i8> to <8 x i16>, then add-reduce to an i16 returned with
; the signext attribute. All runs expect sshll + addv; runs without
; -global-isel extract the result with a sign-extending smov from lane h[0],
; while the -global-isel runs use fmov to w8 followed by sxth.
define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: smov w0, v0.h[0]
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: smov w0, v0.h[0]
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: sxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
ret i16 %z
}
; Add-reduce <16 x i8> to an i8 that is returned with the zeroext attribute.
; Runs without -global-isel expect addv + fmov directly into w0; the
; -global-isel runs expect an additional explicit uxtb on the moved result.
define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: addv b0, v0.16b
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: addv b0, v0.16b
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv b0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: uxtb w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv b0, v0.16b
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: uxtb w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
ret i8 %z
}
; Zero-extend <16 x i8> to <16 x i64> (a three-step widening), then add-reduce
; to i64. Runs without -global-isel expect ushll/ushll2 up to .4s, then
; uaddl/uaddl2 to form .2d partial sums, three vector adds, and a scalar addp.
; The -global-isel runs expect ushll/ushll2 up to .4s, then ushll to .2d plus
; uaddw2 for each of the four quarters, three vector adds, and addp.
; The +dotprod attribute does not change the expected output for either selector.
define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v2.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-BASE-NEXT: uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-BASE-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll2 v2.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-DOT-NEXT: uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-DOT-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: ushll v5.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: ushll v7.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-BASE-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-DOT-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: ushll v5.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: ushll v7.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-DOT-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
ret i64 %z
}
; Test: sign-extend each of the 16 i8 lanes of %x to i64, then sum all lanes
; with llvm.vector.reduce.add and return the i64 total.
; Signed counterpart of the zext variant: the expectations use the signed
; widening forms (sshll/saddl/saddw2). The SD-* (no -global-isel) and GI-*
; (-global-isel) sequences differ; DOT expectations equal the BASE ones.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v2.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-BASE-NEXT: saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-BASE-NEXT: saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: sshll2 v2.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-DOT-NEXT: saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-DOT-NEXT: saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: sshll v5.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: sshll v7.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-BASE-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-DOT-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: sshll v5.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: sshll v7.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-DOT-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
ret i64 %z
}
; Test: zero-extend the 8 i8 lanes of %x to i64 and return their sum
; (llvm.vector.reduce.add over <8 x i64>).
; Expectations differ between the runs without -global-isel (SD-*) and with
; it (GI-*); the +dotprod (DOT) expectations equal the BASE ones.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Test: sign-extend the 8 i8 lanes of %x to i64 and return their sum
; (llvm.vector.reduce.add over <8 x i64>).
; Signed counterpart of the zext variant; expectations use sshll/saddl/saddw2.
; SD-* (no -global-isel) and GI-* (-global-isel) sequences differ, while the
; +dotprod (DOT) expectations equal the BASE ones.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Test: zero-extend the 4 i8 lanes of %x to i64 and return their sum
; (llvm.vector.reduce.add over <4 x i64>).
; The expectations operate on v0.4h, i.e. the <4 x i8> argument is handled in
; 16-bit lanes. Runs without -global-isel (SD-*) mask with bic and finish with
; a single uaddlv; the -global-isel runs (GI-*) widen to 2d, mask with 0xff
; and add. The +dotprod (DOT) expectations equal the BASE ones.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-BASE-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-GI-BASE-NEXT: and v2.16b, v2.16b, v1.16b
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-DOT-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-GI-DOT-NEXT: and v2.16b, v2.16b, v1.16b
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Test: sign-extend the 4 i8 lanes of %x to i64 and return their sum
; (llvm.vector.reduce.add over <4 x i64>).
; Every expectation widens with ushll to 2d and then sign-extends in-register
; with shl #56 followed by sshr/ssra #56 (ssra also accumulates the second
; half). The SD-* (no -global-isel) and GI-* (-global-isel) sequences differ
; only in which half goes to which register; DOT equals BASE.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: addp d0, v1.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: addp d0, v1.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Test: zero-extend the 2 i8 lanes of %x to i64 and return their sum
; (llvm.vector.reduce.add over <2 x i64>).
; The expectations read the argument as v0.2s. Runs without -global-isel
; (SD-*) mask each 32-bit lane to 0xff before widening; the -global-isel runs
; (GI-*) widen first and mask each 64-bit lane afterwards. DOT equals BASE.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v2i8_v2i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-BASE-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v2i8_v2i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-DOT-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v2i8_v2i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v2i8_v2i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Test: sign-extend the 2 i8 lanes of %x to i64 and return their sum
; (llvm.vector.reduce.add over <2 x i64>).
; Only the common prefix is used, so all four RUN configurations expect the
; same code: widen with ushll, sign-extend in-register via shl/sshr #56, then
; a pairwise add.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #56
; CHECK-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Test: plain reduction with no extension - returns the sum of the two i64
; lanes of %x via llvm.vector.reduce.add.
; Only the common prefix is used, so all four RUN configurations expect the
; same code: one pairwise add (addp) and a move to x0.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v2i64_v2i64(<2 x i64> %x) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
ret i64 %z
}
; Test: reduction plus scalar accumulator - sums the four i32 lanes of %x
; with llvm.vector.reduce.add and adds the scalar %a to the result.
; Only the common prefix is used, so all four RUN configurations expect the
; same code: addv for the reduction, then a scalar add with w0.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
%r = add i32 %z, %a
ret i32 %r
}
; Test: zero-extend the 4 i32 lanes of %x to i64, sum them with
; llvm.vector.reduce.add, then add the scalar accumulator %a.
; Runs without -global-isel (SD-*) expect a single uaddlv; the -global-isel
; runs (GI-*) expect ushll + uaddw2 + addp. DOT expectations equal BASE.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Test: sign-extend the 4 i32 lanes of %x to i64, sum them with
; llvm.vector.reduce.add, then add the scalar accumulator %a.
; Runs without -global-isel (SD-*) expect a single saddlv; the -global-isel
; runs (GI-*) expect sshll + saddw2 + addp. DOT expectations equal BASE.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Test: zero-extend the 2 i32 lanes of %x to i64, sum them with
; llvm.vector.reduce.add, then add the scalar accumulator %a.
; Only the common prefix is used, so all four RUN configurations expect the
; same code: ushll to widen, addp to reduce, scalar add with x0.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Test: sign-extend the 2 i32 lanes of %x to i64, sum them with
; llvm.vector.reduce.add, then add the scalar accumulator %a.
; Only the common prefix is used, so all four RUN configurations expect the
; same code: sshll to widen, addp to reduce, scalar add with x0.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Test: zero-extend the 8 i16 lanes of %x to i32, sum them with
; llvm.vector.reduce.add, then add the scalar accumulator %a.
; Runs without -global-isel (SD-*) expect a single uaddlv; the -global-isel
; runs (GI-*) expect ushll + uaddw2 + addv. DOT expectations equal BASE.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlv s0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Test: sign-extend the 8 i16 lanes of %x to i32, sum them with
; llvm.vector.reduce.add, then add the scalar accumulator %a.
; Runs without -global-isel (SD-*) expect a single saddlv; the -global-isel
; runs (GI-*) expect sshll + saddw2 + addv. DOT expectations equal BASE.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlv s0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Test: zero-extend the 4 i16 lanes of %x to i32, sum them with
; llvm.vector.reduce.add, then add the scalar accumulator %a.
; Only the common prefix is used, so all four RUN configurations expect the
; same code: ushll to widen, addv to reduce, scalar add with w0.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Test: sign-extend the 4 i16 lanes of %x to i32, sum them with
; llvm.vector.reduce.add, then add the scalar accumulator %a.
; Only the common prefix is used, so all four RUN configurations expect the
; same code: sshll to widen, addv to reduce, scalar add with w0.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Test: same-width reduction plus accumulator - sums the eight i16 lanes of
; %x with llvm.vector.reduce.add, adds the i16 scalar %a, and returns the i16
; result with the zeroext return attribute.
; Runs without -global-isel (SD-*) expect a plain `add w8, w8, w0`; the
; -global-isel runs (GI-*) expect the extended-register form
; `add w8, w0, w8, uxth`. DOT expectations equal BASE.
; NOTE(review): the trailing `and w0, w8, #0xffff` in every expectation
; presumably implements the zeroext return - confirm if this matters.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i16_acc:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w8, w8, w0
; CHECK-SD-BASE-NEXT: and w0, w8, #0xffff
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i16_acc:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w8, w8, w0
; CHECK-SD-DOT-NEXT: and w0, w8, #0xffff
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i16_acc:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxth
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i16_acc:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxth
; CHECK-GI-DOT-NEXT: and w0, w8, #0xffff
; CHECK-GI-DOT-NEXT: ret
entry:
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
%r = add i16 %z, %a
ret i16 %r
}
; Test: zero-extend the 8 i16 lanes of %x to i64 (a 4x widening), sum them
; with llvm.vector.reduce.add, then add the scalar accumulator %a.
; Runs without -global-isel (SD-*) expect ushll/ushll2 + uaddl/uaddl2; the
; -global-isel runs (GI-*) expect ushll chains + uaddw2. DOT equals BASE.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Test: sign-extend the 8 i16 lanes of %x to i64 (a 4x widening), sum them
; with llvm.vector.reduce.add, then add the scalar accumulator %a.
; Runs without -global-isel (SD-*) expect sshll/sshll2 + saddl/saddl2; the
; -global-isel runs (GI-*) expect sshll chains + saddw2. DOT equals BASE.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Test: zero-extend the 4 i16 lanes of %x to i64, sum them with
; llvm.vector.reduce.add, then add the scalar accumulator %a.
; Runs without -global-isel (SD-*) expect ushll followed by a single uaddlv;
; the -global-isel runs (GI-*) expect ushll + ushll + uaddw2 + addp.
; DOT expectations equal BASE.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Test: sign-extend the 4 i16 lanes of %x to i64, sum them with
; llvm.vector.reduce.add, then add the scalar accumulator %a.
; Runs without -global-isel (SD-*) expect sshll followed by a single saddlv;
; the -global-isel runs (GI-*) expect sshll + sshll + saddw2 + addp.
; DOT expectations equal BASE.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Test: zero-extend the 2 i16 lanes of %x to i64, sum them with
; llvm.vector.reduce.add, then add the scalar accumulator %a.
; The expectations read the argument as v0.2s. Runs without -global-isel
; (SD-*) mask each 32-bit lane to 0xffff before widening; the -global-isel
; runs (GI-*) widen first and mask each 64-bit lane afterwards.
; DOT expectations equal BASE.
; The assertions below are autogenerated; regenerate rather than hand-edit.
define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-SD-BASE-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-SD-DOT-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x0000000000ffff
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x0000000000ffff
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends each lane of <2 x i16> %x to i64, sums the two lanes with
; llvm.vector.reduce.add.v2i64, and adds the scalar accumulator %a.
; All run configurations share one expected sequence: widen with ushll, then
; sign-extend in-register with a shl/sshr pair by 48 bits before the pairwise
; add.
define i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #48
; CHECK-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends each lane of <16 x i8> %x to i32, sums all sixteen lanes with
; llvm.vector.reduce.add.v16i32, and adds the scalar accumulator %a.
; Without +dotprod the two selectors expand the extension differently
; (SelectionDAG: ushll/ushll2 + uaddl/uaddl2; GlobalISel: ushll + uaddw2).
; With +dotprod both selectors share one expected sequence: a udot against a
; vector of ones accumulated into a zeroed register, followed by addv.
define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v1.16b, #1
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
; CHECK-DOT-NEXT: addv s0, v2.4s
; CHECK-DOT-NEXT: fmov w8, s0
; CHECK-DOT-NEXT: add w0, w8, w0
; CHECK-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v1.4s, v2.4s, v1.8h
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v3.4s, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends each lane of <16 x i8> %x to i32, sums all sixteen lanes with
; llvm.vector.reduce.add.v16i32, and adds the scalar accumulator %a.
; Without +dotprod the two selectors expand the extension differently
; (SelectionDAG: sshll/sshll2 + saddl/saddl2; GlobalISel: sshll + saddw2).
; With +dotprod both selectors share one expected sequence: an sdot against a
; vector of ones accumulated into a zeroed register, followed by addv.
define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v1.16b, #1
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
; CHECK-DOT-NEXT: addv s0, v2.4s
; CHECK-DOT-NEXT: fmov w8, s0
; CHECK-DOT-NEXT: add w0, w8, w0
; CHECK-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v1.4s, v2.4s, v1.8h
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v3.4s, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends each lane of <8 x i8> %x to i32, sums the eight lanes with
; llvm.vector.reduce.add.v8i32, and adds the scalar accumulator %a.
; Without +dotprod, SelectionDAG is expected to emit ushll + uaddlv and
; GlobalISel ushll + uaddw2 + addv. With +dotprod both selectors use the 64-bit
; udot form followed by addp; their expected sequences differ only in the order
; of the two movi setups and the registers assigned to them.
define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: udot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends each lane of <8 x i8> %x to i32, sums the eight lanes with
; llvm.vector.reduce.add.v8i32, and adds the scalar accumulator %a.
; Without +dotprod, SelectionDAG is expected to emit sshll + saddlv and
; GlobalISel sshll + saddw2 + addv. With +dotprod both selectors use the 64-bit
; sdot form followed by addp; their expected sequences differ only in the order
; of the two movi setups and the registers assigned to them.
define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: sdot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends each lane of <4 x i8> %x to i32, sums the four lanes with
; llvm.vector.reduce.add.v4i32, and adds the scalar accumulator %a.
; SelectionDAG is expected to clear the upper byte of each 16-bit lane with bic
; before widening; GlobalISel widens first and then masks each 32-bit lane with
; an 0xff constant. Enabling +dotprod does not change the expected sequence for
; either selector.
define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x0000ff000000ff
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x0000ff000000ff
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends each lane of <4 x i8> %x to i32, sums the four lanes with
; llvm.vector.reduce.add.v4i32, and adds the scalar accumulator %a.
; All run configurations share one expected sequence: widen with ushll, then
; sign-extend in-register with a shl/sshr pair by 24 bits before addv.
define i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: shl v0.4s, v0.4s, #24
; CHECK-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends each lane of <16 x i8> %x to i16, sums all sixteen lanes with
; llvm.vector.reduce.add.v16i16, and adds the i16 accumulator %a; the result is
; returned zeroext, hence the trailing "and ... #0xffff" in every variant.
; SelectionDAG is expected to fold the extend and reduce into a single uaddlv;
; GlobalISel is expected to emit ushll + uaddw2 + addv and to fold a uxth into
; the scalar add. Enabling +dotprod does not change either expected sequence.
define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w8, w8, w0
; CHECK-SD-BASE-NEXT: and w0, w8, #0xffff
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlv h0, v0.16b
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w8, w8, w0
; CHECK-SD-DOT-NEXT: and w0, w8, #0xffff
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxth
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxth
; CHECK-GI-DOT-NEXT: and w0, w8, #0xffff
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Sign-extends each lane of <16 x i8> %x to i16, sums all sixteen lanes with
; llvm.vector.reduce.add.v16i16, and adds the i16 accumulator %a; the result is
; returned signext, hence the trailing sxth in every variant.
; SelectionDAG is expected to fold the extend and reduce into a single saddlv;
; GlobalISel is expected to emit sshll + saddw2 + addv and to fold a uxth into
; the scalar add. Enabling +dotprod does not change either expected sequence.
define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlv h0, v0.16b
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w8, w8, w0
; CHECK-SD-BASE-NEXT: sxth w0, w8
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlv h0, v0.16b
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w8, w8, w0
; CHECK-SD-DOT-NEXT: sxth w0, w8
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxth
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxth
; CHECK-GI-DOT-NEXT: sxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Zero-extends each lane of <8 x i8> %x to i16, sums the eight lanes with
; llvm.vector.reduce.add.v8i16, and adds the i16 accumulator %a; the result is
; returned zeroext, hence the trailing "and ... #0xffff" in every variant.
; Both selectors are expected to emit ushll + addv for the vector part; they
; differ only in the scalar add, where GlobalISel folds a uxth into the operand.
; Enabling +dotprod does not change either expected sequence.
define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w8, w8, w0
; CHECK-SD-BASE-NEXT: and w0, w8, #0xffff
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w8, w8, w0
; CHECK-SD-DOT-NEXT: and w0, w8, #0xffff
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxth
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxth
; CHECK-GI-DOT-NEXT: and w0, w8, #0xffff
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Sign-extends each lane of <8 x i8> %x to i16, sums the eight lanes with
; llvm.vector.reduce.add.v8i16, and adds the i16 accumulator %a; the result is
; returned signext, hence the trailing sxth in every variant.
; Both selectors are expected to emit sshll + addv for the vector part; they
; differ only in the scalar add, where GlobalISel folds a uxth into the operand.
; Enabling +dotprod does not change either expected sequence.
define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w8, w8, w0
; CHECK-SD-BASE-NEXT: sxth w0, w8
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w8, w8, w0
; CHECK-SD-DOT-NEXT: sxth w0, w8
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxth
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxth
; CHECK-GI-DOT-NEXT: sxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Sums the sixteen lanes of <16 x i8> %x with llvm.vector.reduce.add.v16i8 (no
; widening) and adds the i8 accumulator %a; the result is returned zeroext,
; hence the trailing "and ... #0xff" in every variant.
; Both selectors are expected to reduce with a single addv; they differ only in
; the scalar add, where GlobalISel folds a uxtb into the operand.
; Enabling +dotprod does not change either expected sequence.
define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i8_acc:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: addv b0, v0.16b
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w8, w8, w0
; CHECK-SD-BASE-NEXT: and w0, w8, #0xff
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i8_acc:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: addv b0, v0.16b
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w8, w8, w0
; CHECK-SD-DOT-NEXT: and w0, w8, #0xff
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i8_acc:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv b0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxtb
; CHECK-GI-BASE-NEXT: and w0, w8, #0xff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i8_acc:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv b0, v0.16b
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxtb
; CHECK-GI-DOT-NEXT: and w0, w8, #0xff
; CHECK-GI-DOT-NEXT: ret
entry:
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
%r = add i8 %z, %a
ret i8 %r
}
; Zero-extends each lane of <16 x i8> %x to i64, sums all sixteen lanes with
; llvm.vector.reduce.add.v16i64, and adds the scalar accumulator %a.
; Both selectors are expected to widen in stages (8 -> 16 -> 32 bits) and then
; combine into 64-bit lanes: SelectionDAG with uaddl/uaddl2 on pairs of 32-bit
; vectors, GlobalISel with ushll followed by uaddw2. Enabling +dotprod does not
; change either expected sequence.
define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v2.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-BASE-NEXT: uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-BASE-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll2 v2.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-DOT-NEXT: uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-DOT-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: ushll v5.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: ushll v7.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-BASE-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-DOT-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: ushll v5.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: ushll v7.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-DOT-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends each lane of <16 x i8> %x to i64, sums all sixteen lanes with
; llvm.vector.reduce.add.v16i64, and adds the scalar accumulator %a.
; Both selectors are expected to widen in stages (8 -> 16 -> 32 bits) and then
; combine into 64-bit lanes: SelectionDAG with saddl/saddl2 on pairs of 32-bit
; vectors, GlobalISel with sshll followed by saddw2. Enabling +dotprod does not
; change either expected sequence.
define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v2.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-BASE-NEXT: saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-BASE-NEXT: saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: sshll2 v2.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-DOT-NEXT: saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-DOT-NEXT: saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: sshll v5.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: sshll v7.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-BASE-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-DOT-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: sshll v5.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: sshll v7.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-DOT-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends each lane of <8 x i8> %x to i64, sums the eight lanes with
; llvm.vector.reduce.add.v8i64, and adds the scalar accumulator %a.
; Both selectors are expected to widen to 32-bit lanes with ushll/ushll2 and
; then combine into 64-bit lanes: SelectionDAG with uaddl/uaddl2, GlobalISel
; with ushll followed by uaddw2. Enabling +dotprod does not change either
; expected sequence.
define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends each lane of <8 x i8> %x to i64, sums the eight lanes with
; llvm.vector.reduce.add.v8i64, and adds the scalar accumulator %a.
; Both selectors are expected to widen to 32-bit lanes with sshll/sshll2 and
; then combine into 64-bit lanes: SelectionDAG with saddl/saddl2, GlobalISel
; with sshll followed by saddw2. Enabling +dotprod does not change either
; expected sequence.
define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends each lane of <4 x i8> %x to i64, sums the four lanes with
; llvm.vector.reduce.add.v4i64, and adds the scalar accumulator %a.
; SelectionDAG is expected to clear the upper byte of each 16-bit lane with
; bic, widen once with ushll, and finish with a widening uaddlv. GlobalISel is
; expected to widen all the way to 64-bit lanes, mask both halves with an 0xff
; constant, and then add and pairwise-reduce. Enabling +dotprod does not change
; either expected sequence.
define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-BASE-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-GI-BASE-NEXT: and v2.16b, v2.16b, v1.16b
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-DOT-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-GI-DOT-NEXT: and v2.16b, v2.16b, v1.16b
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends each lane of <4 x i8> %x to i64, sums the four lanes with
; llvm.vector.reduce.add.v4i64, and adds the scalar accumulator %a.
; Every variant is expected to widen to 64-bit lanes with ushll/ushll2, shift
; both halves left by 56, then sign-extend one half with sshr and fold the
; other half's sign-extension into the add with ssra. The SelectionDAG and
; GlobalISel sequences differ only in which half (low or high) is assigned to
; which register. Enabling +dotprod does not change either expected sequence.
define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: addp d0, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: addp d0, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extend <2 x i8> to <2 x i64>, add-reduce the two lanes, then add the
; scalar accumulator %a. The groups for runs without -global-isel (SD) expect
; the low byte of each 32-bit lane to be masked before widening with ushll;
; the -global-isel (GI) groups expect the widening first and a 0xff mask on
; each 64-bit lane afterwards. Enabling +dotprod does not change either form.
define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-BASE-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-DOT-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
; Widen both i8 lanes to i64 with zero fill.
%xx = zext <2 x i8> %x to <2 x i64>
; Horizontal sum of the two i64 lanes.
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; Fold the incoming scalar accumulator into the reduced value.
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extend <2 x i8> to <2 x i64>, add-reduce the two lanes, then add the
; scalar accumulator %a. A single check group shared by all RUN lines is used,
; so every configuration is expected to emit the same sequence: widen with
; ushll, sign-extend in-register with shl/sshr #56, then addp + scalar add.
define i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #56
; CHECK-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
; Widen both i8 lanes to i64, preserving the sign.
%xx = sext <2 x i8> %x to <2 x i64>
; Horizontal sum of the two i64 lanes.
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; Fold the incoming scalar accumulator into the reduced value.
%r = add i64 %z, %a
ret i64 %r
}
; Add-reduce a <2 x i64> vector with no extension and add the scalar
; accumulator %a. A single check group shared by all RUN lines expects a
; pairwise addp, a move to a general-purpose register, and a scalar add.
define i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
; Horizontal sum of the two i64 lanes.
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
; Fold the incoming scalar accumulator into the reduced value.
%r = add i64 %z, %a
ret i64 %r
}
; Sum of two independent <4 x i32> add-reductions. The groups for runs without
; -global-isel (SD) expect the two vectors to be added first so that a single
; addv suffices; the -global-isel (GI) groups expect each vector to be reduced
; separately with addv and the two scalars to be added afterwards.
define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i32_v4i32:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i32_v4i32:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i32_v4i32:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i32_v4i32:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Reduce each input vector on its own.
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
; Combine the two scalar sums.
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two add-reductions, each over a <4 x i32> input zero-extended to
; <4 x i64>. The groups for runs without -global-isel (SD) expect the pair to
; be merged with uaddlp + uadalp and finished with one addp; the -global-isel
; (GI) groups expect each input to be widened and summed separately
; (ushll + uaddw2 + addp) before a final scalar add.
define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-BASE-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-DOT-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
; Zero-extend and reduce the first input.
%xx = zext <4 x i32> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; Zero-extend and reduce the second input.
%yy = zext <4 x i32> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
; Combine the two scalar sums.
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions, each over a <4 x i32> input sign-extended to
; <4 x i64>. The groups for runs without -global-isel (SD) expect the pair to
; be merged with saddlp + sadalp and finished with one addp; the -global-isel
; (GI) groups expect each input to be widened and summed separately
; (sshll + saddw2 + addp) before a final scalar add.
define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlp v1.2d, v1.4s
; CHECK-SD-BASE-NEXT: sadalp v1.2d, v0.4s
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlp v1.2d, v1.4s
; CHECK-SD-DOT-NEXT: sadalp v1.2d, v0.4s
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: sshll v3.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v2.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: sshll v3.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
; Sign-extend and reduce the first input.
%xx = sext <4 x i32> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; Sign-extend and reduce the second input.
%yy = sext <4 x i32> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
; Combine the two scalar sums.
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions, each over a <2 x i32> input zero-extended to
; <2 x i64>. The groups for runs without -global-isel (SD) expect a single
; widening uaddl of both inputs followed by one addp; the -global-isel (GI)
; groups expect each input to be widened with ushll and reduced with its own
; addp before a final scalar add.
define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
; Zero-extend and reduce the first input.
%xx = zext <2 x i32> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; Zero-extend and reduce the second input.
%yy = zext <2 x i32> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
; Combine the two scalar sums.
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions, each over a <2 x i32> input sign-extended to
; <2 x i64>. The groups for runs without -global-isel (SD) expect a single
; widening saddl of both inputs followed by one addp; the -global-isel (GI)
; groups expect each input to be widened with sshll and reduced with its own
; addp before a final scalar add.
define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i32_v2i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i32_v2i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i32_v2i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i32_v2i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
; Sign-extend and reduce the first input.
%xx = sext <2 x i32> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; Sign-extend and reduce the second input.
%yy = sext <2 x i32> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
; Combine the two scalar sums.
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions, each over an <8 x i16> input zero-extended to
; <8 x i32>. The groups for runs without -global-isel (SD) expect the pair to
; be merged with uaddlp + uadalp and finished with one addv; the -global-isel
; (GI) groups expect each input to be widened and summed separately
; (ushll + uaddw2 + addv) before a final scalar add.
define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-DOT-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-DOT-NEXT: addv s0, v1.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v2.4s, v0.8h
; CHECK-GI-BASE-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.4s, v2.4s, v0.8h
; CHECK-GI-DOT-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Zero-extend and reduce the first input.
%xx = zext <8 x i16> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
; Zero-extend and reduce the second input.
%yy = zext <8 x i16> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
; Combine the two scalar sums.
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two add-reductions, each over an <8 x i16> input sign-extended to
; <8 x i32>. The groups for runs without -global-isel (SD) expect the pair to
; be merged with saddlp + sadalp and finished with one addv; the -global-isel
; (GI) groups expect each input to be widened and summed separately
; (sshll + saddw2 + addv) before a final scalar add.
define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: sadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlp v1.4s, v1.8h
; CHECK-SD-DOT-NEXT: sadalp v1.4s, v0.8h
; CHECK-SD-DOT-NEXT: addv s0, v1.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v2.4s, v0.8h
; CHECK-GI-BASE-NEXT: saddw2 v1.4s, v3.4s, v1.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.4s, v2.4s, v0.8h
; CHECK-GI-DOT-NEXT: saddw2 v1.4s, v3.4s, v1.8h
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Sign-extend and reduce the first input.
%xx = sext <8 x i16> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
; Sign-extend and reduce the second input.
%yy = sext <8 x i16> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
; Combine the two scalar sums.
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two add-reductions, each over a <4 x i16> input zero-extended to
; <4 x i32>. The groups for runs without -global-isel (SD) expect a single
; widening uaddl of both inputs followed by one addv; the -global-isel (GI)
; groups expect each input to be widened with ushll and reduced with its own
; addv before a final scalar add.
define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Zero-extend and reduce the first input.
%xx = zext <4 x i16> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
; Zero-extend and reduce the second input.
%yy = zext <4 x i16> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
; Combine the two scalar sums.
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two add-reductions, each over a <4 x i16> input sign-extended to
; <4 x i32>. The groups for runs without -global-isel (SD) expect a single
; widening saddl of both inputs followed by one addv; the -global-isel (GI)
; groups expect each input to be widened with sshll and reduced with its own
; addv before a final scalar add.
define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Sign-extend and reduce the first input.
%xx = sext <4 x i16> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
; Sign-extend and reduce the second input.
%yy = sext <4 x i16> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
; Combine the two scalar sums.
%z = add i32 %z1, %z2
ret i32 %z
}
; Unsigned dot-product pattern on <8 x i8>: both inputs are zero-extended to
; <8 x i32>, multiplied lane-wise, and add-reduced. The two check groups are
; shared by the runs with and without -global-isel. Without +dotprod (BASE)
; the expected code widens with ushll and multiplies with umull/umlal2 before
; addv; with +dotprod (DOT) it is a udot into a zeroed accumulator followed by
; a pairwise addp.
define i32 @test_udot_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-BASE-LABEL: test_udot_v8i8:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-BASE-NEXT: umull v2.4s, v1.4h, v0.4h
; CHECK-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
; CHECK-BASE-NEXT: addv s0, v2.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: test_udot_v8i8:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: udot v2.2s, v1.8b, v0.8b
; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
entry:
; Widen both operands to i32 lanes with zero fill.
%0 = zext <8 x i8> %a to <8 x i32>
%1 = zext <8 x i8> %b to <8 x i32>
; Lane-wise product; operand order is %b * %a.
%2 = mul nuw nsw <8 x i32> %1, %0
; Horizontal sum of the eight products.
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
ret i32 %3
}
; Unsigned dot-product pattern on <16 x i8>: both inputs are zero-extended to
; <16 x i32>, multiplied lane-wise, and add-reduced. With +dotprod (DOT) a
; single group shared by both selectors expects one udot into a zeroed
; accumulator followed by addv. Without +dotprod the expectations are split
; per selector: both widen with ushll/ushll2 and use umull plus umlal-style
; accumulation, but the instruction mix and ordering differ between the run
; without -global-isel (SD) and the run with it (GI).
define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_udot_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: umull v4.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: umlal v4.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v4.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: test_udot_v16i8:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: udot v2.4s, v1.16b, v0.16b
; CHECK-DOT-NEXT: addv s0, v2.4s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_udot_v16i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: umull v4.4s, v3.4h, v2.4h
; CHECK-GI-BASE-NEXT: umull v5.4s, v1.4h, v0.4h
; CHECK-GI-BASE-NEXT: umlal2 v4.4s, v3.8h, v2.8h
; CHECK-GI-BASE-NEXT: umlal2 v5.4s, v1.8h, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v4.4s, v5.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
entry:
; Widen both operands to i32 lanes with zero fill.
%0 = zext <16 x i8> %a to <16 x i32>
%1 = zext <16 x i8> %b to <16 x i32>
; Lane-wise product; operand order is %b * %a.
%2 = mul nuw nsw <16 x i32> %1, %0
; Horizontal sum of the sixteen products.
%3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
ret i32 %3
}
; Unsigned dot-product pattern on <24 x i8> vectors loaded from %p1 and %p2:
; both are zero-extended to <24 x i32>, multiplied lane-wise, and add-reduced.
; 24 lanes do not fill a whole number of 128-bit registers, so the expected
; code handles a 16-byte part and an 8-byte part.
;  - Without +dotprod (BASE, shared by both selectors): ldr q + ldr d loads,
;    ushll/ushll2 widening, umull/umlal accumulation, then add + addv.
;  - With +dotprod and without -global-isel (SD-DOT): the same ldr q + ldr d
;    loads feed one .4s udot and one .2s udot; each accumulator is reduced on
;    its own (addv / addp) and the two scalars are added.
;  - With +dotprod and -global-isel (GI-DOT): the inputs are assembled one
;    byte at a time (ldr b + lane inserts), the upper half of the 8-byte part
;    is filled from the zeroed v0, and two .4s udot results are added before a
;    single addv.
define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
; CHECK-BASE-LABEL: test_udot_v24i8:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ldr q0, [x0]
; CHECK-BASE-NEXT: ldr q1, [x1]
; CHECK-BASE-NEXT: ldr d4, [x0, #16]
; CHECK-BASE-NEXT: ldr d5, [x1, #16]
; CHECK-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-BASE-NEXT: umull v6.4s, v3.4h, v2.4h
; CHECK-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h
; CHECK-BASE-NEXT: ushll v3.8h, v4.8b, #0
; CHECK-BASE-NEXT: ushll v4.8h, v5.8b, #0
; CHECK-BASE-NEXT: umlal2 v2.4s, v4.8h, v3.8h
; CHECK-BASE-NEXT: umlal v6.4s, v4.4h, v3.4h
; CHECK-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
; CHECK-BASE-NEXT: umlal v6.4s, v1.4h, v0.4h
; CHECK-BASE-NEXT: add v0.4s, v6.4s, v2.4s
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v24i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q2, [x0]
; CHECK-SD-DOT-NEXT: ldr q3, [x1]
; CHECK-SD-DOT-NEXT: ldr d4, [x0, #16]
; CHECK-SD-DOT-NEXT: ldr d5, [x1, #16]
; CHECK-SD-DOT-NEXT: udot v1.2s, v5.8b, v4.8b
; CHECK-SD-DOT-NEXT: udot v0.4s, v3.16b, v2.16b
; CHECK-SD-DOT-NEXT: addp v1.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w8, s1
; CHECK-SD-DOT-NEXT: fmov w9, s0
; CHECK-SD-DOT-NEXT: add w0, w9, w8
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v24i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ldr b1, [x0]
; CHECK-GI-DOT-NEXT: ldr b5, [x0, #1]
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr b3, [x0, #8]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #9]
; CHECK-GI-DOT-NEXT: ldr b2, [x0, #16]
; CHECK-GI-DOT-NEXT: ldr b4, [x1]
; CHECK-GI-DOT-NEXT: mov v1.b[1], v5.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #1]
; CHECK-GI-DOT-NEXT: ldr b6, [x1, #8]
; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #9]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #17]
; CHECK-GI-DOT-NEXT: mov v4.b[1], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b5, [x1, #16]
; CHECK-GI-DOT-NEXT: ldr b19, [x1, #17]
; CHECK-GI-DOT-NEXT: mov v6.b[1], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #2]
; CHECK-GI-DOT-NEXT: mov v2.b[1], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #10]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #2]
; CHECK-GI-DOT-NEXT: mov v5.b[1], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #10]
; CHECK-GI-DOT-NEXT: mov v1.b[2], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #18]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #18]
; CHECK-GI-DOT-NEXT: mov v3.b[2], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[2], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[2], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #3]
; CHECK-GI-DOT-NEXT: mov v2.b[2], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #3]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #11]
; CHECK-GI-DOT-NEXT: mov v5.b[2], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #11]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #19]
; CHECK-GI-DOT-NEXT: mov v1.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #19]
; CHECK-GI-DOT-NEXT: mov v3.b[3], v7.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[3], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[3], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #4]
; CHECK-GI-DOT-NEXT: mov v2.b[3], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #4]
; CHECK-GI-DOT-NEXT: mov v5.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #12]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #12]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #20]
; CHECK-GI-DOT-NEXT: mov v1.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #20]
; CHECK-GI-DOT-NEXT: mov v3.b[4], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[4], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[4], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #5]
; CHECK-GI-DOT-NEXT: mov v2.b[4], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #5]
; CHECK-GI-DOT-NEXT: mov v5.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #13]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #13]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #21]
; CHECK-GI-DOT-NEXT: mov v1.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #21]
; CHECK-GI-DOT-NEXT: mov v3.b[5], v7.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[5], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[5], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #6]
; CHECK-GI-DOT-NEXT: mov v2.b[5], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #6]
; CHECK-GI-DOT-NEXT: mov v5.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #14]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #14]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #22]
; CHECK-GI-DOT-NEXT: mov v1.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #22]
; CHECK-GI-DOT-NEXT: mov v3.b[6], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[6], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[6], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #7]
; CHECK-GI-DOT-NEXT: mov v2.b[6], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #7]
; CHECK-GI-DOT-NEXT: mov v5.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #15]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #15]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #23]
; CHECK-GI-DOT-NEXT: mov v1.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #23]
; CHECK-GI-DOT-NEXT: mov v3.b[7], v7.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[7], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[7], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v2.b[7], v19.b[0]
; CHECK-GI-DOT-NEXT: mov v5.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.d[1], v6.d[0]
; CHECK-GI-DOT-NEXT: mov v1.d[1], v3.d[0]
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v2.d[1], v0.d[0]
; CHECK-GI-DOT-NEXT: mov v5.d[1], v0.d[0]
; CHECK-GI-DOT-NEXT: udot v3.4s, v4.16b, v1.16b
; CHECK-GI-DOT-NEXT: udot v0.4s, v5.16b, v2.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v3.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
; Load the two 24-byte operands from memory.
%a = load <24 x i8>, ptr %p1
%b = load <24 x i8>, ptr %p2
; Widen both operands to i32 lanes with zero fill.
%0 = zext <24 x i8> %a to <24 x i32>
%1 = zext <24 x i8> %b to <24 x i32>
; Lane-wise product; operand order is %b * %a.
%2 = mul nuw nsw <24 x i32> %1, %0
; Horizontal sum of the twenty-four products.
%3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2)
ret i32 %3
}
define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
; CHECK-BASE-LABEL: test_udot_v48i8:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ldp q0, q4, [x1]
; CHECK-BASE-NEXT: ldr q2, [x0, #32]
; CHECK-BASE-NEXT: ldp q1, q3, [x0]
; CHECK-BASE-NEXT: ldr q7, [x1, #32]
; CHECK-BASE-NEXT: ushll2 v16.8h, v2.16b, #0
; CHECK-BASE-NEXT: ushll2 v6.8h, v0.16b, #0
; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: ushll2 v17.8h, v7.16b, #0
; CHECK-BASE-NEXT: ushll2 v5.8h, v1.16b, #0
; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-BASE-NEXT: umull2 v18.4s, v6.8h, v5.8h
; CHECK-BASE-NEXT: umull v19.4s, v0.4h, v1.4h
; CHECK-BASE-NEXT: umull v5.4s, v6.4h, v5.4h
; CHECK-BASE-NEXT: umull2 v0.4s, v0.8h, v1.8h
; CHECK-BASE-NEXT: ushll v1.8h, v2.8b, #0
; CHECK-BASE-NEXT: ushll v2.8h, v7.8b, #0
; CHECK-BASE-NEXT: ushll2 v6.8h, v3.16b, #0
; CHECK-BASE-NEXT: ushll2 v7.8h, v4.16b, #0
; CHECK-BASE-NEXT: umlal2 v18.4s, v17.8h, v16.8h
; CHECK-BASE-NEXT: umlal v5.4s, v17.4h, v16.4h
; CHECK-BASE-NEXT: umlal v19.4s, v2.4h, v1.4h
; CHECK-BASE-NEXT: umlal2 v0.4s, v2.8h, v1.8h
; CHECK-BASE-NEXT: ushll v1.8h, v3.8b, #0
; CHECK-BASE-NEXT: ushll v2.8h, v4.8b, #0
; CHECK-BASE-NEXT: umlal2 v18.4s, v7.8h, v6.8h
; CHECK-BASE-NEXT: umlal v5.4s, v7.4h, v6.4h
; CHECK-BASE-NEXT: umlal v19.4s, v2.4h, v1.4h
; CHECK-BASE-NEXT: umlal2 v0.4s, v2.8h, v1.8h
; CHECK-BASE-NEXT: add v1.4s, v19.4s, v5.4s
; CHECK-BASE-NEXT: add v0.4s, v0.4s, v18.4s
; CHECK-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v48i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q1, [x0, #32]
; CHECK-SD-DOT-NEXT: ldr q2, [x1, #32]
; CHECK-SD-DOT-NEXT: udot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: ldp q3, q1, [x0]
; CHECK-SD-DOT-NEXT: ldp q4, q2, [x1]
; CHECK-SD-DOT-NEXT: udot v0.4s, v4.16b, v3.16b
; CHECK-SD-DOT-NEXT: udot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v48i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ldr b1, [x0]
; CHECK-GI-DOT-NEXT: ldr b5, [x0, #1]
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr b2, [x0, #16]
; CHECK-GI-DOT-NEXT: ldr b6, [x0, #17]
; CHECK-GI-DOT-NEXT: ldr b4, [x1]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #1]
; CHECK-GI-DOT-NEXT: mov v1.b[1], v5.b[0]
; CHECK-GI-DOT-NEXT: ldr b5, [x1, #16]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #17]
; CHECK-GI-DOT-NEXT: mov v2.b[1], v6.b[0]
; CHECK-GI-DOT-NEXT: ldr b3, [x0, #32]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #33]
; CHECK-GI-DOT-NEXT: mov v4.b[1], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b6, [x1, #32]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #33]
; CHECK-GI-DOT-NEXT: mov v5.b[1], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #2]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #18]
; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #2]
; CHECK-GI-DOT-NEXT: mov v6.b[1], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[2], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #18]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #34]
; CHECK-GI-DOT-NEXT: mov v2.b[2], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #34]
; CHECK-GI-DOT-NEXT: mov v4.b[2], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #3]
; CHECK-GI-DOT-NEXT: mov v5.b[2], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #19]
; CHECK-GI-DOT-NEXT: mov v3.b[2], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #19]
; CHECK-GI-DOT-NEXT: mov v6.b[2], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[3], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #3]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #35]
; CHECK-GI-DOT-NEXT: mov v2.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #35]
; CHECK-GI-DOT-NEXT: mov v4.b[3], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #4]
; CHECK-GI-DOT-NEXT: mov v5.b[3], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #20]
; CHECK-GI-DOT-NEXT: mov v3.b[3], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #36]
; CHECK-GI-DOT-NEXT: mov v6.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #4]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #20]
; CHECK-GI-DOT-NEXT: mov v2.b[4], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #36]
; CHECK-GI-DOT-NEXT: mov v4.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #5]
; CHECK-GI-DOT-NEXT: mov v5.b[4], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #21]
; CHECK-GI-DOT-NEXT: mov v3.b[4], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[4], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[5], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #5]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #21]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #37]
; CHECK-GI-DOT-NEXT: mov v2.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #37]
; CHECK-GI-DOT-NEXT: mov v4.b[5], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #6]
; CHECK-GI-DOT-NEXT: mov v5.b[5], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #22]
; CHECK-GI-DOT-NEXT: mov v3.b[5], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #6]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #22]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #38]
; CHECK-GI-DOT-NEXT: mov v2.b[6], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #38]
; CHECK-GI-DOT-NEXT: mov v4.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #7]
; CHECK-GI-DOT-NEXT: mov v5.b[6], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #23]
; CHECK-GI-DOT-NEXT: mov v3.b[6], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[6], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[7], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #7]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #23]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #39]
; CHECK-GI-DOT-NEXT: mov v2.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #39]
; CHECK-GI-DOT-NEXT: mov v4.b[7], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #8]
; CHECK-GI-DOT-NEXT: mov v5.b[7], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #24]
; CHECK-GI-DOT-NEXT: mov v3.b[7], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[8], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #8]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #24]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #40]
; CHECK-GI-DOT-NEXT: mov v2.b[8], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #40]
; CHECK-GI-DOT-NEXT: mov v4.b[8], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #9]
; CHECK-GI-DOT-NEXT: mov v5.b[8], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #25]
; CHECK-GI-DOT-NEXT: mov v3.b[8], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[8], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[9], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #9]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #25]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #41]
; CHECK-GI-DOT-NEXT: mov v2.b[9], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #41]
; CHECK-GI-DOT-NEXT: mov v4.b[9], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #10]
; CHECK-GI-DOT-NEXT: mov v5.b[9], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #26]
; CHECK-GI-DOT-NEXT: mov v3.b[9], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[9], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[10], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #10]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #26]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #42]
; CHECK-GI-DOT-NEXT: mov v2.b[10], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #42]
; CHECK-GI-DOT-NEXT: mov v4.b[10], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #11]
; CHECK-GI-DOT-NEXT: mov v5.b[10], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #27]
; CHECK-GI-DOT-NEXT: mov v3.b[10], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[10], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[11], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #11]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #27]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #43]
; CHECK-GI-DOT-NEXT: mov v2.b[11], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #43]
; CHECK-GI-DOT-NEXT: mov v4.b[11], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #12]
; CHECK-GI-DOT-NEXT: mov v5.b[11], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #28]
; CHECK-GI-DOT-NEXT: mov v3.b[11], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[11], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[12], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #12]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #28]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #44]
; CHECK-GI-DOT-NEXT: mov v2.b[12], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #44]
; CHECK-GI-DOT-NEXT: mov v4.b[12], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #13]
; CHECK-GI-DOT-NEXT: mov v5.b[12], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #29]
; CHECK-GI-DOT-NEXT: mov v3.b[12], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[12], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[13], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #13]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #29]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #45]
; CHECK-GI-DOT-NEXT: mov v2.b[13], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #45]
; CHECK-GI-DOT-NEXT: mov v4.b[13], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #14]
; CHECK-GI-DOT-NEXT: mov v5.b[13], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #30]
; CHECK-GI-DOT-NEXT: mov v3.b[13], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[13], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[14], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #14]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #30]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #46]
; CHECK-GI-DOT-NEXT: mov v2.b[14], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #46]
; CHECK-GI-DOT-NEXT: mov v4.b[14], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #15]
; CHECK-GI-DOT-NEXT: mov v5.b[14], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #31]
; CHECK-GI-DOT-NEXT: mov v3.b[14], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[14], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[15], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #15]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #31]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #47]
; CHECK-GI-DOT-NEXT: mov v2.b[15], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #47]
; CHECK-GI-DOT-NEXT: mov v4.b[15], v7.b[0]
; CHECK-GI-DOT-NEXT: movi v7.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v5.b[15], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v3.b[15], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[15], v16.b[0]
; CHECK-GI-DOT-NEXT: movi v16.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v0.4s, v4.16b, v1.16b
; CHECK-GI-DOT-NEXT: udot v7.4s, v5.16b, v2.16b
; CHECK-GI-DOT-NEXT: udot v16.4s, v6.16b, v3.16b
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v7.4s
; CHECK-GI-DOT-NEXT: addv s2, v16.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: fmov w10, s2
; CHECK-GI-DOT-NEXT: add w8, w8, w9
; CHECK-GI-DOT-NEXT: add w0, w8, w10
; CHECK-GI-DOT-NEXT: ret
entry:
%a = load <48 x i8>, ptr %p1
%b = load <48 x i8>, ptr %p2
%0 = zext <48 x i8> %a to <48 x i32>
%1 = zext <48 x i8> %b to <48 x i32>
%2 = mul nuw nsw <48 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over two <8 x i8> register arguments: both
; operands are sign-extended to <8 x i32>, multiplied element-wise and summed
; with llvm.vector.reduce.add.
; Expected code:
;   - without +dotprod: sshll widening, smull/smlal2 multiply-accumulate, addv.
;   - with +dotprod:    one 64-bit sdot into a zeroed v2.2s accumulator, then
;                       addp to fold the two lanes.
; Both instruction selectors share the same expectations for this function.
; NOTE(review): `nuw` on a multiply of sign-extended operands makes the result
; poison for some negative inputs (see LangRef); confirm the flags are
; intentional and not just mirrored from the zext variant.
define i32 @test_sdot_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-BASE-LABEL: test_sdot_v8i8:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-BASE-NEXT: smull v2.4s, v1.4h, v0.4h
; CHECK-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
; CHECK-BASE-NEXT: addv s0, v2.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: test_sdot_v8i8:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: sdot v2.2s, v1.8b, v0.8b
; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
entry:
%0 = sext <8 x i8> %a to <8 x i32>
%1 = sext <8 x i8> %b to <8 x i32>
%2 = mul nuw nsw <8 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over two <16 x i8> register arguments
; (sext to <16 x i32>, element-wise mul, llvm.vector.reduce.add).
; Expected code:
;   - with +dotprod (both selectors): one 128-bit sdot into a zeroed v2.4s
;     accumulator followed by addv.
;   - without +dotprod: the default run and the -global-isel run are listed
;     separately because they pick different widening multiply sequences
;     (smull/smull2/smlal2/smlal vs. smull/smull/smlal2/smlal2) and a
;     different sshll/sshll2 ordering; both end in add + addv.
define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_sdot_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: smull v4.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: smlal v4.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v4.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: test_sdot_v16i8:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: sdot v2.4s, v1.16b, v0.16b
; CHECK-DOT-NEXT: addv s0, v2.4s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_sdot_v16i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: smull v4.4s, v3.4h, v2.4h
; CHECK-GI-BASE-NEXT: smull v5.4s, v1.4h, v0.4h
; CHECK-GI-BASE-NEXT: smlal2 v4.4s, v3.8h, v2.8h
; CHECK-GI-BASE-NEXT: smlal2 v5.4s, v1.8h, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v4.4s, v5.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
entry:
%0 = sext <16 x i8> %a to <16 x i32>
%1 = sext <16 x i8> %b to <16 x i32>
%2 = mul nuw nsw <16 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over two <24 x i8> vectors loaded from %p1 and
; %p2 (sext to <24 x i32>, element-wise mul, llvm.vector.reduce.add). 24 bytes
; is not a multiple of the 128-bit vector size, so this covers the 16 + 8 split.
; Expected code:
;   - without +dotprod (both selectors): ldr q + ldr d loads, sshll/sshll2
;     widening, smull/smlal chains, add + addv.
;   - default run with +dotprod: a 128-bit sdot over bytes 0-15 and a 64-bit
;     sdot over bytes 16-23, reduced separately (addv / addp) and summed in
;     general-purpose registers.
;   - -global-isel run with +dotprod: the operands are assembled one byte at a
;     time (ldr b + lane inserts) into three 8-byte pieces each; bytes 0-7 and
;     8-15 are joined into one 128-bit vector, bytes 16-23 get a zeroed upper
;     half, and two 128-bit sdot results are added before a single addv.
define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
; CHECK-BASE-LABEL: test_sdot_v24i8:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ldr q0, [x0]
; CHECK-BASE-NEXT: ldr q1, [x1]
; CHECK-BASE-NEXT: ldr d4, [x0, #16]
; CHECK-BASE-NEXT: ldr d5, [x1, #16]
; CHECK-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-BASE-NEXT: smull v6.4s, v3.4h, v2.4h
; CHECK-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h
; CHECK-BASE-NEXT: sshll v3.8h, v4.8b, #0
; CHECK-BASE-NEXT: sshll v4.8h, v5.8b, #0
; CHECK-BASE-NEXT: smlal2 v2.4s, v4.8h, v3.8h
; CHECK-BASE-NEXT: smlal v6.4s, v4.4h, v3.4h
; CHECK-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
; CHECK-BASE-NEXT: smlal v6.4s, v1.4h, v0.4h
; CHECK-BASE-NEXT: add v0.4s, v6.4s, v2.4s
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_sdot_v24i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q2, [x0]
; CHECK-SD-DOT-NEXT: ldr q3, [x1]
; CHECK-SD-DOT-NEXT: ldr d4, [x0, #16]
; CHECK-SD-DOT-NEXT: ldr d5, [x1, #16]
; CHECK-SD-DOT-NEXT: sdot v1.2s, v5.8b, v4.8b
; CHECK-SD-DOT-NEXT: sdot v0.4s, v3.16b, v2.16b
; CHECK-SD-DOT-NEXT: addp v1.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w8, s1
; CHECK-SD-DOT-NEXT: fmov w9, s0
; CHECK-SD-DOT-NEXT: add w0, w9, w8
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ldr b1, [x0]
; CHECK-GI-DOT-NEXT: ldr b5, [x0, #1]
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr b3, [x0, #8]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #9]
; CHECK-GI-DOT-NEXT: ldr b2, [x0, #16]
; CHECK-GI-DOT-NEXT: ldr b4, [x1]
; CHECK-GI-DOT-NEXT: mov v1.b[1], v5.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #1]
; CHECK-GI-DOT-NEXT: ldr b6, [x1, #8]
; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #9]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #17]
; CHECK-GI-DOT-NEXT: mov v4.b[1], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b5, [x1, #16]
; CHECK-GI-DOT-NEXT: ldr b19, [x1, #17]
; CHECK-GI-DOT-NEXT: mov v6.b[1], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #2]
; CHECK-GI-DOT-NEXT: mov v2.b[1], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #10]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #2]
; CHECK-GI-DOT-NEXT: mov v5.b[1], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #10]
; CHECK-GI-DOT-NEXT: mov v1.b[2], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #18]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #18]
; CHECK-GI-DOT-NEXT: mov v3.b[2], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[2], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[2], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #3]
; CHECK-GI-DOT-NEXT: mov v2.b[2], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #3]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #11]
; CHECK-GI-DOT-NEXT: mov v5.b[2], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #11]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #19]
; CHECK-GI-DOT-NEXT: mov v1.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #19]
; CHECK-GI-DOT-NEXT: mov v3.b[3], v7.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[3], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[3], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #4]
; CHECK-GI-DOT-NEXT: mov v2.b[3], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #4]
; CHECK-GI-DOT-NEXT: mov v5.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #12]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #12]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #20]
; CHECK-GI-DOT-NEXT: mov v1.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #20]
; CHECK-GI-DOT-NEXT: mov v3.b[4], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[4], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[4], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #5]
; CHECK-GI-DOT-NEXT: mov v2.b[4], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #5]
; CHECK-GI-DOT-NEXT: mov v5.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #13]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #13]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #21]
; CHECK-GI-DOT-NEXT: mov v1.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #21]
; CHECK-GI-DOT-NEXT: mov v3.b[5], v7.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[5], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[5], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #6]
; CHECK-GI-DOT-NEXT: mov v2.b[5], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #6]
; CHECK-GI-DOT-NEXT: mov v5.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #14]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #14]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #22]
; CHECK-GI-DOT-NEXT: mov v1.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #22]
; CHECK-GI-DOT-NEXT: mov v3.b[6], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[6], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[6], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #7]
; CHECK-GI-DOT-NEXT: mov v2.b[6], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #7]
; CHECK-GI-DOT-NEXT: mov v5.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #15]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #15]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #23]
; CHECK-GI-DOT-NEXT: mov v1.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #23]
; CHECK-GI-DOT-NEXT: mov v3.b[7], v7.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[7], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[7], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v2.b[7], v19.b[0]
; CHECK-GI-DOT-NEXT: mov v5.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.d[1], v6.d[0]
; CHECK-GI-DOT-NEXT: mov v1.d[1], v3.d[0]
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v2.d[1], v0.d[0]
; CHECK-GI-DOT-NEXT: mov v5.d[1], v0.d[0]
; CHECK-GI-DOT-NEXT: sdot v3.4s, v4.16b, v1.16b
; CHECK-GI-DOT-NEXT: sdot v0.4s, v5.16b, v2.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v3.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%a = load <24 x i8>, ptr %p1
%b = load <24 x i8>, ptr %p2
%0 = sext <24 x i8> %a to <24 x i32>
%1 = sext <24 x i8> %b to <24 x i32>
%2 = mul nuw nsw <24 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over two <48 x i8> vectors loaded from %p1 and
; %p2 (sext to <48 x i32>, element-wise mul, llvm.vector.reduce.add). 48 bytes
; is exactly three 128-bit vectors per operand.
; Expected code:
;   - without +dotprod (both selectors): ldp/ldr q loads, sshll/sshll2
;     widening, smull/smlal chains over four accumulators, add tree + addv.
;   - default run with +dotprod: three sdot instructions chained into a single
;     v0.4s accumulator, then one addv.
;   - -global-isel run with +dotprod: each 16-byte operand vector is assembled
;     one byte at a time (ldr b + lane inserts); three sdot instructions use
;     three separate zeroed accumulators (v0, v7, v16), each reduced with its
;     own addv and summed in general-purpose registers.
define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
; CHECK-BASE-LABEL: test_sdot_v48i8:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ldp q0, q4, [x1]
; CHECK-BASE-NEXT: ldr q2, [x0, #32]
; CHECK-BASE-NEXT: ldp q1, q3, [x0]
; CHECK-BASE-NEXT: ldr q7, [x1, #32]
; CHECK-BASE-NEXT: sshll2 v16.8h, v2.16b, #0
; CHECK-BASE-NEXT: sshll2 v6.8h, v0.16b, #0
; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: sshll2 v17.8h, v7.16b, #0
; CHECK-BASE-NEXT: sshll2 v5.8h, v1.16b, #0
; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-BASE-NEXT: smull2 v18.4s, v6.8h, v5.8h
; CHECK-BASE-NEXT: smull v19.4s, v0.4h, v1.4h
; CHECK-BASE-NEXT: smull v5.4s, v6.4h, v5.4h
; CHECK-BASE-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-BASE-NEXT: sshll v1.8h, v2.8b, #0
; CHECK-BASE-NEXT: sshll v2.8h, v7.8b, #0
; CHECK-BASE-NEXT: sshll2 v6.8h, v3.16b, #0
; CHECK-BASE-NEXT: sshll2 v7.8h, v4.16b, #0
; CHECK-BASE-NEXT: smlal2 v18.4s, v17.8h, v16.8h
; CHECK-BASE-NEXT: smlal v5.4s, v17.4h, v16.4h
; CHECK-BASE-NEXT: smlal v19.4s, v2.4h, v1.4h
; CHECK-BASE-NEXT: smlal2 v0.4s, v2.8h, v1.8h
; CHECK-BASE-NEXT: sshll v1.8h, v3.8b, #0
; CHECK-BASE-NEXT: sshll v2.8h, v4.8b, #0
; CHECK-BASE-NEXT: smlal2 v18.4s, v7.8h, v6.8h
; CHECK-BASE-NEXT: smlal v5.4s, v7.4h, v6.4h
; CHECK-BASE-NEXT: smlal v19.4s, v2.4h, v1.4h
; CHECK-BASE-NEXT: smlal2 v0.4s, v2.8h, v1.8h
; CHECK-BASE-NEXT: add v1.4s, v19.4s, v5.4s
; CHECK-BASE-NEXT: add v0.4s, v0.4s, v18.4s
; CHECK-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_sdot_v48i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q1, [x0, #32]
; CHECK-SD-DOT-NEXT: ldr q2, [x1, #32]
; CHECK-SD-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: ldp q3, q1, [x0]
; CHECK-SD-DOT-NEXT: ldp q4, q2, [x1]
; CHECK-SD-DOT-NEXT: sdot v0.4s, v4.16b, v3.16b
; CHECK-SD-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ldr b1, [x0]
; CHECK-GI-DOT-NEXT: ldr b5, [x0, #1]
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr b2, [x0, #16]
; CHECK-GI-DOT-NEXT: ldr b6, [x0, #17]
; CHECK-GI-DOT-NEXT: ldr b4, [x1]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #1]
; CHECK-GI-DOT-NEXT: mov v1.b[1], v5.b[0]
; CHECK-GI-DOT-NEXT: ldr b5, [x1, #16]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #17]
; CHECK-GI-DOT-NEXT: mov v2.b[1], v6.b[0]
; CHECK-GI-DOT-NEXT: ldr b3, [x0, #32]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #33]
; CHECK-GI-DOT-NEXT: mov v4.b[1], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b6, [x1, #32]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #33]
; CHECK-GI-DOT-NEXT: mov v5.b[1], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #2]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #18]
; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #2]
; CHECK-GI-DOT-NEXT: mov v6.b[1], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[2], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #18]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #34]
; CHECK-GI-DOT-NEXT: mov v2.b[2], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #34]
; CHECK-GI-DOT-NEXT: mov v4.b[2], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #3]
; CHECK-GI-DOT-NEXT: mov v5.b[2], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #19]
; CHECK-GI-DOT-NEXT: mov v3.b[2], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #19]
; CHECK-GI-DOT-NEXT: mov v6.b[2], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[3], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #3]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #35]
; CHECK-GI-DOT-NEXT: mov v2.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #35]
; CHECK-GI-DOT-NEXT: mov v4.b[3], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #4]
; CHECK-GI-DOT-NEXT: mov v5.b[3], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #20]
; CHECK-GI-DOT-NEXT: mov v3.b[3], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #36]
; CHECK-GI-DOT-NEXT: mov v6.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #4]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #20]
; CHECK-GI-DOT-NEXT: mov v2.b[4], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #36]
; CHECK-GI-DOT-NEXT: mov v4.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #5]
; CHECK-GI-DOT-NEXT: mov v5.b[4], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #21]
; CHECK-GI-DOT-NEXT: mov v3.b[4], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[4], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[5], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #5]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #21]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #37]
; CHECK-GI-DOT-NEXT: mov v2.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #37]
; CHECK-GI-DOT-NEXT: mov v4.b[5], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #6]
; CHECK-GI-DOT-NEXT: mov v5.b[5], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #22]
; CHECK-GI-DOT-NEXT: mov v3.b[5], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #6]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #22]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #38]
; CHECK-GI-DOT-NEXT: mov v2.b[6], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #38]
; CHECK-GI-DOT-NEXT: mov v4.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #7]
; CHECK-GI-DOT-NEXT: mov v5.b[6], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #23]
; CHECK-GI-DOT-NEXT: mov v3.b[6], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[6], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[7], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #7]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #23]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #39]
; CHECK-GI-DOT-NEXT: mov v2.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #39]
; CHECK-GI-DOT-NEXT: mov v4.b[7], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #8]
; CHECK-GI-DOT-NEXT: mov v5.b[7], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #24]
; CHECK-GI-DOT-NEXT: mov v3.b[7], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[8], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #8]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #24]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #40]
; CHECK-GI-DOT-NEXT: mov v2.b[8], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #40]
; CHECK-GI-DOT-NEXT: mov v4.b[8], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #9]
; CHECK-GI-DOT-NEXT: mov v5.b[8], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #25]
; CHECK-GI-DOT-NEXT: mov v3.b[8], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[8], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[9], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #9]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #25]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #41]
; CHECK-GI-DOT-NEXT: mov v2.b[9], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #41]
; CHECK-GI-DOT-NEXT: mov v4.b[9], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #10]
; CHECK-GI-DOT-NEXT: mov v5.b[9], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #26]
; CHECK-GI-DOT-NEXT: mov v3.b[9], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[9], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[10], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #10]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #26]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #42]
; CHECK-GI-DOT-NEXT: mov v2.b[10], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #42]
; CHECK-GI-DOT-NEXT: mov v4.b[10], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #11]
; CHECK-GI-DOT-NEXT: mov v5.b[10], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #27]
; CHECK-GI-DOT-NEXT: mov v3.b[10], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[10], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[11], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #11]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #27]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #43]
; CHECK-GI-DOT-NEXT: mov v2.b[11], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #43]
; CHECK-GI-DOT-NEXT: mov v4.b[11], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #12]
; CHECK-GI-DOT-NEXT: mov v5.b[11], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #28]
; CHECK-GI-DOT-NEXT: mov v3.b[11], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[11], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[12], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #12]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #28]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #44]
; CHECK-GI-DOT-NEXT: mov v2.b[12], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #44]
; CHECK-GI-DOT-NEXT: mov v4.b[12], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #13]
; CHECK-GI-DOT-NEXT: mov v5.b[12], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #29]
; CHECK-GI-DOT-NEXT: mov v3.b[12], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[12], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[13], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #13]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #29]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #45]
; CHECK-GI-DOT-NEXT: mov v2.b[13], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #45]
; CHECK-GI-DOT-NEXT: mov v4.b[13], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #14]
; CHECK-GI-DOT-NEXT: mov v5.b[13], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #30]
; CHECK-GI-DOT-NEXT: mov v3.b[13], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[13], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[14], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #14]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #30]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #46]
; CHECK-GI-DOT-NEXT: mov v2.b[14], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #46]
; CHECK-GI-DOT-NEXT: mov v4.b[14], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #15]
; CHECK-GI-DOT-NEXT: mov v5.b[14], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #31]
; CHECK-GI-DOT-NEXT: mov v3.b[14], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[14], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[15], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #15]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #31]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #47]
; CHECK-GI-DOT-NEXT: mov v2.b[15], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #47]
; CHECK-GI-DOT-NEXT: mov v4.b[15], v7.b[0]
; CHECK-GI-DOT-NEXT: movi v7.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v5.b[15], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v3.b[15], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[15], v16.b[0]
; CHECK-GI-DOT-NEXT: movi v16.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v0.4s, v4.16b, v1.16b
; CHECK-GI-DOT-NEXT: sdot v7.4s, v5.16b, v2.16b
; CHECK-GI-DOT-NEXT: sdot v16.4s, v6.16b, v3.16b
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v7.4s
; CHECK-GI-DOT-NEXT: addv s2, v16.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: fmov w10, s2
; CHECK-GI-DOT-NEXT: add w8, w8, w9
; CHECK-GI-DOT-NEXT: add w0, w8, w10
; CHECK-GI-DOT-NEXT: ret
entry:
%a = load <48 x i8>, ptr %p1
%b = load <48 x i8>, ptr %p2
%0 = sext <48 x i8> %a to <48 x i32>
%1 = sext <48 x i8> %b to <48 x i32>
%2 = mul nuw nsw <48 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2)
ret i32 %3
}
; Test to ensure that if G_MUL has more than 1 use, it should not be combined to UDOT
; The product %2 feeds both llvm.vector.reduce.add and an extractelement of
; lane 0, and the two results are added.
; Expected code:
;   - -global-isel run with +dotprod: no udot; the sequence matches the one
;     expected without +dotprod (ushll, umull/umlal2, addv, scalar add).
;   - default run with +dotprod: a udot is still emitted for the reduction and
;     the multiply is recomputed (ushll + umull) for the extracted lane.
define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) {
; CHECK-BASE-LABEL: test_udot_v8i8_multi_use:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-BASE-NEXT: umull v2.4s, v1.4h, v0.4h
; CHECK-BASE-NEXT: mov v3.16b, v2.16b
; CHECK-BASE-NEXT: fmov w8, s2
; CHECK-BASE-NEXT: umlal2 v3.4s, v1.8h, v0.8h
; CHECK-BASE-NEXT: addv s0, v3.4s
; CHECK-BASE-NEXT: fmov w9, s0
; CHECK-BASE-NEXT: add w0, w9, w8
; CHECK-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v0.8b
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-DOT-NEXT: umull v0.4s, v1.4h, v0.4h
; CHECK-SD-DOT-NEXT: addp v2.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w9, s0
; CHECK-SD-DOT-NEXT: fmov w8, s2
; CHECK-SD-DOT-NEXT: add w0, w8, w9
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v8i8_multi_use:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: umull v2.4s, v1.4h, v0.4h
; CHECK-GI-DOT-NEXT: mov v3.16b, v2.16b
; CHECK-GI-DOT-NEXT: fmov w8, s2
; CHECK-GI-DOT-NEXT: umlal2 v3.4s, v1.8h, v0.8h
; CHECK-GI-DOT-NEXT: addv s0, v3.4s
; CHECK-GI-DOT-NEXT: fmov w9, s0
; CHECK-GI-DOT-NEXT: add w0, w9, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%0 = zext <8 x i8> %a to <8 x i32>
%1 = zext <8 x i8> %b to <8 x i32>
%2 = mul nuw nsw <8 x i32> %1, %0
; First use of the product: the add reduction.
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
; Second use of the product: lane 0 is read directly.
%4 = extractelement <8 x i32> %2, i32 0
%5 = add nuw nsw i32 %3, %4
ret i32 %5
}
; Sum of two independent add reductions over <8 x i16> arguments, returned as
; a zero-extended i16.
; Expected code (identical with and without +dotprod for each selector):
;   - default runs: the two vectors are added first, so a single addv suffices.
;   - -global-isel runs: each vector is reduced with its own addv, the scalars
;     are added (with a uxth extend on one operand) and the result is masked
;     to 16 bits with `and ..., #0xffff`.
define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i16:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: add v0.8h, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i16:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: add v0.8h, v0.8h, v1.8h
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i16:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: addv h1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w9, w8, uxth
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i16:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: addv h1, v1.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w9, w8, uxth
; CHECK-GI-DOT-NEXT: and w0, w8, #0xffff
; CHECK-GI-DOT-NEXT: ret
entry:
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
%z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two add reductions where each <8 x i16> argument is first
; zero-extended to <8 x i64>.
; Expected code (identical with and without +dotprod for each selector):
;   - default runs: ushll/ushll2 to 32-bit lanes, uaddl/uaddl2 widening adds,
;     the partial sums of both inputs are combined as vectors and a single
;     addp produces the scalar.
;   - -global-isel runs: ushll/ushll2 to 32-bit lanes, ushll + uaddw2 to reach
;     64-bit lanes, each input is reduced with its own addp and the two
;     scalars are added in general-purpose registers.
define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-BASE-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-BASE-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-DOT-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-DOT-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: ushll v5.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: ushll v7.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-BASE-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: ushll v5.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: ushll v7.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-DOT-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = zext <8 x i16> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Test: sum of two add-reductions, each over an <8 x i16> input sign-extended
; to <8 x i64>.
; Per the assertions below, the default llc runs (SD prefixes) combine both
; widened inputs into one vector and reduce once (single addp), while the
; -global-isel runs (GI prefixes) reduce each input separately and join the two
; results with a scalar add. The +dotprod runs expect the same code as the
; base runs for this function.
define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v2.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-BASE-NEXT: saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-BASE-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll2 v2.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-DOT-NEXT: saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-DOT-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: sshll v5.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: sshll v7.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-BASE-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: sshll v5.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: sshll v7.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-DOT-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = sext <8 x i16> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Test: sum of two add-reductions, each over a <4 x i16> input zero-extended
; to <4 x i64>.
; Per the assertions below, the default llc runs (SD prefixes) use a pairwise
; widening add (uaddlp) for %y and accumulate %x into it (uadalp) before a
; single addp; the -global-isel runs (GI prefixes) reduce each input on its own
; and finish with a scalar add. The +dotprod runs expect the same code as the
; base runs for this function.
define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-BASE-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-DOT-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = zext <4 x i16> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Test: sum of two add-reductions, each over a <4 x i16> input sign-extended
; to <4 x i64>.
; Signed counterpart of the zext variant: per the assertions below, the
; default llc runs (SD prefixes) use saddlp for %y and accumulate %x with
; sadalp before a single addp; the -global-isel runs (GI prefixes) reduce each
; input on its own and finish with a scalar add. The +dotprod runs expect the
; same code as the base runs for this function.
define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddlp v1.2d, v1.4s
; CHECK-SD-BASE-NEXT: sadalp v1.2d, v0.4s
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddlp v1.2d, v1.4s
; CHECK-SD-DOT-NEXT: sadalp v1.2d, v0.4s
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll v2.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: sshll v3.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: sshll v2.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: sshll v3.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = sext <4 x i16> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Test: sum of two add-reductions, each over a <2 x i16> input zero-extended
; to <2 x i64>.
; In the expected code the inputs are read as .2s lanes and the zero-extension
; is realised by masking with a movi constant (and), not by a widening shift
; alone. The default llc runs (SD prefixes) mask first, then merge both inputs
; with uaddl and reduce once; the -global-isel runs (GI prefixes) widen, mask
; and reduce each input separately before a scalar add. The +dotprod runs
; expect the same code as the base runs for this function.
define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: movi d2, #0x00ffff0000ffff
; CHECK-SD-BASE-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-SD-BASE-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi d2, #0x00ffff0000ffff
; CHECK-SD-DOT-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v2.2d, #0x0000000000ffff
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-BASE-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0x0000000000ffff
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = zext <2 x i16> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Test: sum of two add-reductions, each over a <2 x i16> input sign-extended
; to <2 x i64>.
; In the expected code the sign-extension is realised as a widen (ushll)
; followed by a shl #48 / arithmetic-shift-right #48 pair. The default llc
; runs (SD prefixes) fold the second input's right shift into an accumulating
; ssra and reduce once; the -global-isel runs (GI prefixes) shift and reduce
; each input separately before a scalar add. The +dotprod runs expect the same
; code as the base runs for this function.
define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i16_v2i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-BASE-NEXT: shl v0.2d, v0.2d, #48
; CHECK-SD-BASE-NEXT: shl v1.2d, v1.2d, #48
; CHECK-SD-BASE-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-SD-BASE-NEXT: ssra v0.2d, v1.2d, #48
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i16_v2i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-DOT-NEXT: shl v0.2d, v0.2d, #48
; CHECK-SD-DOT-NEXT: shl v1.2d, v1.2d, #48
; CHECK-SD-DOT-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-SD-DOT-NEXT: ssra v0.2d, v1.2d, #48
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i16_v2i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: shl v0.2d, v0.2d, #48
; CHECK-GI-BASE-NEXT: shl v1.2d, v1.2d, #48
; CHECK-GI-BASE-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-GI-BASE-NEXT: sshr v1.2d, v1.2d, #48
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i16_v2i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: shl v0.2d, v0.2d, #48
; CHECK-GI-DOT-NEXT: shl v1.2d, v1.2d, #48
; CHECK-GI-DOT-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-GI-DOT-NEXT: sshr v1.2d, v1.2d, #48
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = sext <2 x i16> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Test: sum of two add-reductions, each over a <16 x i8> input zero-extended
; to <16 x i32>.
; This is a case where the +dotprod runs differ from the base runs: with
; +dotprod the expected code multiplies by an all-ones byte vector via udot
; instead of widening step by step. The default llc +dotprod run chains both
; udot instructions into one accumulator and reduces once; the -global-isel
; +dotprod run uses two accumulators, two addv reductions and a scalar add.
; The base runs widen with ushll/uaddl (SD) or ushll/uaddw2 (GI).
define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v0.8h, v2.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v2.4h
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v1.8h, v3.8h
; CHECK-SD-BASE-NEXT: uaddl v1.4s, v1.4h, v3.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT: udot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-GI-BASE-NEXT: ushll v5.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v6.4s, v3.4h, #0
; CHECK-GI-BASE-NEXT: ushll v7.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v2.4s, v4.4s, v2.8h
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v5.4s, v0.8h
; CHECK-GI-BASE-NEXT: uaddw2 v3.4s, v6.4s, v3.8h
; CHECK-GI-BASE-NEXT: uaddw2 v1.4s, v7.4s, v1.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-GI-BASE-NEXT: add v1.4s, v3.4s, v1.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.16b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addv s0, v4.4s
; CHECK-GI-DOT-NEXT: addv s1, v3.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%yy = zext <16 x i8> %y to <16 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Test: sum of two add-reductions, each over a <16 x i8> input sign-extended
; to <16 x i32>.
; Signed counterpart of the zext variant: with +dotprod the expected code uses
; sdot against an all-ones byte vector. The default llc +dotprod run chains
; both sdot instructions into one accumulator and reduces once; the
; -global-isel +dotprod run uses two accumulators, two addv reductions and a
; scalar add. The base runs widen with sshll/saddl (SD) or sshll/saddw2 (GI).
define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v0.8h, v2.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v2.4h
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v1.8h, v3.8h
; CHECK-SD-BASE-NEXT: saddl v1.4s, v1.4h, v3.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT: sdot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-GI-BASE-NEXT: sshll v5.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v6.4s, v3.4h, #0
; CHECK-GI-BASE-NEXT: sshll v7.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v2.4s, v4.4s, v2.8h
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v5.4s, v0.8h
; CHECK-GI-BASE-NEXT: saddw2 v3.4s, v6.4s, v3.8h
; CHECK-GI-BASE-NEXT: saddw2 v1.4s, v7.4s, v1.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-GI-BASE-NEXT: add v1.4s, v3.4s, v1.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.16b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addv s0, v4.4s
; CHECK-GI-DOT-NEXT: addv s1, v3.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%yy = sext <16 x i8> %y to <16 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Test: sum of two add-reductions, each over an <8 x i8> input zero-extended
; to <8 x i32>.
; With +dotprod the expected code uses the 64-bit udot form (.2s accumulator,
; .8b operands) against an all-ones byte vector and finishes with addp on .2s
; lanes. The default llc +dotprod run shares one accumulator for both inputs;
; the -global-isel +dotprod run keeps two accumulators and adds the scalar
; results. The base runs use ushll with uaddlp/uadalp (SD) or uaddw2 (GI).
define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v3.8b, #1
; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v3.8b
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v2.4s, v0.8h
; CHECK-GI-BASE-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v4.2s, v0.8b, v2.8b
; CHECK-GI-DOT-NEXT: udot v3.2s, v1.8b, v2.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v4.2s, v4.2s
; CHECK-GI-DOT-NEXT: addp v1.2s, v3.2s, v3.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = zext <8 x i8> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Test: sum of two add-reductions, each over an <8 x i8> input sign-extended
; to <8 x i32>.
; Signed counterpart of the zext variant: with +dotprod the expected code uses
; the 64-bit sdot form against an all-ones byte vector and finishes with addp
; on .2s lanes. The default llc +dotprod run shares one accumulator; the
; -global-isel +dotprod run keeps two accumulators and adds the scalar
; results. The base runs use sshll with saddlp/sadalp (SD) or saddw2 (GI).
define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: sadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v3.8b, #1
; CHECK-SD-DOT-NEXT: sdot v2.2s, v1.8b, v3.8b
; CHECK-SD-DOT-NEXT: sdot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v2.4s, v0.8h
; CHECK-GI-BASE-NEXT: saddw2 v1.4s, v3.4s, v1.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v4.2s, v0.8b, v2.8b
; CHECK-GI-DOT-NEXT: sdot v3.2s, v1.8b, v2.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v4.2s, v4.2s
; CHECK-GI-DOT-NEXT: addp v1.2s, v3.2s, v3.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = sext <8 x i8> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Test: sum of two add-reductions, each over a <4 x i8> input zero-extended
; to <4 x i32>.
; In the expected code the inputs are read as .4h lanes. The default llc runs
; (SD prefixes) clear the upper byte of each lane with bic, merge both inputs
; with uaddl and reduce once; the -global-isel runs (GI prefixes) widen, mask
; with a movi constant and reduce each input separately before a scalar add.
; No dot-product instruction is expected here: the +dotprod runs match the
; base runs.
define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v2.2d, #0x0000ff000000ff
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-BASE-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0x0000ff000000ff
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = zext <4 x i8> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Test: sum of two add-reductions, each over a <4 x i8> input sign-extended
; to <4 x i32>.
; In the expected code the sign-extension is realised as a widen (ushll)
; followed by a shl #24 / arithmetic-shift-right #24 pair. The default llc
; runs (SD prefixes) fold the second input's right shift into an accumulating
; ssra and reduce once; the -global-isel runs (GI prefixes) shift and reduce
; each input separately before a scalar add. No dot-product instruction is
; expected here: the +dotprod runs match the base runs.
define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: shl v0.4s, v0.4s, #24
; CHECK-SD-BASE-NEXT: shl v1.4s, v1.4s, #24
; CHECK-SD-BASE-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-SD-BASE-NEXT: ssra v0.4s, v1.4s, #24
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: shl v0.4s, v0.4s, #24
; CHECK-SD-DOT-NEXT: shl v1.4s, v1.4s, #24
; CHECK-SD-DOT-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-SD-DOT-NEXT: ssra v0.4s, v1.4s, #24
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: shl v0.4s, v0.4s, #24
; CHECK-GI-BASE-NEXT: shl v1.4s, v1.4s, #24
; CHECK-GI-BASE-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-GI-BASE-NEXT: sshr v1.4s, v1.4s, #24
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: shl v0.4s, v0.4s, #24
; CHECK-GI-DOT-NEXT: shl v1.4s, v1.4s, #24
; CHECK-GI-DOT-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-GI-DOT-NEXT: sshr v1.4s, v1.4s, #24
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = sext <4 x i8> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Test: sum of two add-reductions, each over a <16 x i8> input zero-extended
; to <16 x i16>; the i16 result is returned with the zeroext attribute.
; Per the assertions below, the default llc runs (SD prefixes) need no
; separate widening step: uaddlp on %y plus an accumulating uadalp on %x feed
; a single addv. The -global-isel runs (GI prefixes) reduce each input
; separately, add the scalars with a uxth-extended operand, and then mask the
; result to 16 bits. The +dotprod runs match the base runs.
define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlp v1.8h, v1.16b
; CHECK-SD-BASE-NEXT: uadalp v1.8h, v0.16b
; CHECK-SD-BASE-NEXT: addv h0, v1.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlp v1.8h, v1.16b
; CHECK-SD-DOT-NEXT: uadalp v1.8h, v0.16b
; CHECK-SD-DOT-NEXT: addv h0, v1.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.8h, v2.8h, v0.16b
; CHECK-GI-BASE-NEXT: uaddw2 v1.8h, v3.8h, v1.16b
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: addv h1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w9, w8, uxth
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.8h, v2.8h, v0.16b
; CHECK-GI-DOT-NEXT: uaddw2 v1.8h, v3.8h, v1.16b
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: addv h1, v1.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w9, w8, uxth
; CHECK-GI-DOT-NEXT: and w0, w8, #0xffff
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%yy = zext <16 x i8> %y to <16 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Test: sum of two add-reductions, each over a <16 x i8> input sign-extended
; to <16 x i16>; the i16 result is returned with the signext attribute.
; Per the assertions below, the default llc runs (SD prefixes) use saddlp on
; %y plus an accumulating sadalp on %x, a single addv, and smov to produce the
; sign-extended return value. The -global-isel runs (GI prefixes) reduce each
; input separately, add the scalars, and sign-extend with sxth. The +dotprod
; runs match the base runs.
define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlp v1.8h, v1.16b
; CHECK-SD-BASE-NEXT: sadalp v1.8h, v0.16b
; CHECK-SD-BASE-NEXT: addv h0, v1.8h
; CHECK-SD-BASE-NEXT: smov w0, v0.h[0]
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlp v1.8h, v1.16b
; CHECK-SD-DOT-NEXT: sadalp v1.8h, v0.16b
; CHECK-SD-DOT-NEXT: addv h0, v1.8h
; CHECK-SD-DOT-NEXT: smov w0, v0.h[0]
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.8h, v2.8h, v0.16b
; CHECK-GI-BASE-NEXT: saddw2 v1.8h, v3.8h, v1.16b
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: addv h1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w9, w8, uxth
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.8h, v2.8h, v0.16b
; CHECK-GI-DOT-NEXT: saddw2 v1.8h, v3.8h, v1.16b
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: addv h1, v1.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w9, w8, uxth
; CHECK-GI-DOT-NEXT: sxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%yy = sext <16 x i8> %y to <16 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Test: sum of two add-reductions, each over an <8 x i8> input zero-extended
; to <8 x i16>; the i16 result is returned with the zeroext attribute.
; Per the assertions below, the default llc runs (SD prefixes) merge both
; inputs with one widening uaddl and reduce once with addv. The -global-isel
; runs (GI prefixes) widen and reduce each input separately, add the scalars
; with a uxth-extended operand, and then mask the result to 16 bits. The
; +dotprod runs match the base runs.
define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: addv h1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w9, w8, uxth
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: addv h1, v1.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w9, w8, uxth
; CHECK-GI-DOT-NEXT: and w0, w8, #0xffff
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%yy = zext <8 x i8> %y to <8 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Test: sum of two add-reductions, each over an <8 x i8> input sign-extended
; to <8 x i16>; the i16 result is returned with the signext attribute.
; Per the assertions below, the default llc runs (SD prefixes) merge both
; inputs with one widening saddl, reduce once with addv, and use smov to
; produce the sign-extended return value. The -global-isel runs (GI prefixes)
; widen and reduce each input separately, add the scalars, and sign-extend
; with sxth. The +dotprod runs match the base runs.
define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: smov w0, v0.h[0]
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: smov w0, v0.h[0]
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: addv h1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w9, w8, uxth
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: addv h1, v1.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w9, w8, uxth
; CHECK-GI-DOT-NEXT: sxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%yy = sext <8 x i8> %y to <8 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Adds two independent i8 add-reductions of <16 x i8> arguments (no widening)
; and returns the i8 sum as a zeroext value.
; The runs without -global-isel (SD-* prefixes) are expected to add the two
; vectors first and then perform a single addv. The -global-isel runs
; (GI-* prefixes) are expected to perform two addv reductions, add the
; scalars and mask the result to 8 bits. The +dotprod expectations match the
; base ones here.
define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: add v0.16b, v0.16b, v1.16b
; CHECK-SD-BASE-NEXT: addv b0, v0.16b
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: add v0.16b, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv b0, v0.16b
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv b0, v0.16b
; CHECK-GI-BASE-NEXT: addv b1, v1.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w9, w8, uxtb
; CHECK-GI-BASE-NEXT: and w0, w8, #0xff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv b0, v0.16b
; CHECK-GI-DOT-NEXT: addv b1, v1.16b
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w9, w8, uxtb
; CHECK-GI-DOT-NEXT: and w0, w8, #0xff
; CHECK-GI-DOT-NEXT: ret
entry:
%z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
%z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y)
%z = add i8 %z1, %z2
ret i8 %z
}
; Adds two independent i64 add-reductions, each over a <16 x i8> argument that
; is zero-extended to <16 x i64>.
; The runs without -global-isel (SD-* prefixes) are expected to widen with
; ushll/ushll2 up to 32-bit lanes, reach 64-bit lanes with uaddl/uaddl2,
; combine both operands in the vector domain and finish with a single addp.
; The -global-isel runs (GI-* prefixes) are expected to widen with ushll plus
; uaddw2, reduce each operand with its own addp and add the two scalars.
; The +dotprod expectations match the base ones here.
define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-SD-BASE-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-SD-BASE-NEXT: ushll2 v5.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll2 v6.4s, v3.8h, #0
; CHECK-SD-BASE-NEXT: ushll2 v7.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: ushll v3.4s, v3.4h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-BASE-NEXT: uaddl v2.2d, v5.2s, v2.2s
; CHECK-SD-BASE-NEXT: uaddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v4.2s
; CHECK-SD-BASE-NEXT: uaddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-BASE-NEXT: uaddl v6.2d, v7.2s, v6.2s
; CHECK-SD-BASE-NEXT: uaddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-BASE-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-BASE-NEXT: add v3.2d, v5.2d, v16.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v2.2d, v7.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll2 v2.8h, v0.16b, #0
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll2 v3.8h, v1.16b, #0
; CHECK-SD-DOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-DOT-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-SD-DOT-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-SD-DOT-NEXT: ushll2 v5.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll2 v6.4s, v3.8h, #0
; CHECK-SD-DOT-NEXT: ushll2 v7.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: ushll v3.4s, v3.4h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-DOT-NEXT: uaddl v2.2d, v5.2s, v2.2s
; CHECK-SD-DOT-NEXT: uaddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v4.2s
; CHECK-SD-DOT-NEXT: uaddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-DOT-NEXT: uaddl v6.2d, v7.2s, v6.2s
; CHECK-SD-DOT-NEXT: uaddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-DOT-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-DOT-NEXT: add v3.2d, v5.2d, v16.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v2.2d, v7.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-GI-BASE-NEXT: ushll v5.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v6.4s, v3.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-GI-BASE-NEXT: ushll v7.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: ushll v16.2d, v4.2s, #0
; CHECK-GI-BASE-NEXT: ushll v17.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: ushll v18.2d, v5.2s, #0
; CHECK-GI-BASE-NEXT: ushll v19.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v20.2d, v6.2s, #0
; CHECK-GI-BASE-NEXT: ushll v21.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: ushll v22.2d, v7.2s, #0
; CHECK-GI-BASE-NEXT: ushll v23.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v4.2d, v16.2d, v4.4s
; CHECK-GI-BASE-NEXT: uaddw2 v2.2d, v17.2d, v2.4s
; CHECK-GI-BASE-NEXT: uaddw2 v5.2d, v18.2d, v5.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v19.2d, v0.4s
; CHECK-GI-BASE-NEXT: uaddw2 v6.2d, v20.2d, v6.4s
; CHECK-GI-BASE-NEXT: uaddw2 v3.2d, v21.2d, v3.4s
; CHECK-GI-BASE-NEXT: uaddw2 v7.2d, v22.2d, v7.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v23.2d, v1.4s
; CHECK-GI-BASE-NEXT: add v2.2d, v4.2d, v2.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v5.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v3.2d, v6.2d, v3.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v7.2d, v1.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-DOT-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-GI-DOT-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-GI-DOT-NEXT: ushll v5.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v6.4s, v3.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-GI-DOT-NEXT: ushll v7.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: ushll v16.2d, v4.2s, #0
; CHECK-GI-DOT-NEXT: ushll v17.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: ushll v18.2d, v5.2s, #0
; CHECK-GI-DOT-NEXT: ushll v19.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v20.2d, v6.2s, #0
; CHECK-GI-DOT-NEXT: ushll v21.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: ushll v22.2d, v7.2s, #0
; CHECK-GI-DOT-NEXT: ushll v23.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v4.2d, v16.2d, v4.4s
; CHECK-GI-DOT-NEXT: uaddw2 v2.2d, v17.2d, v2.4s
; CHECK-GI-DOT-NEXT: uaddw2 v5.2d, v18.2d, v5.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v19.2d, v0.4s
; CHECK-GI-DOT-NEXT: uaddw2 v6.2d, v20.2d, v6.4s
; CHECK-GI-DOT-NEXT: uaddw2 v3.2d, v21.2d, v3.4s
; CHECK-GI-DOT-NEXT: uaddw2 v7.2d, v22.2d, v7.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v23.2d, v1.4s
; CHECK-GI-DOT-NEXT: add v2.2d, v4.2d, v2.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v5.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v3.2d, v6.2d, v3.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v7.2d, v1.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%yy = zext <16 x i8> %y to <16 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Adds two independent i64 add-reductions, each over a <16 x i8> argument that
; is sign-extended to <16 x i64>.
; The runs without -global-isel (SD-* prefixes) are expected to widen with
; sshll/sshll2 up to 32-bit lanes, reach 64-bit lanes with saddl/saddl2,
; combine both operands in the vector domain and finish with a single addp.
; The -global-isel runs (GI-* prefixes) are expected to widen with sshll plus
; saddw2, reduce each operand with its own addp and add the two scalars.
; The +dotprod expectations match the base ones here.
define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-SD-BASE-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-SD-BASE-NEXT: sshll2 v5.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: sshll2 v6.4s, v3.8h, #0
; CHECK-SD-BASE-NEXT: sshll2 v7.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: sshll v3.4s, v3.4h, #0
; CHECK-SD-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-BASE-NEXT: saddl v2.2d, v5.2s, v2.2s
; CHECK-SD-BASE-NEXT: saddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v4.2s
; CHECK-SD-BASE-NEXT: saddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-BASE-NEXT: saddl v6.2d, v7.2s, v6.2s
; CHECK-SD-BASE-NEXT: saddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-BASE-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-BASE-NEXT: add v3.2d, v5.2d, v16.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v2.2d, v7.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll2 v2.8h, v0.16b, #0
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: sshll2 v3.8h, v1.16b, #0
; CHECK-SD-DOT-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-DOT-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-SD-DOT-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-SD-DOT-NEXT: sshll2 v5.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: sshll2 v6.4s, v3.8h, #0
; CHECK-SD-DOT-NEXT: sshll2 v7.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: sshll v3.4s, v3.4h, #0
; CHECK-SD-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-DOT-NEXT: saddl v2.2d, v5.2s, v2.2s
; CHECK-SD-DOT-NEXT: saddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v4.2s
; CHECK-SD-DOT-NEXT: saddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-DOT-NEXT: saddl v6.2d, v7.2s, v6.2s
; CHECK-SD-DOT-NEXT: saddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-DOT-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-DOT-NEXT: add v3.2d, v5.2d, v16.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v2.2d, v7.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-GI-BASE-NEXT: sshll v5.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v6.4s, v3.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v3.4s, v3.8h, #0
; CHECK-GI-BASE-NEXT: sshll v7.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: sshll v16.2d, v4.2s, #0
; CHECK-GI-BASE-NEXT: sshll v17.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: sshll v18.2d, v5.2s, #0
; CHECK-GI-BASE-NEXT: sshll v19.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: sshll v20.2d, v6.2s, #0
; CHECK-GI-BASE-NEXT: sshll v21.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: sshll v22.2d, v7.2s, #0
; CHECK-GI-BASE-NEXT: sshll v23.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v4.2d, v16.2d, v4.4s
; CHECK-GI-BASE-NEXT: saddw2 v2.2d, v17.2d, v2.4s
; CHECK-GI-BASE-NEXT: saddw2 v5.2d, v18.2d, v5.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v19.2d, v0.4s
; CHECK-GI-BASE-NEXT: saddw2 v6.2d, v20.2d, v6.4s
; CHECK-GI-BASE-NEXT: saddw2 v3.2d, v21.2d, v3.4s
; CHECK-GI-BASE-NEXT: saddw2 v7.2d, v22.2d, v7.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v23.2d, v1.4s
; CHECK-GI-BASE-NEXT: add v2.2d, v4.2d, v2.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v5.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v3.2d, v6.2d, v3.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v7.2d, v1.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-DOT-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-GI-DOT-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-GI-DOT-NEXT: sshll v5.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v6.4s, v3.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v3.4s, v3.8h, #0
; CHECK-GI-DOT-NEXT: sshll v7.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: sshll v16.2d, v4.2s, #0
; CHECK-GI-DOT-NEXT: sshll v17.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: sshll v18.2d, v5.2s, #0
; CHECK-GI-DOT-NEXT: sshll v19.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: sshll v20.2d, v6.2s, #0
; CHECK-GI-DOT-NEXT: sshll v21.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: sshll v22.2d, v7.2s, #0
; CHECK-GI-DOT-NEXT: sshll v23.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v4.2d, v16.2d, v4.4s
; CHECK-GI-DOT-NEXT: saddw2 v2.2d, v17.2d, v2.4s
; CHECK-GI-DOT-NEXT: saddw2 v5.2d, v18.2d, v5.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v19.2d, v0.4s
; CHECK-GI-DOT-NEXT: saddw2 v6.2d, v20.2d, v6.4s
; CHECK-GI-DOT-NEXT: saddw2 v3.2d, v21.2d, v3.4s
; CHECK-GI-DOT-NEXT: saddw2 v7.2d, v22.2d, v7.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v23.2d, v1.4s
; CHECK-GI-DOT-NEXT: add v2.2d, v4.2d, v2.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v5.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v3.2d, v6.2d, v3.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v7.2d, v1.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%yy = sext <16 x i8> %y to <16 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Adds two independent i64 add-reductions, each over an <8 x i8> argument that
; is zero-extended to <8 x i64>.
; The runs without -global-isel (SD-* prefixes) are expected to widen with
; ushll/ushll2 and uaddl/uaddl2, combine both operands in the vector domain
; and finish with a single addp. The -global-isel runs (GI-* prefixes) are
; expected to widen with ushll plus uaddw2, reduce each operand with its own
; addp and add the two scalars. The +dotprod expectations match the base ones
; here.
define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-BASE-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-BASE-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-DOT-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-DOT-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-DOT-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: ushll v5.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: ushll v7.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-BASE-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: ushll v5.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: ushll v7.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-DOT-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = zext <8 x i8> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Adds two independent i64 add-reductions, each over an <8 x i8> argument that
; is sign-extended to <8 x i64>.
; The runs without -global-isel (SD-* prefixes) are expected to widen with
; sshll/sshll2 and saddl/saddl2, combine both operands in the vector domain
; and finish with a single addp. The -global-isel runs (GI-* prefixes) are
; expected to widen with sshll plus saddw2, reduce each operand with its own
; addp and add the two scalars. The +dotprod expectations match the base ones
; here.
define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v2.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-BASE-NEXT: saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-BASE-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-DOT-NEXT: sshll2 v2.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-DOT-NEXT: saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-DOT-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: sshll v5.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: sshll v7.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-BASE-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: sshll v5.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: sshll v7.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-DOT-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = sext <8 x i8> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Adds two independent i64 add-reductions, each over a <4 x i8> argument that
; is zero-extended to <4 x i64>. In the expected code both arguments are read
; from 16-bit lanes (.4h) of v0 and v1.
; The runs without -global-isel (SD-* prefixes) are expected to clear the
; upper byte of every 16-bit lane with bic, widen with ushll, then use uaddlp
; on one operand and uadalp to accumulate the other before a single addp.
; The -global-isel runs (GI-* prefixes) are expected to widen to 64-bit lanes,
; mask each lane with 0xff, reduce each operand with its own addp and add the
; two scalars. The +dotprod expectations match the base ones here.
define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-BASE-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-DOT-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: movi v2.2d, #0x000000000000ff
; CHECK-GI-BASE-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-GI-BASE-NEXT: ushll v4.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.2d, v1.4s, #0
; CHECK-GI-BASE-NEXT: and v3.16b, v3.16b, v2.16b
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-BASE-NEXT: and v4.16b, v4.16b, v2.16b
; CHECK-GI-BASE-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-BASE-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v4.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: movi v2.2d, #0x000000000000ff
; CHECK-GI-DOT-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-GI-DOT-NEXT: ushll v4.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.2d, v1.4s, #0
; CHECK-GI-DOT-NEXT: and v3.16b, v3.16b, v2.16b
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: and v4.16b, v4.16b, v2.16b
; CHECK-GI-DOT-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v4.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = zext <4 x i8> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Adds two independent i64 add-reductions, each over a <4 x i8> argument that
; is sign-extended to <4 x i64>. In the expected code both arguments are read
; from 16-bit lanes (.4h) of v0 and v1.
; Every run is expected to widen to 64-bit lanes with ushll/ushll2 and then
; sign-extend in place with a shift left by 56 followed by an arithmetic shift
; right by 56, where ssra folds the second right shift into an accumulate.
; The runs without -global-isel (SD-* prefixes) are expected to add the two
; operands as vectors and use a single addp. The -global-isel runs
; (GI-* prefixes) are expected to use one addp per operand and add the two
; scalars. The +dotprod expectations match the base ones here.
define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-SD-BASE-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-BASE-NEXT: ushll2 v1.2d, v1.4s, #0
; CHECK-SD-BASE-NEXT: shl v2.2d, v2.2d, #56
; CHECK-SD-BASE-NEXT: shl v3.2d, v3.2d, #56
; CHECK-SD-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: sshr v2.2d, v2.2d, #56
; CHECK-SD-BASE-NEXT: sshr v3.2d, v3.2d, #56
; CHECK-SD-BASE-NEXT: ssra v2.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: ssra v3.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: add v0.2d, v2.2d, v3.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-SD-DOT-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-DOT-NEXT: ushll2 v1.2d, v1.4s, #0
; CHECK-SD-DOT-NEXT: shl v2.2d, v2.2d, #56
; CHECK-SD-DOT-NEXT: shl v3.2d, v3.2d, #56
; CHECK-SD-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: sshr v2.2d, v2.2d, #56
; CHECK-SD-DOT-NEXT: sshr v3.2d, v3.2d, #56
; CHECK-SD-DOT-NEXT: ssra v2.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: ssra v3.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: add v0.2d, v2.2d, v3.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v2.2d, v0.4s, #0
; CHECK-GI-BASE-NEXT: ushll2 v3.2d, v1.4s, #0
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: shl v2.2d, v2.2d, #56
; CHECK-GI-BASE-NEXT: shl v3.2d, v3.2d, #56
; CHECK-GI-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: sshr v2.2d, v2.2d, #56
; CHECK-GI-BASE-NEXT: sshr v3.2d, v3.2d, #56
; CHECK-GI-BASE-NEXT: ssra v2.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: ssra v3.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: addp d0, v2.2d
; CHECK-GI-BASE-NEXT: addp d1, v3.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v2.2d, v0.4s, #0
; CHECK-GI-DOT-NEXT: ushll2 v3.2d, v1.4s, #0
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: shl v2.2d, v2.2d, #56
; CHECK-GI-DOT-NEXT: shl v3.2d, v3.2d, #56
; CHECK-GI-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: sshr v2.2d, v2.2d, #56
; CHECK-GI-DOT-NEXT: sshr v3.2d, v3.2d, #56
; CHECK-GI-DOT-NEXT: ssra v2.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: ssra v3.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: addp d0, v2.2d
; CHECK-GI-DOT-NEXT: addp d1, v3.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = sext <4 x i8> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Adds two independent i64 add-reductions, each over a <2 x i8> argument that
; is zero-extended to <2 x i64>. In the expected code both arguments are read
; from 32-bit lanes (.2s) of v0 and v1.
; The runs without -global-isel (SD-* prefixes) are expected to mask each
; 32-bit lane to its low byte, merge both operands with one widening uaddl and
; use a single addp. The -global-isel runs (GI-* prefixes) are expected to
; widen with ushll, mask each 64-bit lane with 0xff, use one addp per operand
; and add the two scalars. The +dotprod expectations match the base ones here.
define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: movi d2, #0x0000ff000000ff
; CHECK-SD-BASE-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-SD-BASE-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi d2, #0x0000ff000000ff
; CHECK-SD-DOT-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v2.2d, #0x000000000000ff
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-BASE-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0x000000000000ff
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = zext <2 x i8> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Signed counterpart of add_pair_v2i8_v2i64_zext: %x and %y are each
; sign-extended from <2 x i8> to <2 x i64>, reduced with
; llvm.vector.reduce.add, and the two scalar results are added.
; Expected code per the assertions below:
;  - SD prefixes: sign extension is done as shl #56 followed by an arithmetic
;    right shift by 56; the second input's shift is fused with the vector add
;    via ssra, leaving a single addp reduction.
;  - GI prefixes: both inputs get their own sshr and their own addp, and the
;    results are added in general-purpose registers.
define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: ssra v0.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: ssra v0.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
; Sign-extend and reduce %x.
%xx = sext <2 x i8> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; Sign-extend and reduce %y.
%yy = sext <2 x i8> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
; Combine the two scalar partial sums.
%z = add i64 %z1, %z2
ret i64 %z
}
; Four-way sum of add-reductions with mixed extension kinds: %ax and %ay are
; zero-extended, %bx and %by are sign-extended, all from <8 x i8> to
; <8 x i32>. Each widened vector is reduced with llvm.vector.reduce.add and
; the four scalars are summed pairwise ((ax + ay) + (bx + by)).
; Expected code per the assertions below:
;  - SD without +dotprod: widen to 16-bit lanes, then uaddlp/uadalp for the
;    unsigned pair and saddlp/sadalp for the signed pair, one vector add and
;    a single addv.
;  - SD with +dotprod: udot/sdot against an all-ones byte vector, one
;    accumulator per signedness, then a vector add and a single addp.
;  - GI: every input is reduced separately (addv, or udot/sdot plus addp with
;    +dotprod) and the four results are added in general-purpose registers.
define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v3.8h, v3.8b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: saddlp v3.4s, v3.8h
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: sadalp v3.4s, v2.8h
; CHECK-SD-BASE-NEXT: add v0.4s, v3.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v5.8b, #1
; CHECK-SD-DOT-NEXT: movi v6.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v6.2s, v1.8b, v5.8b
; CHECK-SD-DOT-NEXT: sdot v4.2s, v3.8b, v5.8b
; CHECK-SD-DOT-NEXT: udot v6.2s, v0.8b, v5.8b
; CHECK-SD-DOT-NEXT: sdot v4.2s, v2.8b, v5.8b
; CHECK-SD-DOT-NEXT: add v0.2s, v6.2s, v4.2s
; CHECK-SD-DOT-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-GI-BASE-NEXT: sshll v3.8h, v3.8b, #0
; CHECK-GI-BASE-NEXT: ushll v4.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v5.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll v6.4s, v2.4h, #0
; CHECK-GI-BASE-NEXT: sshll v7.4s, v3.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v4.4s, v0.8h
; CHECK-GI-BASE-NEXT: uaddw2 v1.4s, v5.4s, v1.8h
; CHECK-GI-BASE-NEXT: saddw2 v2.4s, v6.4s, v2.8h
; CHECK-GI-BASE-NEXT: saddw2 v3.4s, v7.4s, v3.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: addv s2, v2.4s
; CHECK-GI-BASE-NEXT: addv s3, v3.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: fmov w10, s2
; CHECK-GI-BASE-NEXT: fmov w11, s3
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: add w9, w10, w11
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v4.8b, #1
; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v6.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v7.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v16.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v5.2s, v0.8b, v4.8b
; CHECK-GI-DOT-NEXT: sdot v6.2s, v3.8b, v4.8b
; CHECK-GI-DOT-NEXT: udot v7.2s, v1.8b, v4.8b
; CHECK-GI-DOT-NEXT: sdot v16.2s, v2.8b, v4.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v5.2s, v5.2s
; CHECK-GI-DOT-NEXT: addp v3.2s, v6.2s, v6.2s
; CHECK-GI-DOT-NEXT: addp v1.2s, v7.2s, v7.2s
; CHECK-GI-DOT-NEXT: addp v2.2s, v16.2s, v16.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w11, s3
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: fmov w10, s2
; CHECK-GI-DOT-NEXT: add w8, w8, w9
; CHECK-GI-DOT-NEXT: add w9, w10, w11
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Unsigned pair: zero-extend, reduce, and add %ax and %ay.
%axx = zext <8 x i8> %ax to <8 x i32>
%az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx)
%ayy = zext <8 x i8> %ay to <8 x i32>
%az2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %ayy)
%az = add i32 %az1, %az2
; Signed pair: sign-extend, reduce, and add %bx and %by.
%bxx = sext <8 x i8> %bx to <8 x i32>
%bz1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %bxx)
%byy = sext <8 x i8> %by to <8 x i32>
%bz2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %byy)
%bz = add i32 %bz1, %bz2
; Final sum of the unsigned and signed partial results.
%z = add i32 %az, %bz
ret i32 %z
}
; Reduction over four <8 x i16> inputs where the halving step is written out
; explicitly: each input is widened to <8 x i32>, split into lanes 0-3 and
; lanes 4-7 with shufflevector, and the two halves are added. The four
; <4 x i32> partial vectors are summed and only then reduced with a single
; llvm.vector.reduce.add.v4i32 call.
; Review note: despite "sext_zext" in the function name, every extension in
; this body is a zext, which matches the unsigned-only instructions
; (uaddlp/uadalp, ushll/ushll2) in the assertions below. Switching %bx/%by to
; sext would require regenerating those assertions.
; Review note: the %sNh values select lanes 0-3 and the %sNl values select
; lanes 4-7, so the h/l suffixes read as the reverse of high/low lanes; this
; does not affect the result because the halves are only added together.
; Expected code per the assertions below:
;  - SD prefixes: uaddlp/uadalp directly on the 16-bit inputs, one vector add
;    and one addv.
;  - GI prefixes: explicit ushll/ushll2 widening followed by vector adds and
;    one addv.
define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
; CHECK-SD-BASE-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: uaddlp v3.4s, v3.8h
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: uadalp v3.4s, v2.8h
; CHECK-SD-BASE-NEXT: add v0.4s, v3.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-DOT-NEXT: uaddlp v3.4s, v3.8h
; CHECK-SD-DOT-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-DOT-NEXT: uadalp v3.4s, v2.8h
; CHECK-SD-DOT-NEXT: add v0.4s, v3.4s, v1.4s
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v4.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v5.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: ushll v6.4s, v2.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-GI-BASE-NEXT: ushll v7.4s, v3.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-GI-BASE-NEXT: add v0.4s, v4.4s, v0.4s
; CHECK-GI-BASE-NEXT: add v1.4s, v5.4s, v1.4s
; CHECK-GI-BASE-NEXT: add v2.4s, v6.4s, v2.4s
; CHECK-GI-BASE-NEXT: add v3.4s, v7.4s, v3.4s
; CHECK-GI-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-BASE-NEXT: add v1.4s, v2.4s, v3.4s
; CHECK-GI-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v4.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v5.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: ushll v6.4s, v2.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-GI-DOT-NEXT: ushll v7.4s, v3.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-GI-DOT-NEXT: add v0.4s, v4.4s, v0.4s
; CHECK-GI-DOT-NEXT: add v1.4s, v5.4s, v1.4s
; CHECK-GI-DOT-NEXT: add v2.4s, v6.4s, v2.4s
; CHECK-GI-DOT-NEXT: add v3.4s, v7.4s, v3.4s
; CHECK-GI-DOT-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-DOT-NEXT: add v1.4s, v2.4s, v3.4s
; CHECK-GI-DOT-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
; %ax: widen, split into lanes 0-3 and 4-7, add the halves.
%axx = zext <8 x i16> %ax to <8 x i32>
%s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s1l = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%axs = add <4 x i32> %s1h, %s1l
; %ay: same pattern, then fold into the first pair sum.
%ayy = zext <8 x i16> %ay to <8 x i32>
%s2h = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s2l = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%ays = add <4 x i32> %s2h, %s2l
%az = add <4 x i32> %axs, %ays
; %bx: widen (zext, see the review note above), split, add the halves.
%bxx = zext <8 x i16> %bx to <8 x i32>
%s3h = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s3l = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%bxs = add <4 x i32> %s3h, %s3l
; %by: same pattern, then fold into the second pair sum.
%byy = zext <8 x i16> %by to <8 x i32>
%s4h = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s4l = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%bys = add <4 x i32> %s4h, %s4l
%bz = add <4 x i32> %bxs, %bys
; Combine both pair sums and perform the only horizontal reduction.
%z = add <4 x i32> %az, %bz
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z)
ret i32 %z2
}
; Sum of two add-reductions with no extension involved: %x and %y are each
; reduced with llvm.vector.reduce.add.v2i64 and the scalar results are added.
; Expected code per the assertions below:
;  - SD prefixes: the scalar add is turned into one vector add ahead of a
;    single addp.
;  - GI prefixes: each input gets its own addp and the results are added in
;    general-purpose registers.
define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i64_v2i64:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i64_v2i64:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i64_v2i64:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i64_v2i64:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
; Reduce each input independently, then add the scalars.
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
%z = add i64 %z1, %z2
ret i64 %z
}
; Fully unrolled sum of absolute differences over eight rows of eight bytes.
; For every row, <8 x i8> is loaded from both %p1 and %p2, both are
; zero-extended to <8 x i32>, subtracted, passed through llvm.abs, and
; reduced with llvm.vector.reduce.add.v8i32; the eight row sums are chained
; into one i32 result.
;   %p1, %s1: base pointer and per-row byte stride of the first operand.
;   %p2, %s2: base pointer and per-row byte stride of the second operand.
; Both strides are sign-extended to i64 before being applied with
; getelementptr inbounds i8.
; The assertions use the shared BASE/DOT prefixes, i.e. the same output is
; expected from both instruction selectors (the file header expects a
; GlobalISel fallback warning naming this function for the run without
; +dotprod):
;  - without +dotprod: uabdl per row, accumulated with uaddlp/uadalp, one addv.
;  - with +dotprod: uabd per row, accumulated with udot against an all-ones
;    byte vector, one addp.
define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
; CHECK-BASE-LABEL: full:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ldr d0, [x2]
; CHECK-BASE-NEXT: ldr d1, [x0]
; CHECK-BASE-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-BASE-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-BASE-NEXT: sxtw x8, w3
; CHECK-BASE-NEXT: sxtw x9, w1
; CHECK-BASE-NEXT: uabdl v0.8h, v1.8b, v0.8b
; CHECK-BASE-NEXT: add x11, x2, x8
; CHECK-BASE-NEXT: add x10, x0, x9
; CHECK-BASE-NEXT: ldr d2, [x11]
; CHECK-BASE-NEXT: add x11, x11, x8
; CHECK-BASE-NEXT: ldr d1, [x10]
; CHECK-BASE-NEXT: add x10, x10, x9
; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: ldr d2, [x11]
; CHECK-BASE-NEXT: add x11, x11, x8
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: ldr d1, [x10]
; CHECK-BASE-NEXT: add x10, x10, x9
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: ldr d2, [x11]
; CHECK-BASE-NEXT: add x11, x11, x8
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: ldr d1, [x10]
; CHECK-BASE-NEXT: add x10, x10, x9
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: ldr d2, [x11]
; CHECK-BASE-NEXT: add x11, x11, x8
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: ldr d1, [x10]
; CHECK-BASE-NEXT: add x10, x10, x9
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: ldr d2, [x11]
; CHECK-BASE-NEXT: add x11, x11, x8
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: ldr d1, [x10]
; CHECK-BASE-NEXT: add x10, x10, x9
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: ldr d2, [x11]
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: ldr d1, [x10]
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: ldr d2, [x11, x8]
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: ldr d1, [x10, x9]
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: full:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: ldr d0, [x0]
; CHECK-DOT-NEXT: ldr d1, [x2]
; CHECK-DOT-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-DOT-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-DOT-NEXT: sxtw x8, w3
; CHECK-DOT-NEXT: sxtw x9, w1
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: movi v3.8b, #1
; CHECK-DOT-NEXT: uabd v0.8b, v0.8b, v1.8b
; CHECK-DOT-NEXT: add x11, x2, x8
; CHECK-DOT-NEXT: add x10, x0, x9
; CHECK-DOT-NEXT: ldr d4, [x11]
; CHECK-DOT-NEXT: add x11, x11, x8
; CHECK-DOT-NEXT: ldr d1, [x10]
; CHECK-DOT-NEXT: add x10, x10, x9
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: ldr d1, [x10]
; CHECK-DOT-NEXT: ldr d4, [x11]
; CHECK-DOT-NEXT: add x10, x10, x9
; CHECK-DOT-NEXT: add x11, x11, x8
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: ldr d1, [x10]
; CHECK-DOT-NEXT: ldr d4, [x11]
; CHECK-DOT-NEXT: add x10, x10, x9
; CHECK-DOT-NEXT: add x11, x11, x8
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: ldr d1, [x10]
; CHECK-DOT-NEXT: ldr d4, [x11]
; CHECK-DOT-NEXT: add x10, x10, x9
; CHECK-DOT-NEXT: add x11, x11, x8
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: ldr d1, [x10]
; CHECK-DOT-NEXT: ldr d4, [x11]
; CHECK-DOT-NEXT: add x10, x10, x9
; CHECK-DOT-NEXT: add x11, x11, x8
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: ldr d1, [x10]
; CHECK-DOT-NEXT: ldr d4, [x11]
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: ldr d1, [x10, x9]
; CHECK-DOT-NEXT: ldr d4, [x11, x8]
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
entry:
; Row strides as 64-bit byte offsets (%idx.ext8 for %p2, %idx.ext for %p1).
%idx.ext8 = sext i32 %s2 to i64
%idx.ext = sext i32 %s1 to i64
; Row 0: read directly from %p1 / %p2; %6 is the first row sum.
%0 = load <8 x i8>, ptr %p1, align 1
%1 = zext <8 x i8> %0 to <8 x i32>
%2 = load <8 x i8>, ptr %p2, align 1
%3 = zext <8 x i8> %2 to <8 x i32>
%4 = sub nsw <8 x i32> %1, %3
%5 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %4, i1 true)
%6 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
; Row 1: advance both pointers by one stride and accumulate.
%add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext
%add.ptr9 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext8
%7 = load <8 x i8>, ptr %add.ptr, align 1
%8 = zext <8 x i8> %7 to <8 x i32>
%9 = load <8 x i8>, ptr %add.ptr9, align 1
%10 = zext <8 x i8> %9 to <8 x i32>
%11 = sub nsw <8 x i32> %8, %10
%12 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %11, i1 true)
%13 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12)
%op.rdx.1 = add i32 %13, %6
; Row 2.
%add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
%add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
%14 = load <8 x i8>, ptr %add.ptr.1, align 1
%15 = zext <8 x i8> %14 to <8 x i32>
%16 = load <8 x i8>, ptr %add.ptr9.1, align 1
%17 = zext <8 x i8> %16 to <8 x i32>
%18 = sub nsw <8 x i32> %15, %17
%19 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %18, i1 true)
%20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
%op.rdx.2 = add i32 %20, %op.rdx.1
; Row 3.
%add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
%add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
%21 = load <8 x i8>, ptr %add.ptr.2, align 1
%22 = zext <8 x i8> %21 to <8 x i32>
%23 = load <8 x i8>, ptr %add.ptr9.2, align 1
%24 = zext <8 x i8> %23 to <8 x i32>
%25 = sub nsw <8 x i32> %22, %24
%26 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %25, i1 true)
%27 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %26)
%op.rdx.3 = add i32 %27, %op.rdx.2
; Row 4.
%add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %idx.ext
%add.ptr9.3 = getelementptr inbounds i8, ptr %add.ptr9.2, i64 %idx.ext8
%28 = load <8 x i8>, ptr %add.ptr.3, align 1
%29 = zext <8 x i8> %28 to <8 x i32>
%30 = load <8 x i8>, ptr %add.ptr9.3, align 1
%31 = zext <8 x i8> %30 to <8 x i32>
%32 = sub nsw <8 x i32> %29, %31
%33 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %32, i1 true)
%34 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %33)
%op.rdx.4 = add i32 %34, %op.rdx.3
; Row 5.
%add.ptr.4 = getelementptr inbounds i8, ptr %add.ptr.3, i64 %idx.ext
%add.ptr9.4 = getelementptr inbounds i8, ptr %add.ptr9.3, i64 %idx.ext8
%35 = load <8 x i8>, ptr %add.ptr.4, align 1
%36 = zext <8 x i8> %35 to <8 x i32>
%37 = load <8 x i8>, ptr %add.ptr9.4, align 1
%38 = zext <8 x i8> %37 to <8 x i32>
%39 = sub nsw <8 x i32> %36, %38
%40 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %39, i1 true)
%41 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40)
%op.rdx.5 = add i32 %41, %op.rdx.4
; Row 6.
%add.ptr.5 = getelementptr inbounds i8, ptr %add.ptr.4, i64 %idx.ext
%add.ptr9.5 = getelementptr inbounds i8, ptr %add.ptr9.4, i64 %idx.ext8
%42 = load <8 x i8>, ptr %add.ptr.5, align 1
%43 = zext <8 x i8> %42 to <8 x i32>
%44 = load <8 x i8>, ptr %add.ptr9.5, align 1
%45 = zext <8 x i8> %44 to <8 x i32>
%46 = sub nsw <8 x i32> %43, %45
%47 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %46, i1 true)
%48 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
%op.rdx.6 = add i32 %48, %op.rdx.5
; Row 7 (last): %op.rdx.7 is the total returned to the caller.
%add.ptr.6 = getelementptr inbounds i8, ptr %add.ptr.5, i64 %idx.ext
%add.ptr9.6 = getelementptr inbounds i8, ptr %add.ptr9.5, i64 %idx.ext8
%49 = load <8 x i8>, ptr %add.ptr.6, align 1
%50 = zext <8 x i8> %49 to <8 x i32>
%51 = load <8 x i8>, ptr %add.ptr9.6, align 1
%52 = zext <8 x i8> %51 to <8 x i32>
%53 = sub nsw <8 x i32> %50, %52
%54 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %53, i1 true)
%55 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %54)
%op.rdx.7 = add i32 %55, %op.rdx.6
ret i32 %op.rdx.7
}
; Intrinsic declarations used by the test functions in this file.
; llvm.abs is only needed by @full.
; Review note: attribute group #1 has no definition in the visible part of
; this file; confirm it is defined elsewhere or drop the reference.
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1
; llvm.vector.reduce.add overloads, grouped by result element type.
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>)
declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)