Files
clang-p2996/llvm/test/CodeGen/AArch64/highextractbitcast.ll
Harvin Iriawan db158c7c83 [AArch64] Update generic sched model to A510
Refresh of the generic scheduling model to use A510 instead of A55.
  Main benefits are to the little core, and introducing SVE scheduling information.
  Changes tested on various OoO cores, no performance degradation is seen.

  Differential Revision: https://reviews.llvm.org/D156799
2023-08-21 12:25:15 +01:00

590 lines
25 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-LE
; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE
; AArch64 NEON intrinsics called by the test functions in this file.
; Each one takes 64-bit vector operands (<4 x i16>, <8 x i8> or <2 x i32>),
; except sqadd, which operates on full <4 x i32> vectors.
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %s1, <2 x i32> %s2)
; Baseline case with no bitcast: both smull operands are the upper four i16
; lanes (indices 4-7) of the <8 x i16> arguments.
; Little-endian is expected to emit a single smull2; big-endian is expected to
; emit the same smull2 wrapped in rev64/ext lane fix-ups on inputs and result.
define <4 x i32> @test_smull_high_s16_base(<8 x i16> %a, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_base:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_base:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; Bitcast-then-extract on the first operand: %aa arrives as <2 x i64>, is
; bitcast to <8 x i16>, and only then has its lanes 4-7 extracted.
; The expected output matches the baseline: smull2 on both endiannesses (with
; rev64/ext fix-ups on big-endian).
define <4 x i32> @test_smull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcasta1:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%a = bitcast <2 x i64> %aa to <8 x i16>
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; Bitcast-then-extract on the second operand: %bb arrives as <16 x i8>, is
; bitcast to <8 x i16>, and then has its lanes 4-7 extracted.
; Expected to select smull2 on both endiannesses.
define <4 x i32> @test_smull_high_s16_bitcastb1(<8 x i16> %a, <16 x i8> %bb) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcastb1:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%b = bitcast <16 x i8> %bb to <8 x i16>
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; Extract-then-bitcast on the first operand: element 1 of <2 x i64> %a is
; extracted in the source type, and the resulting <1 x i64> is bitcast to
; <4 x i16>.
; Little-endian is expected to select smull2. Big-endian is expected to use
; the plain smull form with additional ext/rev64 instructions.
define <4 x i32> @test_smull_high_s16_bitcasta2(<2 x i64> %a, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcasta2:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: rev64 v0.4h, v0.4h
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%s1a = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
%s1 = bitcast <1 x i64> %s1a to <4 x i16>
%s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; Extract-then-bitcast on the second operand: bytes 8-15 of <16 x i8> %b are
; extracted in the source type, and the resulting <8 x i8> is bitcast to
; <4 x i16>.
; Little-endian is expected to select smull2. Big-endian is expected to use
; the plain smull form with an extra rev16 on the extracted bytes.
define <4 x i32> @test_smull_high_s16_bitcastb2(<8 x i16> %a, <16 x i8> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcastb2:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v1.16b, v1.16b
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: rev16 v1.8b, v1.8b
; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%s2 = bitcast <8 x i8> %s2a to <4 x i16>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; Negative case for the bitcast-then-extract pattern: after the bitcast, the
; first operand selects lanes 2-5 rather than the high half (4-7).
; Neither endianness is expected to use smull2; the operands are formed with
; ext instructions and fed to the plain smull form.
define <4 x i32> @test_smull_high_s16_bitcasta1_wrongindex(<2 x i64> %aa, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcasta1_wrongindex:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: ext v2.16b, v0.16b, v0.16b, #8
; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-LE-NEXT: ext v0.8b, v0.8b, v2.8b, #4
; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1_wrongindex:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #4
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%a = bitcast <2 x i64> %aa to <8 x i16>
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; Negative case for the bitcast-then-extract pattern: after the bitcast, the
; second operand selects lanes 3-6 rather than the high half (4-7).
; Neither endianness is expected to use smull2; the operands are formed with
; ext instructions and fed to the plain smull form.
define <4 x i32> @test_smull_high_s16_bitcastb1_wrongindex(<8 x i16> %a, <16 x i8> %bb) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1_wrongindex:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-LE-NEXT: ext v1.8b, v1.8b, v2.8b, #6
; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcastb1_wrongindex:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #6
; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%b = bitcast <16 x i8> %bb to <8 x i16>
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; Negative case for the extract-then-bitcast pattern: i32 elements 1-2 of
; <4 x i32> %a (the middle of the vector, not its high half) are extracted and
; then bitcast to <4 x i16>.
; Neither endianness is expected to use smull2; an ext by 4 bytes forms the
; first operand for the plain smull form.
define <4 x i32> @test_smull_high_s16_bitcasta2_wrongindex(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcasta2_wrongindex:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #4
; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2_wrongindex:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #4
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 2>
%s1 = bitcast <2 x i32> %s1a to <4 x i16>
%s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; Negative case for the extract-then-bitcast pattern: bytes 4-11 of
; <16 x i8> %b (not the high half, bytes 8-15) are extracted and then bitcast
; to <4 x i16>.
; Neither endianness is expected to use smull2; an ext by 4 bytes forms the
; second operand for the plain smull form.
define <4 x i32> @test_smull_high_s16_bitcastb2_wrongindex(<8 x i16> %a, <16 x i8> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2_wrongindex:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #4
; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcastb2_wrongindex:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v1.16b, v1.16b
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #4
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: rev16 v1.8b, v1.8b
; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%s2 = bitcast <8 x i8> %s2a to <4 x i16>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; Splat on the first operand: %aa is bitcast to <8 x i16> and lane 3 is
; broadcast to all four lanes; the second operand is the high half of %b.
; Both endiannesses are expected to select the by-element form of smull2,
; with the splat source in the indexed position (v0.h[3]).
define <4 x i32> @test_smull_high_s16_splata1(<2 x i64> %aa, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_splata1:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: smull2 v0.4s, v1.8h, v0.h[3]
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_splata1:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: smull2 v0.4s, v1.8h, v0.h[3]
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%a = bitcast <2 x i64> %aa to <8 x i16>
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; Splat on the second operand: %bb is bitcast to <8 x i16> and lane 3 is
; broadcast to all four lanes; the first operand is the high half of %a.
; Both endiannesses are expected to select the by-element form of smull2
; (v1.h[3] as the indexed operand).
define <4 x i32> @test_smull_high_s16_splatb1(<8 x i16> %a, <16 x i8> %bb) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_splatb1:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.h[3]
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_splatb1:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: smull2 v0.4s, v0.8h, v1.h[3]
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%b = bitcast <16 x i8> %bb to <8 x i16>
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; Splat-then-bitcast on the first operand: i32 lane 3 of %a is broadcast into
; a <2 x i32>, which is then bitcast to <4 x i16>. The splat element width
; (32 bits) therefore differs from the multiply element width (16 bits).
; Neither endianness is expected to use smull2 here; the expected code is a
; dup of the 32-bit lane, an ext for the high half of %b, and a plain smull.
define <4 x i32> @test_smull_high_s16_splata2(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_splata2:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: dup v0.2s, v0.s[3]
; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_splata2:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: dup v0.2s, v0.s[3]
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%s1 = bitcast <2 x i32> %s1a to <4 x i16>
%s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; Splat-then-bitcast on the second operand: byte 3 of %b is broadcast into an
; <8 x i8>, which is then bitcast to <4 x i16>. The splat element width
; (8 bits) differs from the multiply element width (16 bits).
; Neither endianness is expected to use smull2 here; the expected code is an
; ext for the high half of %a, a byte dup, and a plain smull.
define <4 x i32> @test_smull_high_s16_splatb2(<8 x i16> %a, <16 x i8> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_splatb2:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-LE-NEXT: dup v1.8b, v1.b[3]
; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_splatb2:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v1.16b, v1.16b
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: dup v1.8b, v1.b[3]
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: rev16 v1.8b, v1.8b
; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%s2 = bitcast <8 x i8> %s2a to <4 x i16>
%r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; umull counterpart of the bitcast-then-extract case: %aa is bitcast from
; <2 x i64> to <8 x i16>, and lanes 4-7 of both operands feed the umull
; intrinsic.
; Expected to select umull2 on both endiannesses.
define <4 x i32> @test_umull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_umull_high_s16_bitcasta1:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: umull2 v0.4s, v0.8h, v1.8h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_umull_high_s16_bitcasta1:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: umull2 v0.4s, v0.8h, v1.8h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%a = bitcast <2 x i64> %aa to <8 x i16>
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
ret <4 x i32> %r
}
; uabd on high halves with a bitcast operand: %bb is bitcast from <8 x i16>
; to <16 x i8>, bytes 8-15 of both operands feed the uabd intrinsic, and the
; <8 x i8> result is zero-extended to <8 x i16>.
; The extract + uabd + zext sequence is expected to fold into a single uabdl2.
define <8 x i16> @test_vabdl_high_u82(<16 x i8> %a, <8 x i16> %bb) {
; CHECK-LE-LABEL: test_vabdl_high_u82:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: uabdl2 v0.8h, v0.16b, v1.16b
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_vabdl_high_u82:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v0.16b, v0.16b
; CHECK-BE-NEXT: rev64 v1.16b, v1.16b
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: uabdl2 v0.8h, v0.16b, v1.16b
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%b = bitcast <8 x i16> %bb to <16 x i8>
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
ret <8 x i16> %vmovl.i.i.i
}
; sabd counterpart of test_vabdl_high_u82: %bb is bitcast from <8 x i16> to
; <16 x i8>, bytes 8-15 of both operands feed the sabd intrinsic, and the
; <8 x i8> result is widened with zext (not sext) to <8 x i16>.
; The extract + sabd + zext sequence is expected to fold into a single sabdl2.
define <8 x i16> @test_vabdl_high_s82(<16 x i8> %a, <8 x i16> %bb) {
; CHECK-LE-LABEL: test_vabdl_high_s82:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: sabdl2 v0.8h, v0.16b, v1.16b
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_vabdl_high_s82:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v0.16b, v0.16b
; CHECK-BE-NEXT: rev64 v1.16b, v1.16b
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: sabdl2 v0.8h, v0.16b, v1.16b
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%b = bitcast <8 x i16> %bb to <16 x i8>
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
ret <8 x i16> %vmovl.i.i.i
}
; sqdmull + sqadd on high halves with a bitcast operand: %cc is bitcast from
; <16 x i8> to <8 x i16>, lanes 4-7 of %b and %c feed sqdmull, and the product
; is combined with the accumulator %a through sqadd.
; The whole sequence is expected to fold into a single sqdmlal2.
define <4 x i32> @test_vqdmlal_high_s16_bitcast(<4 x i32> %a, <8 x i16> %b, <16 x i8> %cc) {
; CHECK-LE-LABEL: test_vqdmlal_high_s16_bitcast:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_vqdmlal_high_s16_bitcast:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: rev64 v2.8h, v2.8h
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v2.16b, v2.16b, v2.16b, #8
; CHECK-BE-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%c = bitcast <16 x i8> %cc to <8 x i16>
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vqdmlal2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%vqdmlal4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
ret <4 x i32> %vqdmlal4.i.i
}
; pmull on high halves of scalar i128 arguments: both i128 values are bitcast
; to <16 x i8>, and bytes 8-15 of each feed the pmull intrinsic.
; The expected code moves x1 and x3 into d registers with fmov and uses the
; plain pmull form; pmull2 is not expected here.
define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) {
; CHECK-LE-LABEL: test_pmull_high_p8_128:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: fmov d0, x3
; CHECK-LE-NEXT: fmov d1, x1
; CHECK-LE-NEXT: pmull v0.8h, v1.8b, v0.8b
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_pmull_high_p8_128:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: fmov d0, x3
; CHECK-BE-NEXT: fmov d1, x1
; CHECK-BE-NEXT: rev64 v0.8b, v0.8b
; CHECK-BE-NEXT: rev64 v1.8b, v1.8b
; CHECK-BE-NEXT: pmull v0.8h, v1.8b, v0.8b
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%a = bitcast i128 %aa to <16 x i8>
%b = bitcast i128 %bb to <16 x i8>
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
ret <8 x i16> %vmull.i.i
}
; pmull on high halves of <2 x i64> arguments: both operands are bitcast to
; <16 x i8>, and bytes 8-15 of each feed the pmull intrinsic.
; Expected to select pmull2 on both endiannesses.
define <8 x i16> @test_pmull_high_p8_64(<2 x i64> %aa, <2 x i64> %bb) {
; CHECK-LE-LABEL: test_pmull_high_p8_64:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: pmull2 v0.8h, v0.16b, v1.16b
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_pmull_high_p8_64:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v0.16b, v0.16b
; CHECK-BE-NEXT: rev64 v1.16b, v1.16b
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: pmull2 v0.8h, v0.16b, v1.16b
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%a = bitcast <2 x i64> %aa to <16 x i8>
%b = bitcast <2 x i64> %bb to <16 x i8>
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
ret <8 x i16> %vmull.i.i
}
; Narrowing shift with bitcast inputs: both arguments are bitcast to
; <4 x i32>, logically shifted right by 5 and truncated to <4 x i16>. The two
; halves are then concatenated by viewing each as <1 x i64>, shuffling them
; into a <2 x i64>, and bitcasting the result to <8 x i16>.
; Expected to select shrn for the low half and shrn2 for the high half.
define <8 x i16> @foov8i16(<16 x i8> %a1, <2 x i64> %b1) {
; CHECK-LE-LABEL: foov8i16:
; CHECK-LE: // %bb.0:
; CHECK-LE-NEXT: shrn v0.4h, v0.4s, #5
; CHECK-LE-NEXT: shrn2 v0.8h, v1.4s, #5
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: foov8i16:
; CHECK-BE: // %bb.0:
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: rev64 v1.4s, v1.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: shrn v0.4h, v0.4s, #5
; CHECK-BE-NEXT: shrn2 v0.8h, v1.4s, #5
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
%a0 = bitcast <16 x i8> %a1 to <4 x i32>
%b0 = bitcast <2 x i64> %b1 to <4 x i32>
%vshrn_low_shift = lshr <4 x i32> %a0, <i32 5, i32 5, i32 5, i32 5>
%vshrn_low = trunc <4 x i32> %vshrn_low_shift to <4 x i16>
%vshrn_high_shift = lshr <4 x i32> %b0, <i32 5, i32 5, i32 5, i32 5>
%vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16>
%1 = bitcast <4 x i16> %vshrn_low to <1 x i64>
%2 = bitcast <4 x i16> %vshrn_high to <1 x i64>
%shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
%3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
ret <8 x i16> %3
}
; Widening shift of a high half with a bitcast input: %src1a is bitcast from
; <16 x i8> to <4 x i32>, lanes 2-3 are extracted, zero-extended to <2 x i64>,
; and shifted left by 1.
; Expected to select ushll2 with a shift amount of 1.
; NOTE(review): the function name mentions "hadd" and "asr", but the body
; contains only a zext and a shl; the name looks inherited from another test.
define <2 x i64> @hadd32_zext_asr(<16 x i8> %src1a) {
; CHECK-LE-LABEL: hadd32_zext_asr:
; CHECK-LE: // %bb.0:
; CHECK-LE-NEXT: ushll2 v0.2d, v0.4s, #1
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: hadd32_zext_asr:
; CHECK-BE: // %bb.0:
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ushll2 v0.2d, v0.4s, #1
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
%src1 = bitcast <16 x i8> %src1a to <4 x i32>
%s1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%zextsrc1 = zext <2 x i32> %s1 to <2 x i64>
%resulti32 = shl <2 x i64> %zextsrc1, <i64 1, i64 1>
ret <2 x i64> %resulti32
}
; Splat operand for the 32-bit umull: %aa is bitcast from <2 x i64> to
; <4 x i32> and lane 1 is broadcast into a <2 x i32>; the other operand is the
; high half (lanes 2-3) of %b. The product is <2 x i64>.
; Both endiannesses are expected to select the by-element form of umull2
; (v0.s[1] as the indexed operand).
; NOTE(review): despite "s16" in the name, every lane here is i32.
define <2 x i64> @test_umull_high_s16_splata1(<2 x i64> %aa, <4 x i32> %b) #0 {
; CHECK-LE-LABEL: test_umull_high_s16_splata1:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: umull2 v0.2d, v1.4s, v0.s[1]
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_umull_high_s16_splata1:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: rev64 v1.4s, v1.4s
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: umull2 v0.2d, v1.4s, v0.s[1]
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
entry:
%a = bitcast <2 x i64> %aa to <4 x i32>
%s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
%s2 = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%r = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %s1, <2 x i32> %s2)
ret <2 x i64> %r
}