clang-p2996/llvm/test/CodeGen/X86/vector-half-conversions.ll
Tomas Matheson 773771ba38 [CodeGen][regalloc] Don't align stack slots if the stack can't be realigned
Register allocation may spill virtual registers to the stack, which can
increase the alignment requirements of the stack frame. If the function
did not require stack realignment before register allocation, the
registers required to do so may not be reserved/available. This results
in a stack frame that requires realignment but cannot be realigned.

Instead, only increase the alignment of the stack if we are still able
to realign.

The register's SpillAlignment will be ignored if we can't realign, and the
backend will be responsible for emitting the correct unaligned loads and
stores. This seems to be the assumed behaviour already, e.g.
ARMBaseInstrInfo::storeRegToStackSlot and X86InstrInfo::storeRegToStackSlot
are both `canRealignStack` aware.

Differential Revision: https://reviews.llvm.org/D103602
2021-06-11 16:49:12 +01:00
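
The gist of the change can be sketched in C++ against LLVM's CodeGen APIs.
This is a minimal illustration only: the helper below and its placement are
hypothetical, assumed for exposition, and not the actual D103602 diff.

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/Alignment.h"
#include <algorithm>
using namespace llvm;

// Hypothetical helper (name and placement assumed) showing the clamp:
// only request the register class's preferred spill alignment if the
// target can still realign the stack; otherwise fall back to the incoming
// stack alignment and rely on the backend's canRealignStack-aware
// storeRegToStackSlot/loadRegFromStackSlot to emit unaligned accesses.
static int createSpillSlot(MachineFunction &MF, const TargetRegisterClass &RC) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
  Align SpillAlign = TRI->getSpillAlign(RC); // e.g. 32 bytes for a YMM class
  if (!TRI->canRealignStack(MF))
    SpillAlign = std::min(SpillAlign, TFI->getStackAlign());
  return MFI.CreateSpillStackObject(TRI->getSpillSize(RC), SpillAlign);
}

Consistent with this, the regenerated assembly below spills 32-byte YMM
values with unaligned vmovups while 16-byte XMM spills keep aligned vmovaps.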


; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX512
;
; Half to Float
;
define float @cvt_i16_to_f32(i16 %a0) nounwind {
; ALL-LABEL: cvt_i16_to_f32:
; ALL: # %bb.0:
; ALL-NEXT: movzwl %di, %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: retq
%1 = bitcast i16 %a0 to half
%2 = fpext half %1 to float
ret float %2
}
define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; ALL-LABEL: cvt_4i16_to_4f32:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: retq
%1 = bitcast <4 x i16> %a0 to <4 x half>
%2 = fpext <4 x half> %1 to <4 x float>
ret <4 x float> %2
}
define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; ALL-LABEL: cvt_8i16_to_4f32:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x float>
ret <4 x float> %3
}
define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
; ALL-LABEL: cvt_8i16_to_8f32:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps %xmm0, %ymm0
; ALL-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x float>
ret <8 x float> %2
}
define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_16i16_to_16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vcvtph2ps %xmm0, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_16i16_to_16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vcvtph2ps %xmm0, %ymm2
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %ymm1
; AVX2-NEXT: vmovaps %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: cvt_16i16_to_16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512-NEXT: retq
%1 = bitcast <16 x i16> %a0 to <16 x half>
%2 = fpext <16 x half> %1 to <16 x float>
ret <16 x float> %2
}
define <2 x float> @cvt_2i16_to_2f32_constrained(<2 x i16> %a0) nounwind strictfp {
; ALL-LABEL: cvt_2i16_to_2f32_constrained:
; ALL: # %bb.0:
; ALL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: retq
%1 = bitcast <2 x i16> %a0 to <2 x half>
%2 = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %1, metadata !"fpexcept.strict") strictfp
ret <2 x float> %2
}
declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) strictfp
define <4 x float> @cvt_4i16_to_4f32_constrained(<4 x i16> %a0) nounwind strictfp {
; ALL-LABEL: cvt_4i16_to_4f32_constrained:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: retq
%1 = bitcast <4 x i16> %a0 to <4 x half>
%2 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %1, metadata !"fpexcept.strict") strictfp
ret <4 x float> %2
}
declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) strictfp
define <8 x float> @cvt_8i16_to_8f32_constrained(<8 x i16> %a0) nounwind strictfp {
; ALL-LABEL: cvt_8i16_to_8f32_constrained:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps %xmm0, %ymm0
; ALL-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half> %1, metadata !"fpexcept.strict") strictfp
ret <8 x float> %2
}
declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half>, metadata) strictfp
define <16 x float> @cvt_16i16_to_16f32_constrained(<16 x i16> %a0) nounwind strictfp {
; AVX1-LABEL: cvt_16i16_to_16f32_constrained:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %ymm1
; AVX1-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_16i16_to_16f32_constrained:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %ymm1
; AVX2-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: cvt_16i16_to_16f32_constrained:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512-NEXT: retq
%1 = bitcast <16 x i16> %a0 to <16 x half>
%2 = call <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half> %1, metadata !"fpexcept.strict") strictfp
ret <16 x float> %2
}
declare <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half>, metadata) strictfp
;
; Half to Float (Load)
;
define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
; ALL-LABEL: load_cvt_i16_to_f32:
; ALL: # %bb.0:
; ALL-NEXT: movzwl (%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: retq
%1 = load i16, i16* %a0
%2 = bitcast i16 %1 to half
%3 = fpext half %2 to float
ret float %3
}
define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_4i16_to_4f32:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps (%rdi), %xmm0
; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x float>
ret <4 x float> %3
}
define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_8i16_to_4f32:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps (%rdi), %xmm0
; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x i16> %2 to <4 x half>
%4 = fpext <4 x half> %3 to <4 x float>
ret <4 x float> %4
}
define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_8i16_to_8f32:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps (%rdi), %ymm0
; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
%3 = fpext <8 x half> %2 to <8 x float>
ret <8 x float> %3
}
define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_16i16_to_16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vcvtph2ps (%rdi), %ymm0
; AVX1-NEXT: vcvtph2ps 16(%rdi), %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_16i16_to_16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vcvtph2ps (%rdi), %ymm0
; AVX2-NEXT: vcvtph2ps 16(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_cvt_16i16_to_16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps (%rdi), %zmm0
; AVX512-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a0
%2 = bitcast <16 x i16> %1 to <16 x half>
%3 = fpext <16 x half> %2 to <16 x float>
ret <16 x float> %3
}
define <4 x float> @load_cvt_4i16_to_4f32_constrained(<4 x i16>* %a0) nounwind strictfp {
; ALL-LABEL: load_cvt_4i16_to_4f32_constrained:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps (%rdi), %xmm0
; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %2, metadata !"fpexcept.strict") strictfp
ret <4 x float> %3
}
define <4 x float> @load_cvt_8i16_to_4f32_constrained(<8 x i16>* %a0) nounwind strictfp {
; ALL-LABEL: load_cvt_8i16_to_4f32_constrained:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps (%rdi), %xmm0
; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x i16> %2 to <4 x half>
%4 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %3, metadata !"fpexcept.strict") strictfp
ret <4 x float> %4
}
;
; Half to Double
;
define double @cvt_i16_to_f64(i16 %a0) nounwind {
; ALL-LABEL: cvt_i16_to_f64:
; ALL: # %bb.0:
; ALL-NEXT: movzwl %di, %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT: retq
%1 = bitcast i16 %a0 to half
%2 = fpext half %1 to double
ret double %2
}
define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; ALL-LABEL: cvt_2i16_to_2f64:
; ALL: # %bb.0:
; ALL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vcvtps2pd %xmm0, %xmm0
; ALL-NEXT: retq
%1 = bitcast <2 x i16> %a0 to <2 x half>
%2 = fpext <2 x half> %1 to <2 x double>
ret <2 x double> %2
}
define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; ALL-LABEL: cvt_4i16_to_4f64:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vcvtps2pd %xmm0, %ymm0
; ALL-NEXT: retq
%1 = bitcast <4 x i16> %a0 to <4 x half>
%2 = fpext <4 x half> %1 to <4 x double>
ret <4 x double> %2
}
define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; ALL-LABEL: cvt_8i16_to_2f64:
; ALL: # %bb.0:
; ALL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vcvtps2pd %xmm0, %xmm0
; ALL-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%2 = bitcast <2 x i16> %1 to <2 x half>
%3 = fpext <2 x half> %2 to <2 x double>
ret <2 x double> %3
}
define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; ALL-LABEL: cvt_8i16_to_4f64:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vcvtps2pd %xmm0, %ymm0
; ALL-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x double>
ret <4 x double> %3
}
define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_8f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vcvtph2ps %xmm0, %ymm1
; AVX1-NEXT: vcvtps2pd %xmm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vcvtps2pd %xmm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8i16_to_8f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vcvtph2ps %xmm0, %ymm1
; AVX2-NEXT: vcvtps2pd %xmm1, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: cvt_8i16_to_8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0
; AVX512-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x double>
ret <8 x double> %2
}
define <2 x double> @cvt_2i16_to_2f64_constrained(<2 x i16> %a0) nounwind strictfp {
; ALL-LABEL: cvt_2i16_to_2f64_constrained:
; ALL: # %bb.0:
; ALL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vcvtps2pd %xmm0, %xmm0
; ALL-NEXT: retq
%1 = bitcast <2 x i16> %a0 to <2 x half>
%2 = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half> %1, metadata !"fpexcept.strict") strictfp
ret <2 x double> %2
}
declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half>, metadata) strictfp
define <4 x double> @cvt_4i16_to_4f64_constrained(<4 x i16> %a0) nounwind strictfp {
; ALL-LABEL: cvt_4i16_to_4f64_constrained:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vcvtps2pd %xmm0, %ymm0
; ALL-NEXT: retq
%1 = bitcast <4 x i16> %a0 to <4 x half>
%2 = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half> %1, metadata !"fpexcept.strict") strictfp
ret <4 x double> %2
}
declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half>, metadata) strictfp
define <8 x double> @cvt_8i16_to_8f64_constrained(<8 x i16> %a0) nounwind strictfp {
; AVX1-LABEL: cvt_8i16_to_8f64_constrained:
; AVX1: # %bb.0:
; AVX1-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vcvtps2pd %xmm1, %ymm1
; AVX1-NEXT: vcvtps2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8i16_to_8f64_constrained:
; AVX2: # %bb.0:
; AVX2-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1
; AVX2-NEXT: vcvtps2pd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: cvt_8i16_to_8f64_constrained:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0
; AVX512-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = call <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half> %1, metadata !"fpexcept.strict") strictfp
ret <8 x double> %2
}
declare <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half>, metadata) strictfp
;
; Half to Double (Load)
;
define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
; ALL-LABEL: load_cvt_i16_to_f64:
; ALL: # %bb.0:
; ALL-NEXT: movzwl (%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT: retq
%1 = load i16, i16* %a0
%2 = bitcast i16 %1 to half
%3 = fpext half %2 to double
ret double %3
}
define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_2i16_to_2f64:
; ALL: # %bb.0:
; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vcvtps2pd %xmm0, %xmm0
; ALL-NEXT: retq
%1 = load <2 x i16>, <2 x i16>* %a0
%2 = bitcast <2 x i16> %1 to <2 x half>
%3 = fpext <2 x half> %2 to <2 x double>
ret <2 x double> %3
}
define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_4i16_to_4f64:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps (%rdi), %xmm0
; ALL-NEXT: vcvtps2pd %xmm0, %ymm0
; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x double>
ret <4 x double> %3
}
define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_8i16_to_4f64:
; ALL: # %bb.0:
; ALL-NEXT: vcvtph2ps (%rdi), %xmm0
; ALL-NEXT: vcvtps2pd %xmm0, %ymm0
; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x i16> %2 to <4 x half>
%4 = fpext <4 x half> %3 to <4 x double>
ret <4 x double> %4
}
define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_8f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vcvtph2ps (%rdi), %ymm1
; AVX1-NEXT: vcvtps2pd %xmm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vcvtps2pd %xmm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_8i16_to_8f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vcvtph2ps (%rdi), %ymm1
; AVX2-NEXT: vcvtps2pd %xmm1, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_cvt_8i16_to_8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps (%rdi), %ymm0
; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0
; AVX512-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
%3 = fpext <8 x half> %2 to <8 x double>
ret <8 x double> %3
}
;
; Float to Half
;
define i16 @cvt_f32_to_i16(float %a0) nounwind {
; ALL-LABEL: cvt_f32_to_i16:
; ALL: # %bb.0:
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: # kill: def $ax killed $ax killed $eax
; ALL-NEXT: retq
%1 = fptrunc float %a0 to half
%2 = bitcast half %1 to i16
ret i16 %2
}
define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
; ALL-LABEL: cvt_4f32_to_4i16:
; ALL: # %bb.0:
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
ret <4 x i16> %2
}
define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
; ALL-LABEL: cvt_4f32_to_8i16_undef:
; ALL: # %bb.0:
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %3
}
define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
; ALL-LABEL: cvt_4f32_to_8i16_zero:
; ALL: # %bb.0:
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %3
}
define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
; ALL-LABEL: cvt_8f32_to_8i16:
; ALL: # %bb.0:
; ALL-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
ret <8 x i16> %2
}
define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX1-LABEL: cvt_16f32_to_16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; AVX1-NEXT: vcvtps2ph $4, %ymm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_16f32_to_16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; AVX2-NEXT: vcvtps2ph $4, %ymm1, %xmm1
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: cvt_16f32_to_16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
ret <16 x i16> %2
}
;
; Float to Half (Store)
;
define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
; ALL-LABEL: store_cvt_f32_to_i16:
; ALL: # %bb.0:
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: vpextrw $0, %xmm0, (%rdi)
; ALL-NEXT: retq
%1 = fptrunc float %a0 to half
%2 = bitcast half %1 to i16
store i16 %2, i16* %a1
ret void
}
define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_4f32_to_4i16:
; ALL: # %bb.0:
; ALL-NEXT: vcvtps2ph $4, %xmm0, (%rdi)
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
store <4 x i16> %2, <4 x i16>* %a1
ret void
}
define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_4f32_to_8i16_undef:
; ALL: # %bb.0:
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: vmovaps %xmm0, (%rdi)
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i16> %3, <8 x i16>* %a1
ret void
}
define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_4f32_to_8i16_zero:
; ALL: # %bb.0:
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: vmovaps %xmm0, (%rdi)
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i16> %3, <8 x i16>* %a1
ret void
}
define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_8f32_to_8i16:
; ALL: # %bb.0:
; ALL-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
store <8 x i16> %2, <8 x i16>* %a1
ret void
}
define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_16f32_to_16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi)
; AVX1-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_16f32_to_16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi)
; AVX2-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_cvt_16f32_to_16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
store <16 x i16> %2, <16 x i16>* %a1
ret void
}
;
; Double to Half
;
define i16 @cvt_f64_to_i16(double %a0) nounwind {
; ALL-LABEL: cvt_f64_to_i16:
; ALL: # %bb.0:
; ALL-NEXT: jmp __truncdfhf2@PLT # TAILCALL
%1 = fptrunc double %a0 to half
%2 = bitcast half %1 to i16
ret i16 %2
}
define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; ALL-LABEL: cvt_2f64_to_2i16:
; ALL: # %bb.0:
; ALL-NEXT: subq $40, %rsp
; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, (%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vmovaps (%rsp), %xmm0
; ALL-NEXT: addq $40, %rsp
; ALL-NEXT: retq
%1 = fptrunc <2 x double> %a0 to <2 x half>
%2 = bitcast <2 x half> %1 to <2 x i16>
ret <2 x i16> %2
}
define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; ALL-LABEL: cvt_4f64_to_4i16:
; ALL: # %bb.0:
; ALL-NEXT: subq $72, %rsp
; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, (%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vmovaps (%rsp), %xmm0
; ALL-NEXT: addq $72, %rsp
; ALL-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
ret <4 x i16> %2
}
define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; ALL-LABEL: cvt_4f64_to_8i16_undef:
; ALL: # %bb.0:
; ALL-NEXT: subq $72, %rsp
; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, (%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vmovaps (%rsp), %xmm0
; ALL-NEXT: addq $72, %rsp
; ALL-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %3
}
define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; ALL-LABEL: cvt_4f64_to_8i16_zero:
; ALL: # %bb.0:
; ALL-NEXT: subq $72, %rsp
; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, (%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: addq $72, %rsp
; ALL-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %3
}
define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX1-LABEL: cvt_8f64_to_8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $64, %rsp
; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movzwl %ax, %r15d
; AVX1-NEXT: orl %ebx, %r15d
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movzwl %ax, %r14d
; AVX1-NEXT: orl %ebx, %r14d
; AVX1-NEXT: shlq $32, %r14
; AVX1-NEXT: orq %r15, %r14
; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movzwl %ax, %r15d
; AVX1-NEXT: orl %ebx, %r15d
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movzwl %ax, %eax
; AVX1-NEXT: orl %ebx, %eax
; AVX1-NEXT: shlq $32, %rax
; AVX1-NEXT: orq %r15, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vmovq %r14, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: addq $64, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %r15
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8f64_to_8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $64, %rsp
; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movzwl %ax, %r15d
; AVX2-NEXT: orl %ebx, %r15d
; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movzwl %ax, %r14d
; AVX2-NEXT: orl %ebx, %r14d
; AVX2-NEXT: shlq $32, %r14
; AVX2-NEXT: orq %r15, %r14
; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movzwl %ax, %r15d
; AVX2-NEXT: orl %ebx, %r15d
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movzwl %ax, %eax
; AVX2-NEXT: orl %ebx, %eax
; AVX2-NEXT: shlq $32, %rax
; AVX2-NEXT: orq %r15, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vmovq %r14, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: addq $64, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
; AVX512-LABEL: cvt_8f64_to_8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: subq $80, %rsp
; AVX512-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movl %eax, %ebx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movzwl %ax, %r15d
; AVX512-NEXT: orl %ebx, %r15d
; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movl %eax, %ebx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movzwl %ax, %r14d
; AVX512-NEXT: orl %ebx, %r14d
; AVX512-NEXT: shlq $32, %r14
; AVX512-NEXT: orq %r15, %r14
; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movl %eax, %ebx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movzwl %ax, %r15d
; AVX512-NEXT: orl %ebx, %r15d
; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movl %eax, %ebx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: orl %ebx, %eax
; AVX512-NEXT: shlq $32, %rax
; AVX512-NEXT: orq %r15, %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vmovq %r14, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: addq $80, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: retq
%1 = fptrunc <8 x double> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
ret <8 x i16> %2
}
;
; Double to Half (Store)
;
define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind {
; ALL-LABEL: store_cvt_f64_to_i16:
; ALL: # %bb.0:
; ALL-NEXT: pushq %rbx
; ALL-NEXT: movq %rdi, %rbx
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, (%rbx)
; ALL-NEXT: popq %rbx
; ALL-NEXT: retq
%1 = fptrunc double %a0 to half
%2 = bitcast half %1 to i16
store i16 %2, i16* %a1
ret void
}
define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_2f64_to_2i16:
; ALL: # %bb.0:
; ALL-NEXT: pushq %rbp
; ALL-NEXT: pushq %rbx
; ALL-NEXT: subq $24, %rsp
; ALL-NEXT: movq %rdi, %rbx
; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movl %eax, %ebp
; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, (%rbx)
; ALL-NEXT: movw %bp, 2(%rbx)
; ALL-NEXT: addq $24, %rsp
; ALL-NEXT: popq %rbx
; ALL-NEXT: popq %rbp
; ALL-NEXT: retq
%1 = fptrunc <2 x double> %a0 to <2 x half>
%2 = bitcast <2 x half> %1 to <2 x i16>
store <2 x i16> %2, <2 x i16>* %a1
ret void
}
define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_4f64_to_4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $56, %rsp
; AVX1-NEXT: movq %rdi, %rbx
; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movl %eax, %r14d
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movl %eax, %r15d
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movw %ax, 4(%rbx)
; AVX1-NEXT: movw %bp, (%rbx)
; AVX1-NEXT: movw %r15w, 6(%rbx)
; AVX1-NEXT: movw %r14w, 2(%rbx)
; AVX1-NEXT: addq $56, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %r15
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_4f64_to_4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $56, %rsp
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movl %eax, %r14d
; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movl %eax, %r15d
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movw %ax, 4(%rbx)
; AVX2-NEXT: movw %bp, (%rbx)
; AVX2-NEXT: movw %r15w, 6(%rbx)
; AVX2-NEXT: movw %r14w, 2(%rbx)
; AVX2-NEXT: addq $56, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_cvt_4f64_to_4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: subq $56, %rsp
; AVX512-NEXT: movq %rdi, %rbx
; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movl %eax, %r14d
; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movl %eax, %r15d
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movl %eax, %ebp
; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movw %ax, 4(%rbx)
; AVX512-NEXT: movw %bp, (%rbx)
; AVX512-NEXT: movw %r15w, 6(%rbx)
; AVX512-NEXT: movw %r14w, 2(%rbx)
; AVX512-NEXT: addq $56, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
store <4 x i16> %2, <4 x i16>* %a1
ret void
}
define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_4f64_to_8i16_undef:
; ALL: # %bb.0:
; ALL-NEXT: pushq %rbx
; ALL-NEXT: subq $64, %rsp
; ALL-NEXT: movq %rdi, %rbx
; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, (%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vmovaps (%rsp), %xmm0
; ALL-NEXT: vmovaps %xmm0, (%rbx)
; ALL-NEXT: addq $64, %rsp
; ALL-NEXT: popq %rbx
; ALL-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i16> %3, <8 x i16>* %a1
ret void
}
define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_4f64_to_8i16_zero:
; ALL: # %bb.0:
; ALL-NEXT: pushq %rbx
; ALL-NEXT: subq $64, %rsp
; ALL-NEXT: movq %rdi, %rbx
; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, (%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovaps %xmm0, (%rbx)
; ALL-NEXT: addq $64, %rsp
; ALL-NEXT: popq %rbx
; ALL-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i16> %3, <8 x i16>* %a1
ret void
}
define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_8f64_to_8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %r13
; AVX1-NEXT: pushq %r12
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $120, %rsp
; AVX1-NEXT: movq %rdi, %rbx
; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movl %eax, %r12d
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movl %eax, %r13d
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movl %eax, %r14d
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movl %eax, %r15d
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: movw %ax, 12(%rbx)
; AVX1-NEXT: movw %r15w, 8(%rbx)
; AVX1-NEXT: movw %r14w, 4(%rbx)
; AVX1-NEXT: movw %bp, (%rbx)
; AVX1-NEXT: movw %r13w, 14(%rbx)
; AVX1-NEXT: movw %r12w, 10(%rbx)
; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX1-NEXT: movw %ax, 6(%rbx)
; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX1-NEXT: movw %ax, 2(%rbx)
; AVX1-NEXT: addq $120, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
; AVX1-NEXT: popq %r13
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %r15
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_8f64_to_8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $120, %rsp
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movl %eax, %r12d
; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movl %eax, %r13d
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movl %eax, %r14d
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movl %eax, %r15d
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: movw %ax, 12(%rbx)
; AVX2-NEXT: movw %r15w, 8(%rbx)
; AVX2-NEXT: movw %r14w, 4(%rbx)
; AVX2-NEXT: movw %bp, (%rbx)
; AVX2-NEXT: movw %r13w, 14(%rbx)
; AVX2-NEXT: movw %r12w, 10(%rbx)
; AVX2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX2-NEXT: movw %ax, 6(%rbx)
; AVX2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX2-NEXT: movw %ax, 2(%rbx)
; AVX2-NEXT: addq $120, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_cvt_8f64_to_8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: pushq %r13
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: subq $152, %rsp
; AVX512-NEXT: movq %rdi, %rbx
; AVX512-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movl %eax, %r12d
; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movl %eax, %r13d
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movl %eax, %ebp
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movl %eax, %r14d
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movl %eax, %r15d
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: movw %ax, 12(%rbx)
; AVX512-NEXT: movw %r15w, 8(%rbx)
; AVX512-NEXT: movw %r14w, 4(%rbx)
; AVX512-NEXT: movw %bp, (%rbx)
; AVX512-NEXT: movw %r13w, 14(%rbx)
; AVX512-NEXT: movw %r12w, 10(%rbx)
; AVX512-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX512-NEXT: movw %ax, 6(%rbx)
; AVX512-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX512-NEXT: movw %ax, 2(%rbx)
; AVX512-NEXT: addq $152, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
%1 = fptrunc <8 x double> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
store <8 x i16> %2, <8 x i16>* %a1
ret void
}
define void @store_cvt_32f32_to_32f16(<32 x float> %a0, <32 x half>* %a1) nounwind {
; AVX1-LABEL: store_cvt_32f32_to_32f16:
; AVX1: # %bb.0:
; AVX1-NEXT: vcvtps2ph $4, %ymm3, 48(%rdi)
; AVX1-NEXT: vcvtps2ph $4, %ymm2, 32(%rdi)
; AVX1-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi)
; AVX1-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_32f32_to_32f16:
; AVX2: # %bb.0:
; AVX2-NEXT: vcvtps2ph $4, %ymm3, 48(%rdi)
; AVX2-NEXT: vcvtps2ph $4, %ymm2, 32(%rdi)
; AVX2-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi)
; AVX2-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_cvt_32f32_to_32f16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %zmm1, 32(%rdi)
; AVX512-NEXT: vcvtps2ph $4, %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = fptrunc <32 x float> %a0 to <32 x half>
store <32 x half> %1, <32 x half>* %a1
ret void
}