clang-p2996/llvm/test/CodeGen/X86/fp-roundeven.ll
Commit 902ec6142a by Freddy Ye: [X86][ISel] Lowering FROUND(f16) and FROUNDEVEN(f16)
Date: 2021-09-27 13:35:03 +08:00

When AVX512FP16 is enabled, FROUND(f16) cannot be handled by the type
legalizer, and no libcall for fround(f16) is available in libm yet.
FROUNDEVEN(f16), by contrast, has a corresponding instruction in AVX512FP16.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D110312
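
As a minimal illustration of the FROUND(f16) case contrasted above (a sketch only, not part of this test; the function name is hypothetical), the IR in question is a plain llvm.round.f16 call. Per the message, when f16 is a legal type it cannot be fixed up by the type legalizer and has no half-precision libm entry point to fall back on, unlike llvm.roundeven.f16 in the test below, which maps directly to vrndscalesh under AVX512FP16:

  define half @round_f16_example(half %h) {
    %r = call half @llvm.round.f16(half %h)
    ret half %r
  }
  declare half @llvm.round.f16(half)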

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512FP16
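
; Half-precision scalar: without native FP16 support the value is extended to
; f32 (__extendhfsf2 libcall or vcvtph2ps), rounded there (roundevenf libcall
; on SSE2, (v)roundss otherwise), and truncated back to half; only AVX512FP16
; lowers llvm.roundeven.f16 to a single vrndscalesh.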
define half @roundeven_f16(half %h) {
; SSE2-LABEL: roundeven_f16:
; SSE2: ## %bb.0: ## %entry
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: movzwl %di, %edi
; SSE2-NEXT: callq ___extendhfsf2
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: callq ___truncsfhf2
; SSE2-NEXT: popq %rcx
; SSE2-NEXT: retq
;
; SSE41-LABEL: roundeven_f16:
; SSE41: ## %bb.0: ## %entry
; SSE41-NEXT: pushq %rax
; SSE41-NEXT: .cfi_def_cfa_offset 16
; SSE41-NEXT: movzwl %di, %edi
; SSE41-NEXT: callq ___extendhfsf2
; SSE41-NEXT: roundss $8, %xmm0, %xmm0
; SSE41-NEXT: callq ___truncsfhf2
; SSE41-NEXT: popq %rcx
; SSE41-NEXT: retq
;
; AVX1-LABEL: roundeven_f16:
; AVX1: ## %bb.0: ## %entry
; AVX1-NEXT: pushq %rax
; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: movzwl %di, %edi
; AVX1-NEXT: callq ___extendhfsf2
; AVX1-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0
; AVX1-NEXT: callq ___truncsfhf2
; AVX1-NEXT: popq %rcx
; AVX1-NEXT: retq
;
; AVX512F-LABEL: roundeven_f16:
; AVX512F: ## %bb.0: ## %entry
; AVX512F-NEXT: movzwl %di, %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512F-NEXT: retq
;
; AVX512FP16-LABEL: roundeven_f16:
; AVX512FP16: ## %bb.0: ## %entry
; AVX512FP16-NEXT: vrndscalesh $8, %xmm0, %xmm0, %xmm0
; AVX512FP16-NEXT: retq
entry:
%a = call half @llvm.roundeven.f16(half %h)
ret half %a
}
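
; f32 scalar: SSE2 tail-calls roundevenf; SSE4.1/AVX use (v)roundss with
; immediate 8 (round to nearest even, precision exception suppressed).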
define float @roundeven_f32(float %x) {
; SSE2-LABEL: roundeven_f32:
; SSE2: ## %bb.0:
; SSE2-NEXT: jmp _roundevenf ## TAILCALL
;
; SSE41-LABEL: roundeven_f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundss $8, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: roundeven_f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = call float @llvm.roundeven.f32(float %x)
ret float %a
}
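
; f64 scalar: SSE2 tail-calls roundeven; SSE4.1/AVX use (v)roundsd $8.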
define double @roundeven_f64(double %x) {
; SSE2-LABEL: roundeven_f64:
; SSE2: ## %bb.0:
; SSE2-NEXT: jmp _roundeven ## TAILCALL
;
; SSE41-LABEL: roundeven_f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundsd $8, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: roundeven_f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundsd $8, %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = call double @llvm.roundeven.f64(double %x)
ret double %a
}
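
; 128-bit float vector: SSE2 has no packed round instruction, so the vector is
; scalarized into four roundevenf calls; SSE4.1/AVX use a single (v)roundps $8.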
define <4 x float> @roundeven_v4f32(<4 x float> %x) {
; SSE2-LABEL: roundeven_v4f32:
; SSE2: ## %bb.0:
; SSE2-NEXT: subq $56, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 64
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: addq $56, %rsp
; SSE2-NEXT: retq
;
; SSE41-LABEL: roundeven_v4f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $8, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: roundeven_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $8, %xmm0, %xmm0
; AVX-NEXT: retq
%a = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x)
ret <4 x float> %a
}
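
; 128-bit double vector: SSE2 scalarizes into two roundeven calls;
; SSE4.1/AVX use a single (v)roundpd $8.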
define <2 x double> @roundeven_v2f64(<2 x double> %x) {
; SSE2-LABEL: roundeven_v2f64:
; SSE2: ## %bb.0:
; SSE2-NEXT: subq $40, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: addq $40, %rsp
; SSE2-NEXT: retq
;
; SSE41-LABEL: roundeven_v2f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $8, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: roundeven_v2f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $8, %xmm0, %xmm0
; AVX-NEXT: retq
%a = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x)
ret <2 x double> %a
}
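
; 256-bit float vector: SSE2 scalarizes into eight roundevenf calls,
; SSE4.1 rounds the two 128-bit halves with roundps, and AVX rounds
; the whole ymm register with one vroundps.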
define <8 x float> @roundeven_v8f32(<8 x float> %x) {
; SSE2-LABEL: roundeven_v8f32:
; SSE2: ## %bb.0:
; SSE2-NEXT: subq $72, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 80
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: addq $72, %rsp
; SSE2-NEXT: retq
;
; SSE41-LABEL: roundeven_v8f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $8, %xmm0, %xmm0
; SSE41-NEXT: roundps $8, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: roundeven_v8f32:
; AVX: ## %bb.0:
; AVX-NEXT: vroundps $8, %ymm0, %ymm0
; AVX-NEXT: retq
%a = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %x)
ret <8 x float> %a
}
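
; 256-bit double vector: SSE2 scalarizes into four roundeven calls,
; SSE4.1 uses two roundpd, and AVX one vroundpd on the ymm register.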
define <4 x double> @roundeven_v4f64(<4 x double> %x) {
; SSE2-LABEL: roundeven_v4f64:
; SSE2: ## %bb.0:
; SSE2-NEXT: subq $56, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 64
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: addq $56, %rsp
; SSE2-NEXT: retq
;
; SSE41-LABEL: roundeven_v4f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $8, %xmm0, %xmm0
; SSE41-NEXT: roundpd $8, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: roundeven_v4f64:
; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $8, %ymm0, %ymm0
; AVX-NEXT: retq
%a = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %x)
ret <4 x double> %a
}
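
; 512-bit float vector: SSE2 scalarizes into sixteen roundevenf calls,
; SSE4.1 uses four roundps, AVX1 splits into two ymm vroundps, and
; AVX512 uses a single vrndscaleps $8 on the zmm register.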
define <16 x float> @roundeven_v16f32(<16 x float> %x) {
; SSE2-LABEL: roundeven_v16f32:
; SSE2: ## %bb.0:
; SSE2-NEXT: subq $104, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 112
; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload
; SSE2-NEXT: ## xmm3 = xmm3[0],mem[0]
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload
; SSE2-NEXT: addq $104, %rsp
; SSE2-NEXT: retq
;
; SSE41-LABEL: roundeven_v16f32:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $8, %xmm0, %xmm0
; SSE41-NEXT: roundps $8, %xmm1, %xmm1
; SSE41-NEXT: roundps $8, %xmm2, %xmm2
; SSE41-NEXT: roundps $8, %xmm3, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: roundeven_v16f32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vroundps $8, %ymm0, %ymm0
; AVX1-NEXT: vroundps $8, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: roundeven_v16f32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vrndscaleps $8, %zmm0, %zmm0
; AVX512-NEXT: retq
%a = call <16 x float> @llvm.roundeven.v16f32(<16 x float> %x)
ret <16 x float> %a
}
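
; 512-bit double vector: SSE2 scalarizes into eight roundeven calls,
; SSE4.1 uses four roundpd, AVX1 splits into two ymm vroundpd, and
; AVX512 uses a single vrndscalepd $8 on the zmm register.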
define <8 x double> @roundeven_v8f64(<8 x double> %x) {
; SSE2-LABEL: roundeven_v8f64:
; SSE2: ## %bb.0:
; SSE2-NEXT: subq $88, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 96
; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq _roundeven
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload
; SSE2-NEXT: addq $88, %rsp
; SSE2-NEXT: retq
;
; SSE41-LABEL: roundeven_v8f64:
; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $8, %xmm0, %xmm0
; SSE41-NEXT: roundpd $8, %xmm1, %xmm1
; SSE41-NEXT: roundpd $8, %xmm2, %xmm2
; SSE41-NEXT: roundpd $8, %xmm3, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: roundeven_v8f64:
; AVX1: ## %bb.0:
; AVX1-NEXT: vroundpd $8, %ymm0, %ymm0
; AVX1-NEXT: vroundpd $8, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: roundeven_v8f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vrndscalepd $8, %zmm0, %zmm0
; AVX512-NEXT: retq
%a = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %x)
ret <8 x double> %a
}
declare half @llvm.roundeven.f16(half)
declare float @llvm.roundeven.f32(float)
declare double @llvm.roundeven.f64(double)
declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
declare <8 x float> @llvm.roundeven.v8f32(<8 x float>)
declare <4 x double> @llvm.roundeven.v4f64(<4 x double>)
declare <16 x float> @llvm.roundeven.v16f32(<16 x float>)
declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)