Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
Alexey Bataev 7dca2c628c [SLP]Gather scalarized calls
If the calls won't be vectorized, but will be scalarized after
vectorization, they should be build as buildvector nodes, not vector
nodes. Vectorization of such calls leads to incorrect cost estimation,
does not allow to calculate correctly spills costs.

Reviewers: lukel97, preames

Reviewed By: preames

Pull Request: https://github.com/llvm/llvm-project/pull/125070
2025-02-04 19:09:57 -05:00

1412 lines
80 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -vector-library=Accelerate -S %s | FileCheck %s
; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -S %s | FileCheck --check-prefix NOACCELERATE %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios14.0.0"
declare float @llvm.sin.f32(float)
; Accelerate provides sin() for <4 x float>
define <4 x float> @int_sin_4x(ptr %a) {
; CHECK-LABEL: @int_sin_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_sin_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.sin.f32(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.sin.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @llvm.sin.f32(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @llvm.sin.f32(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @ceilf(float) readonly nounwind willreturn
define <4 x float> @ceil_4x(ptr %a) {
; CHECK-LABEL: @ceil_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @ceil_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @ceilf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @ceilf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @ceilf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @ceilf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @fabsf(float) readonly nounwind willreturn
define <4 x float> @fabs_4x(ptr %a) {
; CHECK-LABEL: @fabs_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @fabs_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @fabsf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @fabsf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @fabsf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @fabsf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @llvm.fabs.f32(float)
define <4 x float> @int_fabs_4x(ptr %a) {
; CHECK-LABEL: @int_fabs_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_fabs_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.fabs.f32(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.fabs.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @llvm.fabs.f32(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @llvm.fabs.f32(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @floorf(float) readonly nounwind willreturn
define <4 x float> @floor_4x(ptr %a) {
; CHECK-LABEL: @floor_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @floor_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @floorf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @floorf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @floorf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @floorf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @sqrtf(float) readonly nounwind willreturn
define <4 x float> @sqrt_4x(ptr %a) {
; CHECK-LABEL: @sqrt_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @sqrt_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @sqrtf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @sqrtf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @sqrtf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @sqrtf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @expf(float) readonly nounwind willreturn
define <4 x float> @exp_4x(ptr %a) {
; CHECK-LABEL: @exp_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @exp_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @expf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @expf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @expf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @expf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @expm1f(float) readonly nounwind willreturn
define <4 x float> @expm1_4x(ptr %a) {
; CHECK-LABEL: @expm1_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpm1f(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @expm1_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expm1f(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expm1f(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expm1f(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @expm1f(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @expm1f(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @expm1f(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @expm1f(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @expm1f(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @logf(float) readonly nounwind willreturn
define <4 x float> @log_4x(ptr %a) {
; CHECK-LABEL: @log_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @log_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @logf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @logf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @logf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @logf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @log1pf(float) readonly nounwind willreturn
define <4 x float> @log1p_4x(ptr %a) {
; CHECK-LABEL: @log1p_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlog1pf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @log1p_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @log1pf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @log1pf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @log1pf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @log1pf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @log1pf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @log1pf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @log1pf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @log1pf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @log10pf(float) readonly nounwind willreturn
define <4 x float> @log10p_4x(ptr %a) {
; CHECK-LABEL: @log10p_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @log10pf(float [[VECEXT]])
; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @log10pf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @log10pf(float [[VECEXT_2]])
; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @log10pf(float [[VECEXT_3]])
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
;
; NOACCELERATE-LABEL: @log10p_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @log10pf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @log10pf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @log10pf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @log10pf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @log10pf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @log10pf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @log10pf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @log10pf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @logbf(float) readonly nounwind willreturn
define <4 x float> @logb_4x(ptr %a) {
; CHECK-LABEL: @logb_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogbf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @logb_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logbf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logbf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logbf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @logbf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @logbf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @logbf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @logbf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @logbf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @sinf(float) readonly nounwind willreturn
define <4 x float> @sin_4x(ptr %a) {
; CHECK-LABEL: @sin_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @sin_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @sinf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @sinf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @sinf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @sinf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @cosf(float) readonly nounwind willreturn
define <4 x float> @cos_4x(ptr %a) {
; CHECK-LABEL: @cos_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @cos_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @cosf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @cosf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @cosf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @cosf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @tanf(float) readonly nounwind willreturn
define <4 x float> @tan_4x(ptr %a) {
; CHECK-LABEL: @tan_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @tan_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @tanf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @tanf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @tanf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @tanf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @tanf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @asinf(float) readonly nounwind willreturn
define <4 x float> @asin_4x(ptr %a) {
; CHECK-LABEL: @asin_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @asin_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @asinf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @asinf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @asinf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @asinf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @asinf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @asinf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @asinf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @asinf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
define <4 x float> @int_asin_4x(ptr %a) {
; CHECK-LABEL: @int_asin_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_asin_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.asin.f32(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.asin.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @llvm.asin.f32(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @llvm.asin.f32(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @acosf(float) readonly nounwind willreturn
define <4 x float> @acos_4x(ptr %a) {
; CHECK-LABEL: @acos_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @acos_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @acosf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @acosf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @acosf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @acosf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @acosf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
define <4 x float> @int_acos_4x(ptr %a) {
; CHECK-LABEL: @int_acos_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_acos_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.acos.f32(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.acos.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @llvm.acos.f32(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @llvm.acos.f32(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @atanf(float) readonly nounwind willreturn
define <4 x float> @atan_4x(ptr %a) {
; CHECK-LABEL: @atan_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @atan_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @atanf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atanf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atanf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @atanf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @atanf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @atanf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @atanf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
define <4 x float> @int_atan_4x(ptr %a) {
; CHECK-LABEL: @int_atan_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_atan_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.atan.f32(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.atan.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @llvm.atan.f32(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @llvm.atan.f32(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @atan2f(float,float) readonly nounwind willreturn
define <4 x float> @atan2_4x(ptr %a, ptr %b) {
; CHECK-LABEL: @atan2_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @atan2_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @atan2f(float [[VECEXT]], float [[VECEXTB]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atan2f(float [[VECEXT_1]], float [[VECEXTB_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[VECEXTB_2:%.*]] = extractelement <4 x float> [[BB]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atan2f(float [[VECEXT_2]], float [[VECEXTB_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[VECEXTB_3:%.*]] = extractelement <4 x float> [[BB]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atan2f(float [[VECEXT_3]], float [[VECEXTB_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%bb = load <4 x float>, ptr %b, align 16
%vecext = extractelement <4 x float> %0, i32 0
%vecextb = extractelement <4 x float> %bb, i32 0
%1 = tail call fast float @atan2f(float %vecext, float %vecextb)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%vecextb.1 = extractelement <4 x float> %bb, i32 1
%2 = tail call fast float @atan2f(float %vecext.1, float %vecextb.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%vecextb.2 = extractelement <4 x float> %bb, i32 2
%3 = tail call fast float @atan2f(float %vecext.2, float %vecextb.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%vecextb.3 = extractelement <4 x float> %bb, i32 3
%4 = tail call fast float @atan2f(float %vecext.3, float %vecextb.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
define <4 x float> @int_atan2_4x(ptr %a, ptr %b) {
; CHECK-LABEL: @int_atan2_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_atan2_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT]], float [[VECEXTB]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_1]], float [[VECEXTB_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[VECEXTB_2:%.*]] = extractelement <4 x float> [[BB]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_2]], float [[VECEXTB_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[VECEXTB_3:%.*]] = extractelement <4 x float> [[BB]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_3]], float [[VECEXTB_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%bb = load <4 x float>, ptr %b, align 16
%vecext = extractelement <4 x float> %0, i32 0
%vecextb = extractelement <4 x float> %bb, i32 0
%1 = tail call fast float @llvm.atan2.f32(float %vecext, float %vecextb)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%vecextb.1 = extractelement <4 x float> %bb, i32 1
%2 = tail call fast float @llvm.atan2.f32(float %vecext.1, float %vecextb.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%vecextb.2 = extractelement <4 x float> %bb, i32 2
%3 = tail call fast float @llvm.atan2.f32(float %vecext.2, float %vecextb.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%vecextb.3 = extractelement <4 x float> %bb, i32 3
%4 = tail call fast float @llvm.atan2.f32(float %vecext.3, float %vecextb.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @sinhf(float) readonly nounwind willreturn
define <4 x float> @sinh_4x(ptr %a) {
; CHECK-LABEL: @sinh_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @sinh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinhf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinhf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinhf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @sinhf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @sinhf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @sinhf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @sinhf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
define <4 x float> @int_sinh_4x(ptr %a) {
; CHECK-LABEL: @int_sinh_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_sinh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.sinh.f32(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.sinh.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @llvm.sinh.f32(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @llvm.sinh.f32(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @coshf(float) readonly nounwind willreturn
define <4 x float> @cosh_4x(ptr %a) {
; CHECK-LABEL: @cosh_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @cosh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @coshf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @coshf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @coshf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @coshf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @coshf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @coshf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @coshf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @coshf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
define <4 x float> @int_cosh_4x(ptr %a) {
; CHECK-LABEL: @int_cosh_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_cosh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.cosh.f32(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.cosh.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @llvm.cosh.f32(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @llvm.cosh.f32(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @tanhf(float) readonly nounwind willreturn
define <4 x float> @tanh_4x(ptr %a) {
; CHECK-LABEL: @tanh_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @tanh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @tanhf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanhf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanhf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @tanhf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @tanhf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @tanhf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @tanhf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
define <4 x float> @int_tanh_4x(ptr %a) {
; CHECK-LABEL: @int_tanh_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_tanh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.tanh.f32(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.tanh.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @llvm.tanh.f32(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @llvm.tanh.f32(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @asinhf(float) readonly nounwind willreturn
define <4 x float> @asinh_4x(ptr %a) {
; CHECK-LABEL: @asinh_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @asinh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @asinhf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @asinhf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @asinhf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @asinhf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @asinhf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @asinhf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @asinhf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @asinhf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @acoshf(float) readonly nounwind willreturn
define <4 x float> @acosh_4x(ptr %a) {
; CHECK-LABEL: @acosh_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacoshf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @acosh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @acoshf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @acoshf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @acoshf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @acoshf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @acoshf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @acoshf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @acoshf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @acoshf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
declare float @atanhf(float) readonly nounwind willreturn
define <4 x float> @atanh_4x(ptr %a) {
; CHECK-LABEL: @atanh_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @atanh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @atanhf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atanhf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atanhf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atanhf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @atanhf(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @atanhf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @atanhf(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @atanhf(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
; Accelerate *does not* provide sin() for <2 x float>.
define <2 x float> @sin_2x(ptr %a) {
; CHECK-LABEL: @sin_2x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]]
; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]]
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
;
; NOACCELERATE-LABEL: @sin_2x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: ret <2 x float> [[VECINS_1]]
;
entry:
%0 = load <2 x float>, ptr %a, align 16
%vecext = extractelement <2 x float> %0, i32 0
%1 = tail call fast float @llvm.sin.f32(float %vecext)
%vecins = insertelement <2 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <2 x float> %0, i32 1
%2 = tail call fast float @llvm.sin.f32(float %vecext.1)
%vecins.1 = insertelement <2 x float> %vecins, float %2, i32 1
ret <2 x float> %vecins.1
}
declare float @llvm.cos.f32(float)
; Accelerate provides cos() for <4 x float>
define <4 x float> @int_cos_4x(ptr %a) {
; CHECK-LABEL: @int_cos_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_cos_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.cos.f32(float %vecext)
%vecins = insertelement <4 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.cos.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
%vecext.2 = extractelement <4 x float> %0, i32 2
%3 = tail call fast float @llvm.cos.f32(float %vecext.2)
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
%vecext.3 = extractelement <4 x float> %0, i32 3
%4 = tail call fast float @llvm.cos.f32(float %vecext.3)
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
ret <4 x float> %vecins.3
}
; Accelerate *does not* provide cos() for <2 x float>.
define <2 x float> @cos_2x(ptr %a) {
; CHECK-LABEL: @cos_2x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]]
; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]]
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
;
; NOACCELERATE-LABEL: @cos_2x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: ret <2 x float> [[VECINS_1]]
;
entry:
%0 = load <2 x float>, ptr %a, align 16
%vecext = extractelement <2 x float> %0, i32 0
%1 = tail call fast float @llvm.cos.f32(float %vecext)
%vecins = insertelement <2 x float> zeroinitializer, float %1, i32 0
%vecext.1 = extractelement <2 x float> %0, i32 1
%2 = tail call fast float @llvm.cos.f32(float %vecext.1)
%vecins.1 = insertelement <2 x float> %vecins, float %2, i32 1
ret <2 x float> %vecins.1
}