This adds the Arm Optimized Routines (see https://github.com/ARM-software/optimized-routines) source code under the the LLVM license. The version of the code provided in this patch is v20.02 of the Arm Optimized Routines project. This entire contribution is being committed as is even though it does not currently fit the LLVM libc model and does not follow the LLVM coding style. In the near future, implementations from this patch will be moved over to their right place in the LLVM-libc tree. This will be done over many small patches, all of which will go through the normal LLVM code review process. See this libc-dev post for the plan: http://lists.llvm.org/pipermail/libc-dev/2020-March/000044.html Differential revision of the original upload: https://reviews.llvm.org/D75355
643 lines
11 KiB
C
643 lines
11 KiB
C
/*
|
|
* Vector math abstractions.
|
|
*
|
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
* See https://llvm.org/LICENSE.txt for license information.
|
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
*/
|
|
|
|
#ifndef _V_MATH_H
|
|
#define _V_MATH_H
|
|
|
|
#ifndef WANT_VMATH
|
|
/* Enable the build of vector math code. */
|
|
# define WANT_VMATH 1
|
|
#endif
|
|
#if WANT_VMATH
|
|
|
|
/* The goal of this header is to allow vector and scalar
|
|
build of the same algorithm, the provided intrinsic
|
|
wrappers are also vector length agnostic so they can
|
|
be implemented for SVE too (or other simd architectures)
|
|
and then the code should work on those targets too. */
|
|
|
|
#if SCALAR
|
|
#define V_NAME(x) __s_##x
|
|
#elif VPCS && __aarch64__
|
|
#define V_NAME(x) __vn_##x
|
|
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
|
|
#else
|
|
#define V_NAME(x) __v_##x
|
|
#endif
|
|
|
|
#ifndef VPCS_ATTR
|
|
#define VPCS_ATTR
|
|
#endif
|
|
#ifndef VPCS_ALIAS
|
|
#define VPCS_ALIAS
|
|
#endif
|
|
|
|
#include <stdint.h>
|
|
#include "math_config.h"
|
|
|
|
typedef float f32_t;
|
|
typedef uint32_t u32_t;
|
|
typedef int32_t s32_t;
|
|
typedef double f64_t;
|
|
typedef uint64_t u64_t;
|
|
typedef int64_t s64_t;
|
|
|
|
/* reinterpret as type1 from type2. */
|
|
static inline u32_t
|
|
as_u32_f32 (f32_t x)
|
|
{
|
|
union { f32_t f; u32_t u; } r = {x};
|
|
return r.u;
|
|
}
|
|
static inline f32_t
|
|
as_f32_u32 (u32_t x)
|
|
{
|
|
union { u32_t u; f32_t f; } r = {x};
|
|
return r.f;
|
|
}
|
|
static inline s32_t
|
|
as_s32_u32 (u32_t x)
|
|
{
|
|
union { u32_t u; s32_t i; } r = {x};
|
|
return r.i;
|
|
}
|
|
static inline u32_t
|
|
as_u32_s32 (s32_t x)
|
|
{
|
|
union { s32_t i; u32_t u; } r = {x};
|
|
return r.u;
|
|
}
|
|
static inline u64_t
|
|
as_u64_f64 (f64_t x)
|
|
{
|
|
union { f64_t f; u64_t u; } r = {x};
|
|
return r.u;
|
|
}
|
|
static inline f64_t
|
|
as_f64_u64 (u64_t x)
|
|
{
|
|
union { u64_t u; f64_t f; } r = {x};
|
|
return r.f;
|
|
}
|
|
static inline s64_t
|
|
as_s64_u64 (u64_t x)
|
|
{
|
|
union { u64_t u; s64_t i; } r = {x};
|
|
return r.i;
|
|
}
|
|
static inline u64_t
|
|
as_u64_s64 (s64_t x)
|
|
{
|
|
union { s64_t i; u64_t u; } r = {x};
|
|
return r.u;
|
|
}
|
|
|
|
#if SCALAR
|
|
#define V_SUPPORTED 1
|
|
typedef f32_t v_f32_t;
|
|
typedef u32_t v_u32_t;
|
|
typedef s32_t v_s32_t;
|
|
typedef f64_t v_f64_t;
|
|
typedef u64_t v_u64_t;
|
|
typedef s64_t v_s64_t;
|
|
|
|
static inline int
|
|
v_lanes32 (void)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
static inline v_f32_t
|
|
v_f32 (f32_t x)
|
|
{
|
|
return x;
|
|
}
|
|
static inline v_u32_t
|
|
v_u32 (u32_t x)
|
|
{
|
|
return x;
|
|
}
|
|
static inline v_s32_t
|
|
v_s32 (s32_t x)
|
|
{
|
|
return x;
|
|
}
|
|
|
|
static inline f32_t
|
|
v_get_f32 (v_f32_t x, int i)
|
|
{
|
|
return x;
|
|
}
|
|
static inline u32_t
|
|
v_get_u32 (v_u32_t x, int i)
|
|
{
|
|
return x;
|
|
}
|
|
static inline s32_t
|
|
v_get_s32 (v_s32_t x, int i)
|
|
{
|
|
return x;
|
|
}
|
|
|
|
static inline void
|
|
v_set_f32 (v_f32_t *x, int i, f32_t v)
|
|
{
|
|
*x = v;
|
|
}
|
|
static inline void
|
|
v_set_u32 (v_u32_t *x, int i, u32_t v)
|
|
{
|
|
*x = v;
|
|
}
|
|
static inline void
|
|
v_set_s32 (v_s32_t *x, int i, s32_t v)
|
|
{
|
|
*x = v;
|
|
}
|
|
|
|
/* true if any elements of a v_cond result is non-zero. */
|
|
static inline int
|
|
v_any_u32 (v_u32_t x)
|
|
{
|
|
return x != 0;
|
|
}
|
|
/* to wrap the result of relational operators. */
|
|
static inline v_u32_t
|
|
v_cond_u32 (v_u32_t x)
|
|
{
|
|
return x ? -1 : 0;
|
|
}
|
|
static inline v_f32_t
|
|
v_abs_f32 (v_f32_t x)
|
|
{
|
|
return __builtin_fabsf (x);
|
|
}
|
|
static inline v_f32_t
|
|
v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
|
|
{
|
|
return __builtin_fmaf (x, y, z);
|
|
}
|
|
static inline v_f32_t
|
|
v_round_f32 (v_f32_t x)
|
|
{
|
|
return __builtin_roundf (x);
|
|
}
|
|
static inline v_s32_t
|
|
v_round_s32 (v_f32_t x)
|
|
{
|
|
return __builtin_lroundf (x); /* relies on -fno-math-errno. */
|
|
}
|
|
/* convert to type1 from type2. */
|
|
static inline v_f32_t
|
|
v_to_f32_s32 (v_s32_t x)
|
|
{
|
|
return x;
|
|
}
|
|
static inline v_f32_t
|
|
v_to_f32_u32 (v_u32_t x)
|
|
{
|
|
return x;
|
|
}
|
|
/* reinterpret as type1 from type2. */
|
|
static inline v_u32_t
|
|
v_as_u32_f32 (v_f32_t x)
|
|
{
|
|
union { v_f32_t f; v_u32_t u; } r = {x};
|
|
return r.u;
|
|
}
|
|
static inline v_f32_t
|
|
v_as_f32_u32 (v_u32_t x)
|
|
{
|
|
union { v_u32_t u; v_f32_t f; } r = {x};
|
|
return r.f;
|
|
}
|
|
static inline v_s32_t
|
|
v_as_s32_u32 (v_u32_t x)
|
|
{
|
|
union { v_u32_t u; v_s32_t i; } r = {x};
|
|
return r.i;
|
|
}
|
|
static inline v_u32_t
|
|
v_as_u32_s32 (v_s32_t x)
|
|
{
|
|
union { v_s32_t i; v_u32_t u; } r = {x};
|
|
return r.u;
|
|
}
|
|
static inline v_f32_t
|
|
v_lookup_f32 (const f32_t *tab, v_u32_t idx)
|
|
{
|
|
return tab[idx];
|
|
}
|
|
static inline v_u32_t
|
|
v_lookup_u32 (const u32_t *tab, v_u32_t idx)
|
|
{
|
|
return tab[idx];
|
|
}
|
|
static inline v_f32_t
|
|
v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
|
|
{
|
|
return f (x);
|
|
}
|
|
static inline v_f32_t
|
|
v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
|
|
v_u32_t p)
|
|
{
|
|
return f (x1, x2);
|
|
}
|
|
|
|
static inline int
|
|
v_lanes64 (void)
|
|
{
|
|
return 1;
|
|
}
|
|
static inline v_f64_t
|
|
v_f64 (f64_t x)
|
|
{
|
|
return x;
|
|
}
|
|
static inline v_u64_t
|
|
v_u64 (u64_t x)
|
|
{
|
|
return x;
|
|
}
|
|
static inline v_s64_t
|
|
v_s64 (s64_t x)
|
|
{
|
|
return x;
|
|
}
|
|
static inline f64_t
|
|
v_get_f64 (v_f64_t x, int i)
|
|
{
|
|
return x;
|
|
}
|
|
static inline void
|
|
v_set_f64 (v_f64_t *x, int i, f64_t v)
|
|
{
|
|
*x = v;
|
|
}
|
|
/* true if any elements of a v_cond result is non-zero. */
|
|
static inline int
|
|
v_any_u64 (v_u64_t x)
|
|
{
|
|
return x != 0;
|
|
}
|
|
/* to wrap the result of relational operators. */
|
|
static inline v_u64_t
|
|
v_cond_u64 (v_u64_t x)
|
|
{
|
|
return x ? -1 : 0;
|
|
}
|
|
static inline v_f64_t
|
|
v_abs_f64 (v_f64_t x)
|
|
{
|
|
return __builtin_fabs (x);
|
|
}
|
|
static inline v_f64_t
|
|
v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
|
|
{
|
|
return __builtin_fma (x, y, z);
|
|
}
|
|
static inline v_f64_t
|
|
v_round_f64 (v_f64_t x)
|
|
{
|
|
return __builtin_round (x);
|
|
}
|
|
static inline v_s64_t
|
|
v_round_s64 (v_f64_t x)
|
|
{
|
|
return __builtin_lround (x); /* relies on -fno-math-errno. */
|
|
}
|
|
/* convert to type1 from type2. */
|
|
static inline v_f64_t
|
|
v_to_f64_s64 (v_s64_t x)
|
|
{
|
|
return x;
|
|
}
|
|
static inline v_f64_t
|
|
v_to_f64_u64 (v_u64_t x)
|
|
{
|
|
return x;
|
|
}
|
|
/* reinterpret as type1 from type2. */
|
|
static inline v_u64_t
|
|
v_as_u64_f64 (v_f64_t x)
|
|
{
|
|
union { v_f64_t f; v_u64_t u; } r = {x};
|
|
return r.u;
|
|
}
|
|
static inline v_f64_t
|
|
v_as_f64_u64 (v_u64_t x)
|
|
{
|
|
union { v_u64_t u; v_f64_t f; } r = {x};
|
|
return r.f;
|
|
}
|
|
static inline v_s64_t
|
|
v_as_s64_u64 (v_u64_t x)
|
|
{
|
|
union { v_u64_t u; v_s64_t i; } r = {x};
|
|
return r.i;
|
|
}
|
|
static inline v_u64_t
|
|
v_as_u64_s64 (v_s64_t x)
|
|
{
|
|
union { v_s64_t i; v_u64_t u; } r = {x};
|
|
return r.u;
|
|
}
|
|
static inline v_f64_t
|
|
v_lookup_f64 (const f64_t *tab, v_u64_t idx)
|
|
{
|
|
return tab[idx];
|
|
}
|
|
static inline v_u64_t
|
|
v_lookup_u64 (const u64_t *tab, v_u64_t idx)
|
|
{
|
|
return tab[idx];
|
|
}
|
|
static inline v_f64_t
|
|
v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
|
|
{
|
|
return f (x);
|
|
}
|
|
|
|
#elif __aarch64__
|
|
#define V_SUPPORTED 1
|
|
#include <arm_neon.h>
|
|
typedef float32x4_t v_f32_t;
|
|
typedef uint32x4_t v_u32_t;
|
|
typedef int32x4_t v_s32_t;
|
|
typedef float64x2_t v_f64_t;
|
|
typedef uint64x2_t v_u64_t;
|
|
typedef int64x2_t v_s64_t;
|
|
|
|
static inline int
|
|
v_lanes32 (void)
|
|
{
|
|
return 4;
|
|
}
|
|
|
|
static inline v_f32_t
|
|
v_f32 (f32_t x)
|
|
{
|
|
return (v_f32_t){x, x, x, x};
|
|
}
|
|
static inline v_u32_t
|
|
v_u32 (u32_t x)
|
|
{
|
|
return (v_u32_t){x, x, x, x};
|
|
}
|
|
static inline v_s32_t
|
|
v_s32 (s32_t x)
|
|
{
|
|
return (v_s32_t){x, x, x, x};
|
|
}
|
|
|
|
static inline f32_t
|
|
v_get_f32 (v_f32_t x, int i)
|
|
{
|
|
return x[i];
|
|
}
|
|
static inline u32_t
|
|
v_get_u32 (v_u32_t x, int i)
|
|
{
|
|
return x[i];
|
|
}
|
|
static inline s32_t
|
|
v_get_s32 (v_s32_t x, int i)
|
|
{
|
|
return x[i];
|
|
}
|
|
|
|
static inline void
|
|
v_set_f32 (v_f32_t *x, int i, f32_t v)
|
|
{
|
|
(*x)[i] = v;
|
|
}
|
|
static inline void
|
|
v_set_u32 (v_u32_t *x, int i, u32_t v)
|
|
{
|
|
(*x)[i] = v;
|
|
}
|
|
static inline void
|
|
v_set_s32 (v_s32_t *x, int i, s32_t v)
|
|
{
|
|
(*x)[i] = v;
|
|
}
|
|
|
|
/* true if any elements of a v_cond result is non-zero. */
|
|
static inline int
|
|
v_any_u32 (v_u32_t x)
|
|
{
|
|
/* assume elements in x are either 0 or -1u. */
|
|
return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
|
|
}
|
|
/* to wrap the result of relational operators. */
|
|
static inline v_u32_t
|
|
v_cond_u32 (v_u32_t x)
|
|
{
|
|
return x;
|
|
}
|
|
static inline v_f32_t
|
|
v_abs_f32 (v_f32_t x)
|
|
{
|
|
return vabsq_f32 (x);
|
|
}
|
|
static inline v_f32_t
|
|
v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
|
|
{
|
|
return vfmaq_f32 (z, x, y);
|
|
}
|
|
static inline v_f32_t
|
|
v_round_f32 (v_f32_t x)
|
|
{
|
|
return vrndaq_f32 (x);
|
|
}
|
|
static inline v_s32_t
|
|
v_round_s32 (v_f32_t x)
|
|
{
|
|
return vcvtaq_s32_f32 (x);
|
|
}
|
|
/* convert to type1 from type2. */
|
|
static inline v_f32_t
|
|
v_to_f32_s32 (v_s32_t x)
|
|
{
|
|
return (v_f32_t){x[0], x[1], x[2], x[3]};
|
|
}
|
|
static inline v_f32_t
|
|
v_to_f32_u32 (v_u32_t x)
|
|
{
|
|
return (v_f32_t){x[0], x[1], x[2], x[3]};
|
|
}
|
|
/* reinterpret as type1 from type2. */
|
|
static inline v_u32_t
|
|
v_as_u32_f32 (v_f32_t x)
|
|
{
|
|
union { v_f32_t f; v_u32_t u; } r = {x};
|
|
return r.u;
|
|
}
|
|
static inline v_f32_t
|
|
v_as_f32_u32 (v_u32_t x)
|
|
{
|
|
union { v_u32_t u; v_f32_t f; } r = {x};
|
|
return r.f;
|
|
}
|
|
static inline v_s32_t
|
|
v_as_s32_u32 (v_u32_t x)
|
|
{
|
|
union { v_u32_t u; v_s32_t i; } r = {x};
|
|
return r.i;
|
|
}
|
|
static inline v_u32_t
|
|
v_as_u32_s32 (v_s32_t x)
|
|
{
|
|
union { v_s32_t i; v_u32_t u; } r = {x};
|
|
return r.u;
|
|
}
|
|
static inline v_f32_t
|
|
v_lookup_f32 (const f32_t *tab, v_u32_t idx)
|
|
{
|
|
return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
|
|
}
|
|
static inline v_u32_t
|
|
v_lookup_u32 (const u32_t *tab, v_u32_t idx)
|
|
{
|
|
return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
|
|
}
|
|
static inline v_f32_t
|
|
v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
|
|
{
|
|
return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
|
|
p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
|
|
}
|
|
static inline v_f32_t
|
|
v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
|
|
v_u32_t p)
|
|
{
|
|
return (
|
|
v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1],
|
|
p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
|
|
}
|
|
|
|
static inline int
|
|
v_lanes64 (void)
|
|
{
|
|
return 2;
|
|
}
|
|
static inline v_f64_t
|
|
v_f64 (f64_t x)
|
|
{
|
|
return (v_f64_t){x, x};
|
|
}
|
|
static inline v_u64_t
|
|
v_u64 (u64_t x)
|
|
{
|
|
return (v_u64_t){x, x};
|
|
}
|
|
static inline v_s64_t
|
|
v_s64 (s64_t x)
|
|
{
|
|
return (v_s64_t){x, x};
|
|
}
|
|
static inline f64_t
|
|
v_get_f64 (v_f64_t x, int i)
|
|
{
|
|
return x[i];
|
|
}
|
|
static inline void
|
|
v_set_f64 (v_f64_t *x, int i, f64_t v)
|
|
{
|
|
(*x)[i] = v;
|
|
}
|
|
/* true if any elements of a v_cond result is non-zero. */
|
|
static inline int
|
|
v_any_u64 (v_u64_t x)
|
|
{
|
|
/* assume elements in x are either 0 or -1u. */
|
|
return vpaddd_u64 (x) != 0;
|
|
}
|
|
/* to wrap the result of relational operators. */
|
|
static inline v_u64_t
|
|
v_cond_u64 (v_u64_t x)
|
|
{
|
|
return x;
|
|
}
|
|
static inline v_f64_t
|
|
v_abs_f64 (v_f64_t x)
|
|
{
|
|
return vabsq_f64 (x);
|
|
}
|
|
static inline v_f64_t
|
|
v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
|
|
{
|
|
return vfmaq_f64 (z, x, y);
|
|
}
|
|
static inline v_f64_t
|
|
v_round_f64 (v_f64_t x)
|
|
{
|
|
return vrndaq_f64 (x);
|
|
}
|
|
static inline v_s64_t
|
|
v_round_s64 (v_f64_t x)
|
|
{
|
|
return vcvtaq_s64_f64 (x);
|
|
}
|
|
/* convert to type1 from type2. */
|
|
static inline v_f64_t
|
|
v_to_f64_s64 (v_s64_t x)
|
|
{
|
|
return (v_f64_t){x[0], x[1]};
|
|
}
|
|
static inline v_f64_t
|
|
v_to_f64_u64 (v_u64_t x)
|
|
{
|
|
return (v_f64_t){x[0], x[1]};
|
|
}
|
|
/* reinterpret as type1 from type2. */
|
|
static inline v_u64_t
|
|
v_as_u64_f64 (v_f64_t x)
|
|
{
|
|
union { v_f64_t f; v_u64_t u; } r = {x};
|
|
return r.u;
|
|
}
|
|
static inline v_f64_t
|
|
v_as_f64_u64 (v_u64_t x)
|
|
{
|
|
union { v_u64_t u; v_f64_t f; } r = {x};
|
|
return r.f;
|
|
}
|
|
static inline v_s64_t
|
|
v_as_s64_u64 (v_u64_t x)
|
|
{
|
|
union { v_u64_t u; v_s64_t i; } r = {x};
|
|
return r.i;
|
|
}
|
|
static inline v_u64_t
|
|
v_as_u64_s64 (v_s64_t x)
|
|
{
|
|
union { v_s64_t i; v_u64_t u; } r = {x};
|
|
return r.u;
|
|
}
|
|
static inline v_f64_t
|
|
v_lookup_f64 (const f64_t *tab, v_u64_t idx)
|
|
{
|
|
return (v_f64_t){tab[idx[0]], tab[idx[1]]};
|
|
}
|
|
static inline v_u64_t
|
|
v_lookup_u64 (const u64_t *tab, v_u64_t idx)
|
|
{
|
|
return (v_u64_t){tab[idx[0]], tab[idx[1]]};
|
|
}
|
|
static inline v_f64_t
|
|
v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
|
|
{
|
|
return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
|
|
}
|
|
#endif
|
|
|
|
#endif
|
|
#endif
|