[libclc][NFC] Clang-format vload/vstore code
@@ -6,23 +6,24 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
-  _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##SUFFIX##WIDTH(size_t offset, const ADDR_SPACE MEM_TYPE *x);
+#define _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE)        \
+  _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##SUFFIX##WIDTH(                      \
+      size_t offset, const ADDR_SPACE MEM_TYPE *x);
 
-#define _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE) \
-  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
-  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
-  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
-  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
+#define _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE)       \
+  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE)              \
+  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE)              \
+  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE)              \
+  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE)              \
   _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
 
-#define _CLC_VECTOR_VLOAD_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE) \
-  _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private) \
-  _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local) \
-  _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __constant) \
+#define _CLC_VECTOR_VLOAD_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE)                  \
+  _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private)              \
+  _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local)                \
+  _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __constant)             \
   _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global)
 
-#define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE) \
+#define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE)                                    \
   _CLC_VECTOR_VLOAD_PRIM3(, PRIM_TYPE, PRIM_TYPE)
 
 // Declare vector load prototypes
@@ -40,12 +41,12 @@ _CLC_VECTOR_VLOAD_PRIM3(_half, half, float)
 _CLC_VECTOR_VLOAD_PRIM3(a_half, half, float)
 
 #ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64: enable
-_CLC_VECTOR_VLOAD_PRIM1(double)
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+_CLC_VECTOR_VLOAD_PRIM1(double)
 #endif
 #ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16: enable
-_CLC_VECTOR_VLOAD_PRIM1(half)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+_CLC_VECTOR_VLOAD_PRIM1(half)
 #endif
 
 // Scalar vload_half also needs to be declared
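Note: these macros only emit declarations; expanding one instantiation by hand, e.g. _CLC_VLOAD_DECL(, int, int4, 4, __global) as _CLC_VECTOR_VLOAD_DECL does for each width, gives roughly:

  /* Hand expansion of one _CLC_VLOAD_DECL instantiation (illustrative). */
  _CLC_OVERLOAD _CLC_DECL int4 vload4(size_t offset, const __global int *x);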
@@ -6,33 +6,34 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define _CLC_VSTORE_DECL(SUFFIX, PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE, RND) \
-  _CLC_OVERLOAD _CLC_DECL void vstore##SUFFIX##WIDTH##RND(VEC_TYPE vec, size_t offset, ADDR_SPACE PRIM_TYPE *out);
+#define _CLC_VSTORE_DECL(SUFFIX, PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE, RND) \
+  _CLC_OVERLOAD _CLC_DECL void vstore##SUFFIX##WIDTH##RND(                    \
+      VEC_TYPE vec, size_t offset, ADDR_SPACE PRIM_TYPE *out);
 
-#define _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE, RND) \
-  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE, RND) \
-  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE, RND) \
-  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE, RND) \
-  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE, RND) \
+#define _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE, RND) \
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE, RND)        \
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE, RND)        \
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE, RND)        \
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE, RND)        \
   _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE, RND)
 
-#define _CLC_VECTOR_VSTORE_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE, RND) \
-  _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private, RND) \
-  _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local, RND) \
+#define _CLC_VECTOR_VSTORE_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE, RND)            \
+  _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private, RND)        \
+  _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local, RND)          \
   _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global, RND)
 
-#define _CLC_VECTOR_VSTORE_PRIM1(PRIM_TYPE) \
-  _CLC_VECTOR_VSTORE_PRIM3(,PRIM_TYPE, PRIM_TYPE, )
+#define _CLC_VECTOR_VSTORE_PRIM1(PRIM_TYPE)                                   \
+  _CLC_VECTOR_VSTORE_PRIM3(, PRIM_TYPE, PRIM_TYPE, )
 
-#define _CLC_VECTOR_VSTORE_HALF_PRIM1(PRIM_TYPE, RND) \
-  _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __private, RND) \
-  _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __local, RND) \
-  _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __global, RND) \
-  _CLC_VECTOR_VSTORE_PRIM3(_half, half, PRIM_TYPE, RND) \
-  _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __private, RND) \
-  _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __local, RND) \
-  _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __global, RND) \
-  _CLC_VECTOR_VSTORE_PRIM3(a_half, half, PRIM_TYPE, RND)
+#define _CLC_VECTOR_VSTORE_HALF_PRIM1(PRIM_TYPE, RND)                         \
+  _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __private, RND)                  \
+  _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __local, RND)                    \
+  _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __global, RND)                   \
+  _CLC_VECTOR_VSTORE_PRIM3(_half, half, PRIM_TYPE, RND)                       \
+  _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __private, RND)                 \
+  _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __local, RND)                   \
+  _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __global, RND)                  \
+  _CLC_VECTOR_VSTORE_PRIM3(a_half, half, PRIM_TYPE, RND)
 
 _CLC_VECTOR_VSTORE_PRIM1(char)
 _CLC_VECTOR_VSTORE_PRIM1(uchar)
@@ -44,26 +45,25 @@ _CLC_VECTOR_VSTORE_PRIM1(long)
 _CLC_VECTOR_VSTORE_PRIM1(ulong)
 _CLC_VECTOR_VSTORE_PRIM1(float)
 
-_CLC_VECTOR_VSTORE_HALF_PRIM1(float,)
+_CLC_VECTOR_VSTORE_HALF_PRIM1(float, )
 _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtz)
 _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtn)
 _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtp)
 _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rte)
 
 #ifdef cl_khr_fp64
-_CLC_VECTOR_VSTORE_PRIM1(double)
-_CLC_VECTOR_VSTORE_HALF_PRIM1(double,)
-_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtz)
-_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtn)
-_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtp)
-_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rte)
+_CLC_VECTOR_VSTORE_PRIM1(double)
+_CLC_VECTOR_VSTORE_HALF_PRIM1(double, )
+_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtz)
+_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtn)
+_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtp)
+_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rte)
 #endif
 
 #ifdef cl_khr_fp16
-_CLC_VECTOR_VSTORE_PRIM1(half)
+_CLC_VECTOR_VSTORE_PRIM1(half)
 #endif
 
-
 #undef _CLC_VSTORE_DECL
 #undef _CLC_VECTOR_VSTORE_DECL
 #undef _CLC_VECTOR_VSTORE_PRIM3
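Note: the RND parameter is token-pasted onto the function name, so expanding one instantiation such as _CLC_VSTORE_DECL(_half, half, float4, 4, __global, _rtz) by hand declares roughly:

  /* Hand expansion of one _CLC_VSTORE_DECL instantiation (illustrative). */
  _CLC_OVERLOAD _CLC_DECL void vstore_half4_rtz(float4 vec, size_t offset,
                                                __global half *out);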
@@ -8,59 +8,75 @@
 
 #include <clc/clc.h>
 
-#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[2*offset])); \
-  } \
-\
-  typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    PRIM_TYPE##2 vec = *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[3*offset])); \
-    return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset*3+2]); \
-  } \
-\
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&x[4*offset])); \
-  } \
-\
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&x[8*offset])); \
-  } \
-\
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&x[16*offset])); \
-  } \
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE)                                \
+  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset,                   \
+                                             const ADDR_SPACE PRIM_TYPE *x) { \
+    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2        \
+                  *)(&x[2 * offset]));                                        \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset,                   \
+                                             const ADDR_SPACE PRIM_TYPE *x) { \
+    PRIM_TYPE##2 vec =                                                        \
+        *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2          \
+               *)(&x[3 * offset]));                                           \
+    return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset * 3 + 2]);                 \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset,                   \
+                                             const ADDR_SPACE PRIM_TYPE *x) { \
+    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4        \
+                  *)(&x[4 * offset]));                                        \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset,                   \
+                                             const ADDR_SPACE PRIM_TYPE *x) { \
+    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8        \
+                  *)(&x[8 * offset]));                                        \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16              \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(                               \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
+    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16       \
+                  *)(&x[16 * offset]));                                       \
+  }
 
-#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \
+#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE)                               \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private)                            \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local)                              \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant)                           \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global)
 
-#define VLOAD_TYPES() \
-  VLOAD_ADDR_SPACES(char) \
-  VLOAD_ADDR_SPACES(uchar) \
-  VLOAD_ADDR_SPACES(short) \
-  VLOAD_ADDR_SPACES(ushort) \
-  VLOAD_ADDR_SPACES(int) \
-  VLOAD_ADDR_SPACES(uint) \
-  VLOAD_ADDR_SPACES(long) \
-  VLOAD_ADDR_SPACES(ulong) \
-  VLOAD_ADDR_SPACES(float) \
+#define VLOAD_TYPES()                                                         \
+  VLOAD_ADDR_SPACES(char)                                                     \
+  VLOAD_ADDR_SPACES(uchar)                                                    \
+  VLOAD_ADDR_SPACES(short)                                                    \
+  VLOAD_ADDR_SPACES(ushort)                                                   \
+  VLOAD_ADDR_SPACES(int)                                                      \
+  VLOAD_ADDR_SPACES(uint)                                                     \
+  VLOAD_ADDR_SPACES(long)                                                     \
+  VLOAD_ADDR_SPACES(ulong)                                                    \
+  VLOAD_ADDR_SPACES(float)
 
 VLOAD_TYPES()
 
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
-VLOAD_ADDR_SPACES(double)
+VLOAD_ADDR_SPACES(double)
 #endif
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-VLOAD_ADDR_SPACES(half)
+VLOAD_ADDR_SPACES(half)
 #endif
 
 /* vload_half are legal even without cl_khr_fp16 */
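Note: the less_aligned typedefs are the load-bearing trick here. An OpenCL vector type is naturally aligned to its full size, but vloadN must accept pointers aligned only to the scalar element, so each typedef re-declares the vector type with aligned(sizeof(PRIM_TYPE)) before dereferencing. A minimal standalone sketch of the same technique, with a hypothetical helper name and only the float4 case:

  typedef float4 less_aligned_float4 __attribute__((aligned(sizeof(float))));

  float4 load4_unaligned(size_t offset, const __global float *x) {
    /* Legal even when x is only aligned to sizeof(float). */
    return *((const __global less_aligned_float4 *)(&x[4 * offset]));
  }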
@@ -71,43 +87,45 @@ float __clc_vload_half_float_helper__global(const __global half *);
 float __clc_vload_half_float_helper__local(const __local half *);
 float __clc_vload_half_float_helper__private(const __private half *);
 
-#define VEC_LOAD1(val, AS) val = __clc_vload_half_float_helper##AS (&mem[offset++]);
+#define VEC_LOAD1(val, AS)                                                    \
+  val = __clc_vload_half_float_helper##AS(&mem[offset++]);
 #else
 #define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
 #endif
 
-#define VEC_LOAD2(val, AS) \
-  VEC_LOAD1(val.lo, AS) \
-  VEC_LOAD1(val.hi, AS)
-#define VEC_LOAD3(val, AS) \
-  VEC_LOAD1(val.s0, AS) \
-  VEC_LOAD1(val.s1, AS) \
-  VEC_LOAD1(val.s2, AS)
-#define VEC_LOAD4(val, AS) \
-  VEC_LOAD2(val.lo, AS) \
-  VEC_LOAD2(val.hi, AS)
-#define VEC_LOAD8(val, AS) \
-  VEC_LOAD4(val.lo, AS) \
-  VEC_LOAD4(val.hi, AS)
-#define VEC_LOAD16(val, AS) \
-  VEC_LOAD8(val.lo, AS) \
-  VEC_LOAD8(val.hi, AS)
+#define VEC_LOAD2(val, AS)                                                    \
+  VEC_LOAD1(val.lo, AS)                                                       \
+  VEC_LOAD1(val.hi, AS)
+#define VEC_LOAD3(val, AS)                                                    \
+  VEC_LOAD1(val.s0, AS)                                                       \
+  VEC_LOAD1(val.s1, AS)                                                       \
+  VEC_LOAD1(val.s2, AS)
+#define VEC_LOAD4(val, AS)                                                    \
+  VEC_LOAD2(val.lo, AS)                                                       \
+  VEC_LOAD2(val.hi, AS)
+#define VEC_LOAD8(val, AS)                                                    \
+  VEC_LOAD4(val.lo, AS)                                                       \
+  VEC_LOAD4(val.hi, AS)
+#define VEC_LOAD16(val, AS)                                                   \
+  VEC_LOAD8(val.lo, AS)                                                       \
+  VEC_LOAD8(val.hi, AS)
 
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
-  _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, const AS half *mem) { \
-    offset *= VEC_SIZE; \
-    TYPE __tmp; \
-    VEC_LOAD##VEC_SIZE(__tmp, AS) \
-    return __tmp; \
-  } \
-  _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset, const AS half *mem) { \
-    offset *= OFFSET_SIZE; \
-    TYPE __tmp; \
-    VEC_LOAD##VEC_SIZE(__tmp, AS) \
-    return __tmp; \
+#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)                       \
+  _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset,               \
+                                                 const AS half *mem) {        \
+    offset *= VEC_SIZE;                                                       \
+    TYPE __tmp;                                                               \
+    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp;                               \
+  }                                                                           \
+  _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset,              \
+                                                  const AS half *mem) {       \
+    offset *= OFFSET_SIZE;                                                    \
+    TYPE __tmp;                                                               \
+    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp;                               \
   }
 
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
+#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)                         \
+  __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
 
 #define __CLC_BODY "vload_half.inc"
 #include <clc/math/gentype.inc>
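Note: the VEC_LOADn macros recurse on the vector's .lo/.hi halves down to single-element loads, so in the __builtin_load_halff branch VEC_LOAD4(__tmp, AS) expands by hand to roughly:

  /* Hand expansion of VEC_LOAD4 (illustrative); AS is unused in this branch. */
  __tmp.lo.lo = __builtin_load_halff(&mem[offset++]);
  __tmp.lo.hi = __builtin_load_halff(&mem[offset++]);
  __tmp.hi.lo = __builtin_load_halff(&mem[offset++]);
  __tmp.hi.hi = __builtin_load_halff(&mem[offset++]);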
@@ -11,21 +11,21 @@
 #ifndef __CLC_SCALAR
 
 #if __CLC_VECSIZE == 3
-# define __CLC_OFFSET 4
+#define __CLC_OFFSET 4
 #else
-# define __CLC_OFFSET __CLC_VECSIZE
+#define __CLC_OFFSET __CLC_VECSIZE
 #endif
 
-FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __private);
-FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __local);
-FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __global);
-FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __constant);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __private);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __local);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __global);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __constant);
 
 #undef __CLC_OFFSET
 #else
-FUNC(, 1, 1, __CLC_GENTYPE, __private);
-FUNC(, 1, 1, __CLC_GENTYPE, __local);
-FUNC(, 1, 1, __CLC_GENTYPE, __global);
-FUNC(, 1, 1, __CLC_GENTYPE, __constant);
+FUNC(, 1, 1, __CLC_GENTYPE, __private);
+FUNC(, 1, 1, __CLC_GENTYPE, __local);
+FUNC(, 1, 1, __CLC_GENTYPE, __global);
+FUNC(, 1, 1, __CLC_GENTYPE, __constant);
 #endif
 #endif
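Note: the __CLC_OFFSET distinction only matters for 3-element vectors: vloada_half3 indexes an array of aligned 4-element slots (stride 4), while vload_half3 uses a packed stride of 3. A small usage sketch, as a hypothetical kernel that is not part of this commit:

  __kernel void read3(__global const half *in, __global float4 *out) {
    /* vload_half3: packed layout, element i starts at in[3 * i]. */
    float3 p = vload_half3(0, in);
    /* vloada_half3: aligned layout, element i starts at in[4 * i]. */
    float3 a = vloada_half3(0, in);
    out[0] = (float4)(p, a.s0 + a.s1 + a.s2);
  }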
@@ -10,36 +10,50 @@
 
 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
 
-#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[2*offset])) = vec; \
-  } \
-\
-  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[3*offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1); \
-    mem[3 * offset + 2] = vec.s2;\
-  } \
-\
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&mem[4*offset])) = vec; \
-  } \
-\
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&mem[8*offset])) = vec; \
-  } \
-\
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&mem[16*offset])) = vec; \
-  } \
+#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE)                               \
+  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset,        \
+                                      ADDR_SPACE PRIM_TYPE *mem) {            \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2                     \
+           *)(&mem[2 * offset])) = vec;                                       \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset,        \
+                                      ADDR_SPACE PRIM_TYPE *mem) {            \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2                     \
+           *)(&mem[3 * offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1);            \
+    mem[3 * offset + 2] = vec.s2;                                             \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset,        \
+                                      ADDR_SPACE PRIM_TYPE *mem) {            \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4                     \
+           *)(&mem[4 * offset])) = vec;                                       \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset,        \
+                                      ADDR_SPACE PRIM_TYPE *mem) {            \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8                     \
+           *)(&mem[8 * offset])) = vec;                                       \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16              \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset,      \
+                                       ADDR_SPACE PRIM_TYPE *mem) {           \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16                    \
+           *)(&mem[16 * offset])) = vec;                                      \
+  }
 
-#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \
+#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE)                        \
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private)                     \
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local)                       \
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global)
 
 VSTORE_ADDR_SPACES(char)
 VSTORE_ADDR_SPACES(uchar)
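Note: vstore3 deliberately reuses the 2-element typedef; writing a full 3-vector would touch a fourth element, so the macro stores s0/s1 as one 2-element chunk and s2 as a scalar. For float in the __global space the generated overload is roughly:

  typedef float2 less_aligned___globalfloat2
      __attribute__((aligned(sizeof(float))));
  _CLC_OVERLOAD _CLC_DEF void vstore3(float3 vec, size_t offset,
                                      __global float *mem) {
    *((__global less_aligned___globalfloat2 *)(&mem[3 * offset])) =
        (float2)(vec.s0, vec.s1);
    mem[3 * offset + 2] = vec.s2; /* third element stored on its own */
  }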
@@ -51,26 +65,25 @@ VSTORE_ADDR_SPACES(long)
 VSTORE_ADDR_SPACES(ulong)
 VSTORE_ADDR_SPACES(float)
 
-
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
-VSTORE_ADDR_SPACES(double)
+VSTORE_ADDR_SPACES(double)
 #endif
 
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-VSTORE_ADDR_SPACES(half)
+VSTORE_ADDR_SPACES(half)
 #endif
 
 /* vstore_half are legal even without cl_khr_fp16 */
 #if __clang_major__ < 6
-#define DECLARE_HELPER(STYPE, AS, builtin) void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *);
+#define DECLARE_HELPER(STYPE, AS, builtin)                                    \
+  void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *);
#else
-#define DECLARE_HELPER(STYPE, AS, __builtin) \
-  _CLC_DEF void __clc_vstore_half_##STYPE##_helper##AS(STYPE s, AS half *d) \
-  { \
-    __builtin(s, d); \
-  }
+#define DECLARE_HELPER(STYPE, AS, __builtin)                                  \
+  _CLC_DEF void __clc_vstore_half_##STYPE##_helper##AS(STYPE s, AS half *d) { \
+    __builtin(s, d);                                                          \
+  }
 #endif
 
 DECLARE_HELPER(float, __private, __builtin_store_halff);
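Note: on clang 6 and newer, DECLARE_HELPER defines the helper inline in terms of the given builtin; e.g. DECLARE_HELPER(float, __private, __builtin_store_halff) expands to roughly:

  /* Hand expansion of one DECLARE_HELPER instantiation (illustrative). */
  _CLC_DEF void __clc_vstore_half_float_helper__private(float s,
                                                        __private half *d) {
    __builtin_store_halff(s, d);
  }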
@@ -83,176 +96,165 @@ DECLARE_HELPER(double, __global, __builtin_store_half);
 DECLARE_HELPER(double, __local, __builtin_store_half);
 #endif
 
-#define VEC_STORE1(STYPE, AS, val, ROUNDF) __clc_vstore_half_##STYPE##_helper##AS (ROUNDF(val), &mem[offset++]);
+#define VEC_STORE1(STYPE, AS, val, ROUNDF)                                    \
+  __clc_vstore_half_##STYPE##_helper##AS(ROUNDF(val), &mem[offset++]);
 
-#define VEC_STORE2(STYPE, AS, val, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.lo, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.hi, ROUNDF)
-#define VEC_STORE3(STYPE, AS, val, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.s0, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.s1, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.s2, ROUNDF)
-#define VEC_STORE4(STYPE, AS, val, ROUNDF) \
-  VEC_STORE2(STYPE, AS, val.lo, ROUNDF) \
-  VEC_STORE2(STYPE, AS, val.hi, ROUNDF)
-#define VEC_STORE8(STYPE, AS, val, ROUNDF) \
-  VEC_STORE4(STYPE, AS, val.lo, ROUNDF) \
-  VEC_STORE4(STYPE, AS, val.hi, ROUNDF)
-#define VEC_STORE16(STYPE, AS, val, ROUNDF) \
-  VEC_STORE8(STYPE, AS, val.lo, ROUNDF) \
-  VEC_STORE8(STYPE, AS, val.hi, ROUNDF)
+#define VEC_STORE2(STYPE, AS, val, ROUNDF)                                    \
+  VEC_STORE1(STYPE, AS, val.lo, ROUNDF)                                       \
+  VEC_STORE1(STYPE, AS, val.hi, ROUNDF)
+#define VEC_STORE3(STYPE, AS, val, ROUNDF)                                    \
+  VEC_STORE1(STYPE, AS, val.s0, ROUNDF)                                       \
+  VEC_STORE1(STYPE, AS, val.s1, ROUNDF)                                       \
+  VEC_STORE1(STYPE, AS, val.s2, ROUNDF)
+#define VEC_STORE4(STYPE, AS, val, ROUNDF)                                    \
+  VEC_STORE2(STYPE, AS, val.lo, ROUNDF)                                       \
+  VEC_STORE2(STYPE, AS, val.hi, ROUNDF)
+#define VEC_STORE8(STYPE, AS, val, ROUNDF)                                    \
+  VEC_STORE4(STYPE, AS, val.lo, ROUNDF)                                       \
+  VEC_STORE4(STYPE, AS, val.hi, ROUNDF)
+#define VEC_STORE16(STYPE, AS, val, ROUNDF)                                   \
+  VEC_STORE8(STYPE, AS, val.lo, ROUNDF)                                       \
+  VEC_STORE8(STYPE, AS, val.hi, ROUNDF)
 
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, ROUNDF) \
-  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, AS half *mem) { \
-    offset *= VEC_SIZE; \
-    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \
-  } \
-  _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset, AS half *mem) { \
-    offset *= OFFSET; \
-    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \
+#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, ROUNDF)             \
+  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset,    \
+                                                  AS half *mem) {             \
+    offset *= VEC_SIZE;                                                       \
+    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF)                               \
+  }                                                                           \
+  _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset,   \
+                                                   AS half *mem) {            \
+    offset *= OFFSET;                                                         \
+    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF)                               \
   }
 
-_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x)
-{
-  return x;
+_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) { return x; }
+_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
+  /* Remove lower 13 bits to make sure the number is rounded down */
+  int mask = 0xffffe000;
+  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
+  /* Denormals cannot be flushed, and they use different bit for rounding */
+  if (exp < -14)
+    mask <<= min(-(exp + 14), 10);
+  /* RTZ does not produce Inf for large numbers */
+  if (fabs(x) > 65504.0f && !isinf(x))
+    return copysign(65504.0f, x);
+  /* Handle nan corner case */
+  if (isnan(x))
+    return x;
+  return as_float(as_uint(x) & mask);
 }
-_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x)
-{
-  /* Remove lower 13 bits to make sure the number is rounded down */
-  int mask = 0xffffe000;
-  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask <<= min(-(exp + 14), 10);
-  /* RTZ does not produce Inf for large numbers */
-  if (fabs(x) > 65504.0f && !isinf(x))
-    return copysign(65504.0f, x);
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  return as_float(as_uint(x) & mask);
+_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
+  const float inf = copysign(INFINITY, x);
+  /* Set lower 13 bits */
+  int mask = (1 << 13) - 1;
+  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
+  /* Denormals cannot be flushed, and they use different bit for rounding */
+  if (exp < -14)
+    mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
+  /* Handle nan corner case */
+  if (isnan(x))
+    return x;
+  const float next = nextafter(as_float(as_uint(x) | mask), inf);
+  return ((as_uint(x) & mask) == 0) ? x : next;
 }
-_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x)
-{
-  const float inf = copysign(INFINITY, x);
-  /* Set lower 13 bits */
-  int mask = (1 << 13) - 1;
-  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  const float next = nextafter(as_float(as_uint(x) | mask), inf);
-  return ((as_uint(x) & mask) == 0) ? x : next;
+_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
+  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
 }
-_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x)
-{
-  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
+_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
+  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
 }
-_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x)
-{
-  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x)
-{
-  /* Mantisa + implicit bit */
-  const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
-  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-  int shift = 13;
-  if (exp < -14) {
-    /* The default assumes lower 13 bits are rounded,
-     * but it might be more for denormals.
-     * Shifting beyond last == 0b, and qr == 00b is not necessary */
-    shift += min(-(exp + 14), 15);
-  }
-  int mask = (1 << shift) - 1;
-  const uint grs = mantissa & mask;
-  const uint last = mantissa & (1 << shift);
-  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
-   * exp > 15 should round to inf. */
-  bool roundup = (grs > (1 << (shift - 1))) ||
-      (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
-  return roundup ? __clc_rti(x) : __clc_rtz(x);
+_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
+  /* Mantisa + implicit bit */
+  const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
+  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
+  int shift = 13;
+  if (exp < -14) {
+    /* The default assumes lower 13 bits are rounded,
+     * but it might be more for denormals.
+     * Shifting beyond last == 0b, and qr == 00b is not necessary */
+    shift += min(-(exp + 14), 15);
+  }
+  int mask = (1 << shift) - 1;
+  const uint grs = mantissa & mask;
+  const uint last = mantissa & (1 << shift);
+  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
+   * exp > 15 should round to inf. */
+  bool roundup = (grs > (1 << (shift - 1))) ||
+                 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
+  return roundup ? __clc_rti(x) : __clc_rtz(x);
 }
 
 #ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x)
-{
-  return x;
+_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x) { return x; }
+_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x) {
+  /* Remove lower 42 bits to make sure the number is rounded down */
+  ulong mask = 0xfffffc0000000000UL;
+  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
+  /* Denormals cannot be flushed, and they use different bit for rounding */
+  if (exp < -14)
+    mask <<= min(-(exp + 14), 10);
+  /* RTZ does not produce Inf for large numbers */
+  if (fabs(x) > 65504.0 && !isinf(x))
+    return copysign(65504.0, x);
+  /* Handle nan corner case */
+  if (isnan(x))
+    return x;
+  return as_double(as_ulong(x) & mask);
 }
-_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x)
-{
-  /* Remove lower 42 bits to make sure the number is rounded down */
-  ulong mask = 0xfffffc0000000000UL;
-  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask <<= min(-(exp + 14), 10);
-  /* RTZ does not produce Inf for large numbers */
-  if (fabs(x) > 65504.0 && !isinf(x))
-    return copysign(65504.0, x);
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  return as_double(as_ulong(x) & mask);
+_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x) {
+  const double inf = copysign((double)INFINITY, x);
+  /* Set lower 42 bits */
+  long mask = (1UL << 42UL) - 1UL;
+  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
+  /* Denormals cannot be flushed, and they use different bit for rounding */
+  if (exp < -14)
+    mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1;
+  /* Handle nan corner case */
+  if (isnan(x))
+    return x;
+  const double next = nextafter(as_double(as_ulong(x) | mask), inf);
+  return ((as_ulong(x) & mask) == 0) ? x : next;
 }
-_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x)
-{
-  const double inf = copysign((double)INFINITY, x);
-  /* Set lower 42 bits */
-  long mask = (1UL << 42UL) - 1UL;
-  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1;
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  const double next = nextafter(as_double(as_ulong(x) | mask), inf);
-  return ((as_ulong(x) & mask) == 0) ? x : next;
+_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x) {
+  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x)
+                                                     : __clc_rti(x);
}
-_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x)
-{
-  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x) : __clc_rti(x);
+_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x) {
+  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x)
+                                                     : __clc_rtz(x);
 }
-_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x)
-{
-  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x) : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x)
-{
-  /* Mantisa + implicit bit */
-  const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
-  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-  int shift = 42;
-  if (exp < -14) {
-    /* The default assumes lower 13 bits are rounded,
-     * but it might be more for denormals.
-     * Shifting beyond last == 0b, and qr == 00b is not necessary */
-    shift += min(-(exp + 14), 15);
-  }
-  ulong mask = (1UL << shift) - 1UL;
-  const ulong grs = mantissa & mask;
-  const ulong last = mantissa & (1UL << shift);
-  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
-   * exp > 15 should round to inf. */
-  bool roundup = (grs > (1UL << (shift - 1UL))) ||
-      (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
-  return roundup ? __clc_rti(x) : __clc_rtz(x);
+_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) {
+  /* Mantisa + implicit bit */
+  const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
+  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
+  int shift = 42;
+  if (exp < -14) {
+    /* The default assumes lower 13 bits are rounded,
+     * but it might be more for denormals.
+     * Shifting beyond last == 0b, and qr == 00b is not necessary */
+    shift += min(-(exp + 14), 15);
+  }
+  ulong mask = (1UL << shift) - 1UL;
+  const ulong grs = mantissa & mask;
+  const ulong last = mantissa & (1UL << shift);
+  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
+   * exp > 15 should round to inf. */
+  bool roundup = (grs > (1UL << (shift - 1UL))) ||
+                 (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
+  return roundup ? __clc_rti(x) : __clc_rtz(x);
 }
 #endif
 
-#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
-  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_noop) \
-  __FUNC(SUFFIX ## _rtz, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtz) \
-  __FUNC(SUFFIX ## _rtn, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtn) \
-  __FUNC(SUFFIX ## _rtp, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtp) \
-  __FUNC(SUFFIX ## _rte, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rte)
+#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)                    \
+  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_noop)               \
+  __FUNC(SUFFIX##_rtz, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtz)          \
+  __FUNC(SUFFIX##_rtn, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtn)          \
+  __FUNC(SUFFIX##_rtp, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtp)          \
+  __FUNC(SUFFIX##_rte, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rte)
 
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
-  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)
+#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)                       \
+  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)
 
 #define __CLC_BODY "vstore_half.inc"
 #include <clc/math/gentype.inc>
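Note, to make the bit manipulation above concrete: a float mantissa has 23 bits and a half mantissa 10, so for normal values the helpers treat the low 23 - 10 = 13 bits as rounding bits (hence the 0xffffe000 mask in __clc_rtz and (1 << 13) - 1 in __clc_rti). A worked example, assuming my arithmetic here rather than anything stated in the commit:

  /* x = 1.00048828125f = 1 + 2^-11, bit pattern 0x3f801000.        */
  /* __clc_rtz: 0x3f801000 & 0xffffe000 = 0x3f800000 -> 1.0f,       */
  /* the nearest half-precision value toward zero.                  */
  /* __clc_rti: 0x3f801000 | 0x00001fff = 0x3f801fff; nextafter     */
  /* toward +inf gives 0x3f802000 -> 1.0009765625f = 1 + 2^-10,     */
  /* the next representable half away from zero.                    */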
@@ -11,19 +11,22 @@
 #ifndef __CLC_SCALAR
 
 #if __CLC_VECSIZE == 3
-# define __CLC_OFFSET 4
+#define __CLC_OFFSET 4
 #else
-# define __CLC_OFFSET __CLC_VECSIZE
+#define __CLC_OFFSET __CLC_VECSIZE
 #endif
 
-FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __private);
-FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __local);
-FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __global);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE,
+     __CLC_SCALAR_GENTYPE, __private);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE,
+     __CLC_SCALAR_GENTYPE, __local);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE,
+     __CLC_SCALAR_GENTYPE, __global);
 
 #undef __CLC_OFFSET
 #else
-FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __private);
-FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __local);
-FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __global);
+FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __private);
+FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __local);
+FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __global);
 #endif
 #endif
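Note: because __XFUNC pastes the rounding suffix onto SUFFIX, each FUNC instantiation fans out into the five rounding variants; e.g. for the float4/__global case:

  /* FUNC(4, 4, 4, float4, float, __global) expands, via __XFUNC, to     */
  /* definitions of vstore_half4, vstore_half4_rtz, vstore_half4_rtn,    */
  /* vstore_half4_rtp and vstore_half4_rte, plus the matching            */
  /* vstorea_half4* overloads for the __global address space.            */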