[libclc] Move sin, cos & sincos to CLC library (#139527)

This commit moves the remaining FP64 sin and cos helper functions to the CLC library. As a consequence, it formally moves all sin, cos and sincos builtins to the CLC library. Previously, the FP16 and FP32 were nominally there but still in the OpenCL layer while waiting for the FP64 ones. The FP64 builtins are now vectorized as the FP16 and FP32 ones were earlier. One helper table had to be changed. It was previously a table of bytes loaded by each work-item as uint4. Since this doesn't vectorize well, the table was split to load two ulongNs per work-item. While this might not be as efficient on some devices, one mitigating factor is that we were previously loading 48 bytes per work-item in total, but only using 40 of them. With this commit we only load the bytes we need.
2025-05-12 11:32:15 +01:00
parent 2ec13c513f
commit 4f107cd8f8
30 changed files with 611 additions and 516 deletions
--- a/libclc/clc/include/clc/math/clc_cos.h
+++ b/libclc/clc/include/clc/math/clc_cos.h
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_COS_H__
+#define __CLC_MATH_CLC_COS_H__
+
+#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION __clc_cos
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_COS_H__
--- a/libclc/clc/include/clc/math/clc_sin.h
+++ b/libclc/clc/include/clc/math/clc_sin.h
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_SIN_H__
+#define __CLC_MATH_CLC_SIN_H__
+
+#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION __clc_sin
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_SIN_H__
--- a/libclc/clc/include/clc/math/clc_sincos.h
+++ b/libclc/clc/include/clc/math/clc_sincos.h
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_SINCOS_H__
+#define __CLC_MATH_CLC_SINCOS_H__
+
+#define __CLC_BODY <clc/math/unary_decl_with_ptr.inc>
+#define __CLC_FUNCTION __clc_sincos
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_SINCOS_H__
--- a/libclc/clc/include/clc/math/clc_sincos_helpers.h
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers.h
@@ -16,4 +16,11 @@

 #undef __FLOAT_ONLY

+#define __DOUBLE_ONLY
+#define __CLC_BODY <clc/math/clc_sincos_helpers_fp64.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __DOUBLE_ONLY
+
 #endif // __CLC_MATH_CLC_SINCOS_HELPERS_H__
--- a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc
@@ -0,0 +1,17 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+_CLC_DECL _CLC_OVERLOAD void
+__clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
+                             private __CLC_DOUBLEN *rr,
+                             private __CLC_INTN *regn);
+
+_CLC_DECL _CLC_OVERLOAD void
+__clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
+                            private __CLC_DOUBLEN *rr,
+                            private __CLC_INTN *regn);
--- a/libclc/clc/include/clc/math/tables.h
+++ b/libclc/clc/include/clc/math/tables.h
@@ -61,7 +61,6 @@

 TABLE_FUNCTION_DECL(float2, log2_tbl);
 TABLE_FUNCTION_DECL(float2, log10_tbl);
-TABLE_FUNCTION_DECL(uint4, pibits_tbl);

 CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl_ep_head);
 CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl_ep_tail);
@@ -75,6 +74,7 @@ CLC_TABLE_FUNCTION_DECL(float, cbrt_tbl_head);
 CLC_TABLE_FUNCTION_DECL(float, cbrt_tbl_tail);
 CLC_TABLE_FUNCTION_DECL(float, sinhcosh_tbl_head);
 CLC_TABLE_FUNCTION_DECL(float, sinhcosh_tbl_tail);
+CLC_TABLE_FUNCTION_DECL(ulong, pibits_tbl);

 #ifdef cl_khr_fp64

--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -32,6 +32,7 @@ math/clc_atanpi.cl
 math/clc_cbrt.cl
 math/clc_ceil.cl
 math/clc_copysign.cl
+math/clc_cos.cl
 math/clc_cosh.cl
 math/clc_cospi.cl
 math/clc_ep_log.cl
@@ -86,6 +87,8 @@ math/clc_rint.cl
 math/clc_rootn.cl
 math/clc_round.cl
 math/clc_rsqrt.cl
+math/clc_sin.cl
+math/clc_sincos.cl
 math/clc_sincos_helpers.cl
 math/clc_sinh.cl
 math/clc_sinpi.cl
--- a/libclc/clc/lib/generic/math/clc_cos.cl
+++ b/libclc/clc/lib/generic/math/clc_cos.cl
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/clc_convert.h>
+#include <clc/clcmacro.h>
+#include <clc/float/definitions.h>
+#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_sincos_helpers.h>
+#include <clc/math/clc_sincos_piby4.h>
+#include <clc/math/math.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/relational/clc_select.h>
+
+#define __CLC_BODY <clc_cos.inc>
+#include <clc/math/gentype.inc>
--- a/libclc/clc/lib/generic/math/clc_cos.inc
+++ b/libclc/clc/lib/generic/math/clc_cos.inc
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __CLC_FPSIZE == 32
+
+_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN __clc_cos(__CLC_FLOATN x) {
+  __CLC_FLOATN absx = __clc_fabs(x);
+
+  __CLC_FLOATN r0, r1;
+  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
+
+  __CLC_FLOATN ss = -__clc_sinf_piby4(r0, r1);
+  __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
+
+  __CLC_FLOATN c = (regn & 1) != 0 ? ss : cc;
+  c = __CLC_AS_FLOATN(__CLC_AS_INTN(c) ^ ((regn > 1) << 31));
+
+  c = __clc_select(c, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
+
+  return c;
+}
+
+#elif __CLC_FPSIZE == 16
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cos(__CLC_GENTYPE x) {
+  return __CLC_CONVERT_GENTYPE(__clc_cos(__CLC_CONVERT_FLOATN(x)));
+}
+
+#elif __CLC_FPSIZE == 64
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cos(__CLC_GENTYPE x) {
+  __CLC_GENTYPE absx = __clc_fabs(x);
+
+  __CLC_BIT_INTN is_medium = absx < 0x1.0p+47;
+
+  __CLC_INTN regn_m, regn_l;
+  __CLC_GENTYPE r_m, r_l, rr_m, rr_l;
+
+  __clc_remainder_piby2_medium(absx, &r_m, &rr_m, &regn_m);
+  __clc_remainder_piby2_large(absx, &r_l, &rr_l, &regn_l);
+
+  __CLC_GENTYPE r = is_medium ? r_m : r_l;
+  __CLC_GENTYPE rr = is_medium ? rr_m : rr_l;
+  __CLC_INTN regn = __CLC_CONVERT_INTN(is_medium) ? regn_m : regn_l;
+
+  __CLC_GENTYPE sinval, cosval;
+  __clc_sincos_piby4(r, rr, &sinval, &cosval);
+  sinval = -sinval;
+
+  __CLC_LONGN c =
+      __CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? sinval : cosval);
+  c ^= __CLC_CONVERT_BIT_INTN(regn > 1) << 63;
+
+  return __clc_isnan(absx) | __clc_isinf(absx) ? __CLC_GENTYPE_NAN
+                                               : __CLC_AS_GENTYPE(c);
+}
+
+#endif
--- a/libclc/clc/lib/generic/math/clc_sin.cl
+++ b/libclc/clc/lib/generic/math/clc_sin.cl
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/clc_convert.h>
+#include <clc/clcmacro.h>
+#include <clc/float/definitions.h>
+#include <clc/internal/clc.h>
+#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_sincos_helpers.h>
+#include <clc/math/clc_sincos_piby4.h>
+#include <clc/math/clc_trunc.h>
+#include <clc/math/math.h>
+#include <clc/math/tables.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/relational/clc_select.h>
+#include <clc/shared/clc_max.h>
+
+#define __CLC_BODY <clc_sin.inc>
+#include <clc/math/gentype.inc>
--- a/libclc/clc/lib/generic/math/clc_sin.inc
+++ b/libclc/clc/lib/generic/math/clc_sin.inc
@@ -0,0 +1,68 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __CLC_FPSIZE == 32
+
+_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN __clc_sin(__CLC_FLOATN x) {
+  __CLC_FLOATN absx = __clc_fabs(x);
+
+  __CLC_FLOATN r0, r1;
+  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
+
+  __CLC_FLOATN ss = __clc_sinf_piby4(r0, r1);
+  __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
+
+  __CLC_FLOATN s = (regn & 1) != 0 ? cc : ss;
+  s = __CLC_AS_FLOATN(__CLC_AS_INTN(s) ^ ((regn > 1) << 31) ^
+                      (__CLC_AS_INTN(x) ^ __CLC_AS_INTN(absx)));
+
+  s = __clc_select(s, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
+
+  // Subnormals
+  s = x == 0.0f ? x : s;
+
+  return s;
+}
+
+#elif __CLC_FPSIZE == 16
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sin(__CLC_GENTYPE x) {
+  return __CLC_CONVERT_GENTYPE(__clc_sin(__CLC_CONVERT_FLOATN(x)));
+}
+
+#elif __CLC_FPSIZE == 64
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sin(__CLC_GENTYPE x) {
+  __CLC_GENTYPE absx = __clc_fabs(x);
+
+  __CLC_BIT_INTN is_medium = absx < 0x1.0p+47;
+
+  __CLC_INTN regn_m, regn_l;
+  __CLC_GENTYPE r_m, r_l, rr_m, rr_l;
+
+  __clc_remainder_piby2_medium(absx, &r_m, &rr_m, &regn_m);
+  __clc_remainder_piby2_large(absx, &r_l, &rr_l, &regn_l);
+
+  __CLC_GENTYPE r = is_medium ? r_m : r_l;
+  __CLC_GENTYPE rr = is_medium ? rr_m : rr_l;
+  __CLC_INTN regn = __CLC_CONVERT_INTN(is_medium) ? regn_m : regn_l;
+
+  __CLC_GENTYPE sinval, cosval;
+  __clc_sincos_piby4(r, rr, &sinval, &cosval);
+
+  __CLC_LONGN s =
+      __CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? cosval : sinval);
+
+  s ^= (__CLC_CONVERT_BIT_INTN(regn > 1) << 63) ^
+       (__CLC_CONVERT_BIT_INTN(x < 0.0) << 63);
+
+  return __clc_isinf(x) | __clc_isnan(x) ? __CLC_GENTYPE_NAN
+                                         : __CLC_AS_GENTYPE(s);
+}
+
+#endif
--- a/libclc/clc/lib/generic/math/clc_sincos.cl
+++ b/libclc/clc/lib/generic/math/clc_sincos.cl
@@ -0,0 +1,14 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/internal/clc.h>
+#include <clc/math/clc_cos.h>
+#include <clc/math/clc_sin.h>
+
+#define __CLC_BODY <clc_sincos.inc>
+#include <clc/math/gentype.inc>
--- a/libclc/clc/lib/generic/math/clc_sincos.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos.inc
@@ -6,10 +6,10 @@
 //
 //===----------------------------------------------------------------------===//

-#define __CLC_DECLARE_SINCOS(ADDRSPACE, TYPE) \
-  _CLC_OVERLOAD _CLC_DEF TYPE sincos (TYPE x, ADDRSPACE TYPE * cosval) { \
-    *cosval = cos(x); \
-    return sin(x); \
+#define __CLC_DECLARE_SINCOS(ADDRSPACE, TYPE)                                  \
+  _CLC_OVERLOAD _CLC_DEF TYPE __clc_sincos(TYPE x, ADDRSPACE TYPE *cosval) {   \
+    *cosval = __clc_cos(x);                                                    \
+    return __clc_sin(x);                                                       \
  }

 __CLC_DECLARE_SINCOS(global, __CLC_GENTYPE)
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
@@ -31,3 +31,27 @@
 #define __CLC_BODY <clc_sincos_helpers.inc>

 #include <clc/math/gentype.inc>
+
+#undef __FLOAT_ONLY
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+#include <clc/math/clc_fract.h>
+#include <clc/math/tables.h>
+#include <clc/shared/clc_max.h>
+
+#define bytealign(src0, src1, src2)                                            \
+  (__CLC_CONVERT_UINTN(                                                        \
+      ((__CLC_CONVERT_LONGN((src0)) << 32) | __CLC_CONVERT_LONGN((src1))) >>   \
+      (((src2) & 3) * 8)))
+
+#define __DOUBLE_ONLY
+#define __CLC_BODY <clc_sincos_helpers_fp64.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __DOUBLE_ONLY
+
+#endif
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
@@ -0,0 +1,235 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Reduction for medium sized arguments
+_CLC_DEF _CLC_OVERLOAD void
+__clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
+                             private __CLC_DOUBLEN *rr,
+                             private __CLC_INTN *regn) {
+  // How many pi/2 is x a multiple of?
+  const __CLC_DOUBLEN two_by_pi = 0x1.45f306dc9c883p-1;
+  __CLC_DOUBLEN dnpi2 = __clc_trunc(__clc_fma(x, two_by_pi, 0.5));
+
+  const __CLC_DOUBLEN piby2_h = -7074237752028440.0 / 0x1.0p+52;
+  const __CLC_DOUBLEN piby2_m = -2483878800010755.0 / 0x1.0p+105;
+  const __CLC_DOUBLEN piby2_t = -3956492004828932.0 / 0x1.0p+158;
+
+  // Compute product of npi2 with 159 bits of 2/pi
+  __CLC_DOUBLEN p_hh = piby2_h * dnpi2;
+  __CLC_DOUBLEN p_ht = __clc_fma(piby2_h, dnpi2, -p_hh);
+  __CLC_DOUBLEN p_mh = piby2_m * dnpi2;
+  __CLC_DOUBLEN p_mt = __clc_fma(piby2_m, dnpi2, -p_mh);
+  __CLC_DOUBLEN p_th = piby2_t * dnpi2;
+  __CLC_DOUBLEN p_tt = __clc_fma(piby2_t, dnpi2, -p_th);
+
+  // Reduce to 159 bits
+  __CLC_DOUBLEN ph = p_hh;
+  __CLC_DOUBLEN pm = p_ht + p_mh;
+  __CLC_DOUBLEN t = p_mh - (pm - p_ht);
+  __CLC_DOUBLEN pt = p_th + t + p_mt + p_tt;
+  t = ph + pm;
+  pm = pm - (t - ph);
+  ph = t;
+  t = pm + pt;
+  pt = pt - (t - pm);
+  pm = t;
+
+  // Subtract from x
+  t = x + ph;
+  __CLC_DOUBLEN qh = t + pm;
+  __CLC_DOUBLEN qt = pm - (qh - t) + pt;
+
+  *r = qh;
+  *rr = qt;
+  *regn = __CLC_CONVERT_INTN(__CLC_CONVERT_LONGN(dnpi2)) & 0x3;
+}
+
+// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
+// extra precision, and return the result in r, rr.
+// Return value "regn" tells how many lots of pi/2 were subtracted
+// from x to put it in the range [-pi/4,pi/4], mod 4.
+_CLC_DEF _CLC_OVERLOAD void
+__clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
+                            private __CLC_DOUBLEN *rr,
+                            private __CLC_INTN *regn) {
+
+  __CLC_LONGN ux = __CLC_AS_LONGN(x);
+  __CLC_INTN e = __CLC_CONVERT_INTN(ux >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
+  __CLC_INTN i = __clc_max(23, (e >> 3) + 17);
+  __CLC_INTN j = 150 - i;
+  __CLC_INTN j16 = j & ~0xf;
+  __CLC_DOUBLEN fract_temp;
+
+  // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary
+  // byte boundary
+  __CLC_ULONGN q0 = USE_TABLE(pibits_tbl, j16);
+  __CLC_ULONGN q1 = USE_TABLE(pibits_tbl, (j16 + 8));
+  __CLC_ULONGN q2 = USE_TABLE(pibits_tbl, (j16 + 16));
+  __CLC_ULONGN q3 = USE_TABLE(pibits_tbl, (j16 + 24));
+  __CLC_ULONGN q4 = USE_TABLE(pibits_tbl, (j16 + 32));
+
+  __CLC_UINTN q0s0 = __CLC_CONVERT_UINTN(q0);
+  __CLC_UINTN q0s1 = __CLC_CONVERT_UINTN(q0 >> 32);
+
+  __CLC_UINTN q1s0 = __CLC_CONVERT_UINTN(q1);
+  __CLC_UINTN q1s1 = __CLC_CONVERT_UINTN(q1 >> 32);
+
+  __CLC_UINTN q2s0 = __CLC_CONVERT_UINTN(q2);
+  __CLC_UINTN q2s1 = __CLC_CONVERT_UINTN(q2 >> 32);
+
+  __CLC_UINTN q3s0 = __CLC_CONVERT_UINTN(q3);
+  __CLC_UINTN q3s1 = __CLC_CONVERT_UINTN(q3 >> 32);
+
+  __CLC_UINTN q4s0 = __CLC_CONVERT_UINTN(q4);
+  __CLC_UINTN q4s1 = __CLC_CONVERT_UINTN(q4 >> 32);
+
+  __CLC_INTN k = (j >> 2) & 0x3;
+  __CLC_INTN c0 = k == 0;
+  __CLC_INTN c1 = k == 1;
+  __CLC_INTN c2 = k == 2;
+  __CLC_INTN c3 = k == 3;
+
+  __CLC_UINTN u0, u1, u2, u3, u4, u5, u6;
+
+  u0 = c1 ? q0s1 : q0s0;
+  u0 = c2 ? q1s0 : u0;
+  u0 = c3 ? q1s1 : u0;
+
+  u1 = c1 ? q1s0 : q0s1;
+  u1 = c2 ? q1s1 : u1;
+  u1 = c3 ? q2s0 : u1;
+
+  u2 = c1 ? q1s1 : q1s0;
+  u2 = c2 ? q2s0 : u2;
+  u2 = c3 ? q2s1 : u2;
+
+  u3 = c1 ? q2s0 : q1s1;
+  u3 = c2 ? q2s1 : u3;
+  u3 = c3 ? q3s0 : u3;
+
+  u4 = c1 ? q2s1 : q2s0;
+  u4 = c2 ? q3s0 : u4;
+  u4 = c3 ? q3s1 : u4;
+
+  u5 = c1 ? q3s0 : q2s1;
+  u5 = c2 ? q3s1 : u5;
+  u5 = c3 ? q4s0 : u5;
+
+  u6 = c1 ? q3s1 : q3s0;
+  u6 = c2 ? q4s0 : u6;
+  u6 = c3 ? q4s1 : u6;
+
+  __CLC_UINTN v0 = bytealign(u1, u0, j);
+  __CLC_UINTN v1 = bytealign(u2, u1, j);
+  __CLC_UINTN v2 = bytealign(u3, u2, j);
+  __CLC_UINTN v3 = bytealign(u4, u3, j);
+  __CLC_UINTN v4 = bytealign(u5, u4, j);
+  __CLC_UINTN v5 = bytealign(u6, u5, j);
+
+  // Place those 192 bits in 4 48-bit doubles along with correct exponent
+  // If i > 1018 we would get subnormals so we scale p up and x down to get the
+  // same product
+  i = 2 + 8 * i;
+  x *= __CLC_CONVERT_BIT_INTN(i > 1018) ? 0x1.0p-136 : 1.0;
+  i -= i > 1018 ? 136 : 0;
+
+#define doublen_lohi(x, y)                                                     \
+  __CLC_AS_DOUBLEN(__CLC_CONVERT_ULONGN((x)) & 0xFFFFFFFF |                    \
+                   __CLC_CONVERT_ULONGN((y)) << 32)
+
+  __CLC_UINTN ua = __CLC_CONVERT_UINTN(EXPBIAS_DP64 + EXPSHIFTBITS_DP64 - i)
+                   << 20;
+  __CLC_DOUBLEN a = doublen_lohi((__CLC_ULONGN)0, ua);
+  __CLC_DOUBLEN p0 = doublen_lohi(v0, ua | (v1 & 0xffffU)) - a;
+  ua += 0x03000000U;
+  a = doublen_lohi((__CLC_ULONGN)0, ua);
+  __CLC_DOUBLEN p1 =
+      doublen_lohi(((v2 << 16) | (v1 >> 16)), (ua | (v2 >> 16))) - a;
+  ua += 0x03000000U;
+  a = doublen_lohi((__CLC_ULONGN)0, ua);
+  __CLC_DOUBLEN p2 = doublen_lohi(v3, (ua | (v4 & 0xffffU))) - a;
+  ua += 0x03000000U;
+  a = doublen_lohi((__CLC_ULONGN)0, ua);
+  __CLC_DOUBLEN p3 =
+      doublen_lohi(((v5 << 16) | (v4 >> 16)), (ua | (v5 >> 16))) - a;
+
+#undef doublen_lohi
+
+  // Exact multiply
+  __CLC_DOUBLEN f0h = p0 * x;
+  __CLC_DOUBLEN f0l = __clc_fma(p0, x, -f0h);
+  __CLC_DOUBLEN f1h = p1 * x;
+  __CLC_DOUBLEN f1l = __clc_fma(p1, x, -f1h);
+  __CLC_DOUBLEN f2h = p2 * x;
+  __CLC_DOUBLEN f2l = __clc_fma(p2, x, -f2h);
+  __CLC_DOUBLEN f3h = p3 * x;
+  __CLC_DOUBLEN f3l = __clc_fma(p3, x, -f3h);
+
+  // Accumulate product into 4 doubles
+  __CLC_DOUBLEN s, t;
+
+  __CLC_DOUBLEN f3 = f3h + f2h;
+  t = f2h - (f3 - f3h);
+  s = f3l + t;
+  t = t - (s - f3l);
+
+  __CLC_DOUBLEN f2 = s + f1h;
+  t = f1h - (f2 - s) + t;
+  s = f2l + t;
+  t = t - (s - f2l);
+
+  __CLC_DOUBLEN f1 = s + f0h;
+  t = f0h - (f1 - s) + t;
+  s = f1l + t;
+
+  __CLC_DOUBLEN f0 = s + f0l;
+
+  // Strip off unwanted large integer bits
+  f3 = 0x1.0p+10 * __clc_fract(f3 * 0x1.0p-10, &fract_temp);
+  f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
+
+  // Compute least significant integer bits
+  t = f3 + f2;
+  __CLC_DOUBLEN di = t - __clc_fract(t, &fract_temp);
+  i = __CLC_CONVERT_INTN(__CLC_CONVERT_FLOATN(di));
+
+  // Shift out remaining integer part
+  f3 -= di;
+  s = f3 + f2;
+  t = f2 - (s - f3);
+  f3 = s;
+  f2 = t;
+  s = f2 + f1;
+  t = f1 - (s - f2);
+  f2 = s;
+  f1 = t;
+  f1 += f0;
+
+  // Subtract 1 if fraction is >= 0.5, and update regn
+  __CLC_INTN g = __CLC_CONVERT_INTN(f3 >= 0.5 ? 1L : 0L);
+  i += g;
+  f3 -= __CLC_CONVERT_DOUBLEN(__CLC_CONVERT_FLOATN(g));
+
+  // Shift up bits
+  s = f3 + f2;
+  t = f2 - (s - f3);
+  f3 = s;
+  f2 = t + f1;
+
+  // Multiply precise fraction by pi/2 to get radians
+  const __CLC_DOUBLEN p2h = 7074237752028440.0 / 0x1.0p+52;
+  const __CLC_DOUBLEN p2t = 4967757600021510.0 / 0x1.0p+106;
+
+  __CLC_DOUBLEN rhi = f3 * p2h;
+  __CLC_DOUBLEN rlo =
+      __clc_fma(f2, p2h, __clc_fma(f3, p2t, __clc_fma(f3, p2h, -rhi)));
+
+  *r = rhi + rlo;
+  *rr = rlo - (*r - rhi);
+  *regn = i & 0x3;
+}
--- a/libclc/clc/lib/generic/math/clc_tables.cl
+++ b/libclc/clc/lib/generic/math/clc_tables.cl
@@ -1648,4 +1648,64 @@ CLC_TABLE_FUNCTION(double, SINH_TBL_TAIL, sinh_tbl_tail);
 CLC_TABLE_FUNCTION(double, COSH_TBL_HEAD, cosh_tbl_head);
 CLC_TABLE_FUNCTION(double, COSH_TBL_TAIL, cosh_tbl_tail);

+DECLARE_TABLE(uchar, PIBITS_TBL, ) = {
+    224, 241, 27,  193, 12,  88,  33,  116, 53,  126, 196, 126, 237, 175, 169,
+    75,  74,  41,  222, 231, 28,  244, 236, 197, 151, 175, 31,  235, 158, 212,
+    181, 168, 127, 121, 154, 253, 24,  61,  221, 38,  44,  159, 60,  251, 217,
+    180, 125, 180, 41,  104, 45,  70,  188, 188, 63,  96,  22,  120, 255, 95,
+    226, 127, 236, 160, 228, 247, 46,  126, 17,  114, 210, 231, 76,  13,  230,
+    88,  71,  230, 4,   249, 125, 209, 154, 192, 113, 166, 19,  18,  237, 186,
+    212, 215, 8,   162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72,  70,
+    39,  168, 187, 36,  25,  128, 75,  55,  9,   233, 184, 145, 220, 134, 21,
+    239, 122, 175, 142, 69,  249, 7,   65,  14,  241, 100, 86,  138, 109, 3,
+    119, 211, 212, 71,  95,  157, 240, 167, 84,  16,  57,  185, 13,  230, 139,
+    2,   0,   0,   0,   0,   0,   0,   0};
+
+_CLC_DEF _CLC_OVERLOAD ulong TABLE_MANGLE(pibits_tbl)(int idx) {
+  return *(__constant ulong *)(PIBITS_TBL + idx);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 TABLE_MANGLE(pibits_tbl)(int2 idx) {
+  return (ulong2){*(__constant ulong *)(PIBITS_TBL + idx.s0),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s1)};
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 TABLE_MANGLE(pibits_tbl)(int3 idx) {
+  return (ulong3){*(__constant ulong *)(PIBITS_TBL + idx.s0),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s1),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s2)};
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 TABLE_MANGLE(pibits_tbl)(int4 idx) {
+  return (ulong4){*(__constant ulong *)(PIBITS_TBL + idx.s0),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s1),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s2),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s3)};
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 TABLE_MANGLE(pibits_tbl)(int8 idx) {
+  return (ulong8){*(__constant ulong *)(PIBITS_TBL + idx.s0),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s1),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s2),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s3),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s4),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s5),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s6),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s7)};
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 TABLE_MANGLE(pibits_tbl)(int16 idx) {
+  return (ulong16){*(__constant ulong *)(PIBITS_TBL + idx.s0),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s1),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s2),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s3),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s4),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s5),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s6),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s7),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s8),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s9),
+                   *(__constant ulong *)(PIBITS_TBL + idx.sA),
+                   *(__constant ulong *)(PIBITS_TBL + idx.sB),
+                   *(__constant ulong *)(PIBITS_TBL + idx.sC),
+                   *(__constant ulong *)(PIBITS_TBL + idx.sD),
+                   *(__constant ulong *)(PIBITS_TBL + idx.sE),
+                   *(__constant ulong *)(PIBITS_TBL + idx.sF)};
+}
+
 #endif // cl_khr_fp64
--- a/libclc/clspv/lib/SOURCES
+++ b/libclc/clspv/lib/SOURCES
@@ -66,10 +66,8 @@ subnormal_config.cl
 ../../generic/lib/math/rootn.cl
 ../../generic/lib/math/sin.cl
 ../../generic/lib/math/sincos.cl
-../../generic/lib/math/sincos_helpers.cl
 ../../generic/lib/math/sinh.cl
 ../../generic/lib/math/sinpi.cl
-../../generic/lib/math/tables.cl
 ../../generic/lib/math/tan.cl
 ../../generic/lib/math/tanh.cl
 ../../generic/lib/math/tanpi.cl
--- a/libclc/generic/include/clc/math/sincos.h
+++ b/libclc/generic/include/clc/math/sincos.h
@@ -6,5 +6,7 @@
 //
 //===----------------------------------------------------------------------===//

-#define __CLC_BODY <clc/math/sincos.inc>
+#define __CLC_BODY <clc/math/unary_decl_with_ptr.inc>
+#define __CLC_FUNCTION __clc_sincos
 #include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
--- a/libclc/generic/include/clc/math/sincos.inc
+++ b/libclc/generic/include/clc/math/sincos.inc
@@ -1,14 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE sincos(__CLC_GENTYPE x,
-                                             global __CLC_GENTYPE *cosval);
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE sincos(__CLC_GENTYPE x,
-                                             local __CLC_GENTYPE *cosval);
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE sincos(__CLC_GENTYPE x,
-                                             private __CLC_GENTYPE *cosval);
--- a/libclc/generic/lib/SOURCES
+++ b/libclc/generic/lib/SOURCES
@@ -131,7 +131,6 @@ math/native_rsqrt.cl
 math/native_sin.cl
 math/native_sqrt.cl
 math/native_tan.cl
-math/tables.cl
 math/nextafter.cl
 math/pow.cl
 math/pown.cl
@@ -144,7 +143,6 @@ math/round.cl
 math/rsqrt.cl
 math/sin.cl
 math/sincos.cl
-math/sincos_helpers.cl
 math/sinh.cl
 math/sinpi.cl
 math/sqrt.cl
--- a/libclc/generic/lib/math/clc_tan.cl
+++ b/libclc/generic/lib/math/clc_tan.cl
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//

-#include "sincos_helpers.h"
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
--- a/libclc/generic/lib/math/cos.cl
+++ b/libclc/generic/lib/math/cos.cl
@@ -6,45 +6,9 @@
 //
 //===----------------------------------------------------------------------===//

-#include "sincos_helpers.h"
 #include <clc/clc.h>
-#include <clc/clc_convert.h>
-#include <clc/clcmacro.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_sincos_helpers.h>
-#include <clc/math/math.h>
-#include <clc/relational/clc_isinf.h>
-#include <clc/relational/clc_isnan.h>
-#include <clc/relational/clc_select.h>
+#include <clc/math/clc_cos.h>

-// FP32 and FP16 versions.
-#define __CLC_BODY <cos.inc>
+#define FUNCTION cos
+#define __CLC_BODY <clc/shared/unary_def.inc>
 #include <clc/math/gentype.inc>
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_OVERLOAD _CLC_DEF double cos(double x) {
-    x = fabs(x);
-
-    double r, rr;
-    int regn;
-
-    if (x < 0x1.0p+47)
-        __clc_remainder_piby2_medium(x, &r, &rr, &regn);
-    else
-        __clc_remainder_piby2_large(x, &r, &rr, &regn);
-
-    double2 sc = __clc_sincos_piby4(r, rr);
-    sc.lo = -sc.lo;
-
-    int2 c = as_int2(regn & 1 ? sc.lo : sc.hi);
-    c.hi ^= (regn > 1) << 31;
-
-    return isnan(x) | isinf(x) ? as_double(QNANBITPATT_DP64) : as_double(c);
-}
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double);
-
-#endif
--- a/libclc/generic/lib/math/cos.inc
+++ b/libclc/generic/lib/math/cos.inc
@@ -1,34 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN cos(__CLC_FLOATN x) {
-  __CLC_FLOATN absx = __clc_fabs(x);
-
-  __CLC_FLOATN r0, r1;
-  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
-
-  __CLC_FLOATN ss = -__clc_sinf_piby4(r0, r1);
-  __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
-
-  __CLC_FLOATN c = (regn & 1) != 0 ? ss : cc;
-  c = __CLC_AS_FLOATN(__CLC_AS_INTN(c) ^ ((regn > 1) << 31));
-
-  c = __clc_select(c, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
-
-  return c;
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE cos(__CLC_GENTYPE x) {
-  return __CLC_CONVERT_GENTYPE(cos(__CLC_CONVERT_FLOATN(x)));
-}
-
-#endif
--- a/libclc/generic/lib/math/sin.cl
+++ b/libclc/generic/lib/math/sin.cl
@@ -6,44 +6,9 @@
 //
 //===----------------------------------------------------------------------===//

-#include "sincos_helpers.h"
 #include <clc/clc.h>
-#include <clc/clc_convert.h>
-#include <clc/clcmacro.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_sincos_helpers.h>
-#include <clc/math/math.h>
-#include <clc/relational/clc_isinf.h>
-#include <clc/relational/clc_isnan.h>
-#include <clc/relational/clc_select.h>
+#include <clc/math/clc_sin.h>

-// FP32 and FP16 versions.
-#define __CLC_BODY <sin.inc>
+#define FUNCTION sin
+#define __CLC_BODY <clc/shared/unary_def.inc>
 #include <clc/math/gentype.inc>
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_OVERLOAD _CLC_DEF double sin(double x) {
-    double y = fabs(x);
-
-    double r, rr;
-    int regn;
-
-    if (y < 0x1.0p+47)
-        __clc_remainder_piby2_medium(y, &r, &rr, &regn);
-    else
-        __clc_remainder_piby2_large(y, &r, &rr, &regn);
-
-    double2 sc = __clc_sincos_piby4(r, rr);
-
-    int2 s = as_int2(regn & 1 ? sc.hi : sc.lo);
-    s.hi ^= ((regn > 1) << 31) ^ ((x < 0.0) << 31);
-
-    return  isinf(x) | isnan(x) ? as_double(QNANBITPATT_DP64) : as_double(s);
-}
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sin, double);
-
-#endif
--- a/libclc/generic/lib/math/sin.inc
+++ b/libclc/generic/lib/math/sin.inc
@@ -1,38 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN sin(__CLC_FLOATN x) {
-  __CLC_FLOATN absx = __clc_fabs(x);
-
-  __CLC_FLOATN r0, r1;
-  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
-
-  __CLC_FLOATN ss = __clc_sinf_piby4(r0, r1);
-  __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
-
-  __CLC_FLOATN s = (regn & 1) != 0 ? cc : ss;
-  s = __CLC_AS_FLOATN(__CLC_AS_INTN(s) ^ ((regn > 1) << 31) ^
-                      (__CLC_AS_INTN(x) ^ __CLC_AS_INTN(absx)));
-
-  s = __clc_select(s, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
-
-  // Subnormals
-  s = x == 0.0f ? x : s;
-
-  return s;
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE sin(__CLC_GENTYPE x) {
-  return __CLC_CONVERT_GENTYPE(sin(__CLC_CONVERT_FLOATN(x)));
-}
-
-#endif
--- a/libclc/generic/lib/math/sincos.cl
+++ b/libclc/generic/lib/math/sincos.cl
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//

 #include <clc/clc.h>
+#include <clc/math/clc_sincos.h>

-#define __CLC_BODY <sincos.inc>
+#define FUNCTION sincos
+#define __CLC_BODY <clc/math/unary_def_with_ptr.inc>
 #include <clc/math/gentype.inc>
--- a/libclc/generic/lib/math/sincos_helpers.cl
+++ b/libclc/generic/lib/math/sincos_helpers.cl
@@ -1,285 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "sincos_helpers.h"
-#include <clc/clc.h>
-#include <clc/integer/clc_clz.h>
-#include <clc/integer/clc_mul_hi.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_trunc.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
-#include <clc/shared/clc_max.h>
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-#define bytealign(src0, src1, src2)                                            \
-  ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8)))
-
-// Reduction for medium sized arguments
-_CLC_DEF void __clc_remainder_piby2_medium(double x, private double *r,
-                                           private double *rr,
-                                           private int *regn) {
-  // How many pi/2 is x a multiple of?
-  const double two_by_pi = 0x1.45f306dc9c883p-1;
-  double dnpi2 = __clc_trunc(__clc_fma(x, two_by_pi, 0.5));
-
-  const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
-  const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
-  const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
-
-  // Compute product of npi2 with 159 bits of 2/pi
-  double p_hh = piby2_h * dnpi2;
-  double p_ht = __clc_fma(piby2_h, dnpi2, -p_hh);
-  double p_mh = piby2_m * dnpi2;
-  double p_mt = __clc_fma(piby2_m, dnpi2, -p_mh);
-  double p_th = piby2_t * dnpi2;
-  double p_tt = __clc_fma(piby2_t, dnpi2, -p_th);
-
-  // Reduce to 159 bits
-  double ph = p_hh;
-  double pm = p_ht + p_mh;
-  double t = p_mh - (pm - p_ht);
-  double pt = p_th + t + p_mt + p_tt;
-  t = ph + pm;
-  pm = pm - (t - ph);
-  ph = t;
-  t = pm + pt;
-  pt = pt - (t - pm);
-  pm = t;
-
-  // Subtract from x
-  t = x + ph;
-  double qh = t + pm;
-  double qt = pm - (qh - t) + pt;
-
-  *r = qh;
-  *rr = qt;
-  *regn = (int)(long)dnpi2 & 0x3;
-}
-
-// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
-// extra precision, and return the result in r, rr.
-// Return value "regn" tells how many lots of pi/2 were subtracted
-// from x to put it in the range [-pi/4,pi/4], mod 4.
-
-_CLC_DEF void __clc_remainder_piby2_large(double x, private double *r,
-                                          private double *rr,
-                                          private int *regn) {
-
-  long ux = as_long(x);
-  int e = (int)(ux >> 52) - 1023;
-  int i = __clc_max(23, (e >> 3) + 17);
-  int j = 150 - i;
-  int j16 = j & ~0xf;
-  double fract_temp;
-
-  // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary
-  // byte boundary
-  uint4 q0 = USE_TABLE(pibits_tbl, j16);
-  uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
-  uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
-
-  int k = (j >> 2) & 0x3;
-  int4 c = (int4)k == (int4)(0, 1, 2, 3);
-
-  uint u0, u1, u2, u3, u4, u5, u6;
-
-  u0 = c.s1 ? q0.s1 : q0.s0;
-  u0 = c.s2 ? q0.s2 : u0;
-  u0 = c.s3 ? q0.s3 : u0;
-
-  u1 = c.s1 ? q0.s2 : q0.s1;
-  u1 = c.s2 ? q0.s3 : u1;
-  u1 = c.s3 ? q1.s0 : u1;
-
-  u2 = c.s1 ? q0.s3 : q0.s2;
-  u2 = c.s2 ? q1.s0 : u2;
-  u2 = c.s3 ? q1.s1 : u2;
-
-  u3 = c.s1 ? q1.s0 : q0.s3;
-  u3 = c.s2 ? q1.s1 : u3;
-  u3 = c.s3 ? q1.s2 : u3;
-
-  u4 = c.s1 ? q1.s1 : q1.s0;
-  u4 = c.s2 ? q1.s2 : u4;
-  u4 = c.s3 ? q1.s3 : u4;
-
-  u5 = c.s1 ? q1.s2 : q1.s1;
-  u5 = c.s2 ? q1.s3 : u5;
-  u5 = c.s3 ? q2.s0 : u5;
-
-  u6 = c.s1 ? q1.s3 : q1.s2;
-  u6 = c.s2 ? q2.s0 : u6;
-  u6 = c.s3 ? q2.s1 : u6;
-
-  uint v0 = bytealign(u1, u0, j);
-  uint v1 = bytealign(u2, u1, j);
-  uint v2 = bytealign(u3, u2, j);
-  uint v3 = bytealign(u4, u3, j);
-  uint v4 = bytealign(u5, u4, j);
-  uint v5 = bytealign(u6, u5, j);
-
-  // Place those 192 bits in 4 48-bit doubles along with correct exponent
-  // If i > 1018 we would get subnormals so we scale p up and x down to get the
-  // same product
-  i = 2 + 8 * i;
-  x *= i > 1018 ? 0x1.0p-136 : 1.0;
-  i -= i > 1018 ? 136 : 0;
-
-  uint ua = (uint)(1023 + 52 - i) << 20;
-  double a = as_double((uint2)(0, ua));
-  double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
-  ua += 0x03000000U;
-  a = as_double((uint2)(0, ua));
-  double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
-  ua += 0x03000000U;
-  a = as_double((uint2)(0, ua));
-  double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
-  ua += 0x03000000U;
-  a = as_double((uint2)(0, ua));
-  double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
-
-  // Exact multiply
-  double f0h = p0 * x;
-  double f0l = __clc_fma(p0, x, -f0h);
-  double f1h = p1 * x;
-  double f1l = __clc_fma(p1, x, -f1h);
-  double f2h = p2 * x;
-  double f2l = __clc_fma(p2, x, -f2h);
-  double f3h = p3 * x;
-  double f3l = __clc_fma(p3, x, -f3h);
-
-  // Accumulate product into 4 doubles
-  double s, t;
-
-  double f3 = f3h + f2h;
-  t = f2h - (f3 - f3h);
-  s = f3l + t;
-  t = t - (s - f3l);
-
-  double f2 = s + f1h;
-  t = f1h - (f2 - s) + t;
-  s = f2l + t;
-  t = t - (s - f2l);
-
-  double f1 = s + f0h;
-  t = f0h - (f1 - s) + t;
-  s = f1l + t;
-
-  double f0 = s + f0l;
-
-  // Strip off unwanted large integer bits
-  f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
-  f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
-
-  // Compute least significant integer bits
-  t = f3 + f2;
-  double di = t - fract(t, &fract_temp);
-  i = (float)di;
-
-  // Shift out remaining integer part
-  f3 -= di;
-  s = f3 + f2;
-  t = f2 - (s - f3);
-  f3 = s;
-  f2 = t;
-  s = f2 + f1;
-  t = f1 - (s - f2);
-  f2 = s;
-  f1 = t;
-  f1 += f0;
-
-  // Subtract 1 if fraction is >= 0.5, and update regn
-  int g = f3 >= 0.5;
-  i += g;
-  f3 -= (float)g;
-
-  // Shift up bits
-  s = f3 + f2;
-  t = f2 - (s - f3);
-  f3 = s;
-  f2 = t + f1;
-
-  // Multiply precise fraction by pi/2 to get radians
-  const double p2h = 7074237752028440.0 / 0x1.0p+52;
-  const double p2t = 4967757600021510.0 / 0x1.0p+106;
-
-  double rhi = f3 * p2h;
-  double rlo = __clc_fma(f2, p2h, __clc_fma(f3, p2t, __clc_fma(f3, p2h, -rhi)));
-
-  *r = rhi + rlo;
-  *rr = rlo - (*r - rhi);
-  *regn = i & 0x3;
-}
-
-_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
-  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
-  //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
-  //                      = x * f(w)
-  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
-  // We use a minimax approximation of (f(w) - 1) / w
-  // because this produces an expansion in even powers of x.
-  // If xx (the tail of x) is non-zero, we add a correction
-  // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
-  // is an approximation to cos(x)*sin(xx) valid because
-  // xx is tiny relative to x.
-
-  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
-  //                      = f(w)
-  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
-  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
-  // because this produces an expansion in even powers of x.
-  // If xx (the tail of x) is non-zero, we subtract a correction
-  // term g(x,xx) = x*xx to the result, where g(x,xx)
-  // is an approximation to sin(x)*sin(xx) valid because
-  // xx is tiny relative to x.
-
-  const double sc1 = -0.166666666666666646259241729;
-  const double sc2 = 0.833333333333095043065222816e-2;
-  const double sc3 = -0.19841269836761125688538679e-3;
-  const double sc4 = 0.275573161037288022676895908448e-5;
-  const double sc5 = -0.25051132068021699772257377197e-7;
-  const double sc6 = 0.159181443044859136852668200e-9;
-
-  const double cc1 = 0.41666666666666665390037e-1;
-  const double cc2 = -0.13888888888887398280412e-2;
-  const double cc3 = 0.248015872987670414957399e-4;
-  const double cc4 = -0.275573172723441909470836e-6;
-  const double cc5 = 0.208761463822329611076335e-8;
-  const double cc6 = -0.113826398067944859590880e-10;
-
-  double x2 = x * x;
-  double x3 = x2 * x;
-  double r = 0.5 * x2;
-  double t = 1.0 - r;
-
-  double sp = __clc_fma(
-      __clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
-
-  double cp =
-      t +
-      __clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(cc6, x2, cc5),
-                                                        x2, cc4),
-                                              x2, cc3),
-                                    x2, cc2),
-                          x2, cc1),
-                x2 * x2, __clc_fma(x, xx, (1.0 - t) - r));
-
-  double2 ret;
-  ret.lo =
-      x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, -xx));
-  ret.hi = cp;
-
-  return ret;
-}
-
-#endif
--- a/libclc/generic/lib/math/sincos_helpers.h
+++ b/libclc/generic/lib/math/sincos_helpers.h
@@ -1,24 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <clc/clcfunc.h>
-#include <clc/clctypes.h>
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_DECL void __clc_remainder_piby2_medium(double x, private double *r,
-                                            private double *rr,
-                                            private int *regn);
-_CLC_DECL void __clc_remainder_piby2_large(double x, private double *r,
-                                           private double *rr,
-                                           private int *regn);
-_CLC_DECL double2 __clc_sincos_piby4(double x, double xx);
-
-#endif
--- a/libclc/generic/lib/math/tables.cl
+++ b/libclc/generic/lib/math/tables.cl
@@ -1,30 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <clc/clc.h>
-
-#include <clc/math/tables.h>
-
-DECLARE_TABLE(uchar, PIBITS_TBL, ) = {
-    224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175,
-    169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31,
-    235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38,
-    44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188,
-    188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247,
-    46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249,
-    125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8,
-    162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39,
-    168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
-    239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109,
-    3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13,
-    230, 139, 2, 0, 0, 0, 0, 0, 0, 0
-};
-
-uint4 TABLE_MANGLE(pibits_tbl)(size_t idx) {
-    return *(__constant uint4 *)(PIBITS_TBL + idx);
-}
--- a/libclc/spirv/lib/SOURCES
+++ b/libclc/spirv/lib/SOURCES
@@ -55,7 +55,6 @@ math/fma.cl
 ../../generic/lib/math/log2.cl
 ../../generic/lib/math/logb.cl
 ../../generic/lib/math/modf.cl
-../../generic/lib/math/tables.cl
 ../../generic/lib/math/pow.cl
 ../../generic/lib/math/pown.cl
 ../../generic/lib/math/powr.cl
@@ -64,7 +63,6 @@ math/fma.cl
 ../../generic/lib/math/rootn.cl
 ../../generic/lib/math/sin.cl
 ../../generic/lib/math/sincos.cl
-../../generic/lib/math/sincos_helpers.cl
 ../../generic/lib/math/sinh.cl
 ../../generic/lib/math/sinpi.cl
 ../../generic/lib/math/clc_tan.cl