[libclc] Improving vector code generated from scalar code (#140008)
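The previous method split each vector into two halves, applied the function to each half, and relied on shuffle_vector to concatenate the two results into a vector of the original size. This PR eliminates the use of shuffle_vector by calling the scalar function on each element and building the result vector directly.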
This commit is contained in:
@@ -14,100 +14,140 @@
|
||||
|
||||
/*
 * _CLC_UNARY_VECTORIZE: defines one-argument FUNCTION overloads for 2-, 3-,
 * 4-, 8- and 16-element vectors of ARG1_TYPE, each returning the RET_TYPE
 * vector of the same width. DECLSPEC is placed in front of every generated
 * definition.
 *
 * NOTE(review): this span is a rendered diff, so every generated body shows
 * two return statements. Presumably the first one (operands .x/.y/.z or
 * .lo/.hi) is the form being removed and the second one (one FUNCTION call per
 * .s0 ... .sf component) is its replacement - confirm against the commit.
 */
#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \
|
||||
DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \
|
||||
return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \
|
||||
return (RET_TYPE##2)(FUNCTION(x.s0), FUNCTION(x.s1)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \
|
||||
return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z)); \
|
||||
return (RET_TYPE##3)(FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x) { \
|
||||
return (RET_TYPE##4)(FUNCTION(x.lo), FUNCTION(x.hi)); \
|
||||
return (RET_TYPE##4)(FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), \
|
||||
FUNCTION(x.s3)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x) { \
|
||||
return (RET_TYPE##8)(FUNCTION(x.lo), FUNCTION(x.hi)); \
|
||||
return (RET_TYPE##8)(FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), \
|
||||
FUNCTION(x.s3), FUNCTION(x.s4), FUNCTION(x.s5), \
|
||||
FUNCTION(x.s6), FUNCTION(x.s7)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x) { \
|
||||
return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi)); \
|
||||
return (RET_TYPE##16)( \
|
||||
FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \
|
||||
FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7), \
|
||||
FUNCTION(x.s8), FUNCTION(x.s9), FUNCTION(x.sa), FUNCTION(x.sb), \
|
||||
FUNCTION(x.sc), FUNCTION(x.sd), FUNCTION(x.se), FUNCTION(x.sf)); \
|
||||
}
|
||||
|
||||
/*
 * _CLC_BINARY_VECTORIZE: defines two-argument FUNCTION overloads for 2-, 3-,
 * 4-, 8- and 16-element vectors. x is an ARG1_TYPE vector, y is an ARG2_TYPE
 * vector of the same width, and the result is the RET_TYPE vector of that
 * width. DECLSPEC is placed in front of every generated definition.
 *
 * NOTE(review): this span is a rendered diff, so every generated body shows
 * two return statements. Presumably the first one (operands .x/.y/.z or
 * .lo/.hi) is the form being removed and the second one (FUNCTION applied to
 * matching .sN components of x and y) is its replacement - confirm against the
 * commit.
 */
#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \
|
||||
ARG2_TYPE) \
|
||||
DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \
|
||||
return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \
|
||||
return (RET_TYPE##2)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) { \
|
||||
return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y), \
|
||||
FUNCTION(x.z, y.z)); \
|
||||
return (RET_TYPE##3)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \
|
||||
FUNCTION(x.s2, y.s2)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y) { \
|
||||
return (RET_TYPE##4)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
|
||||
return (RET_TYPE##4)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \
|
||||
FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y) { \
|
||||
return (RET_TYPE##8)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
|
||||
return (RET_TYPE##8)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \
|
||||
FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \
|
||||
FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), \
|
||||
FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y) { \
|
||||
return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
|
||||
return (RET_TYPE##16)( \
|
||||
FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), \
|
||||
FUNCTION(x.s3, y.s3), FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), \
|
||||
FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7), FUNCTION(x.s8, y.s8), \
|
||||
FUNCTION(x.s9, y.s9), FUNCTION(x.sa, y.sa), FUNCTION(x.sb, y.sb), \
|
||||
FUNCTION(x.sc, y.sc), FUNCTION(x.sd, y.sd), FUNCTION(x.se, y.se), \
|
||||
FUNCTION(x.sf, y.sf)); \
|
||||
}
|
||||
|
||||
/*
 * _CLC_V_S_V_VECTORIZE: defines FUNCTION overloads whose first argument x is a
 * scalar ARG1_TYPE and whose second argument y is a 2-, 3-, 4-, 8- or
 * 16-element ARG2_TYPE vector. The same scalar x is passed to every inner
 * FUNCTION call, and the result is the RET_TYPE vector with y's width.
 * DECLSPEC is placed in front of every generated definition.
 *
 * NOTE(review): this span is a rendered diff, so every generated body shows
 * two return statements. Presumably the first one (operands y.x/y.y/y.z or
 * y.lo/y.hi) is the form being removed and the second one (one FUNCTION call
 * per y.sN component) is its replacement - confirm against the commit.
 */
#define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \
|
||||
ARG2_TYPE) \
|
||||
DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \
|
||||
return (RET_TYPE##2)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
|
||||
return (RET_TYPE##2)(FUNCTION(x, y.s0), FUNCTION(x, y.s1)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE x, ARG2_TYPE##3 y) { \
|
||||
return (RET_TYPE##3)(FUNCTION(x, y.x), FUNCTION(x, y.y), \
|
||||
FUNCTION(x, y.z)); \
|
||||
return (RET_TYPE##3)(FUNCTION(x, y.s0), FUNCTION(x, y.s1), \
|
||||
FUNCTION(x, y.s2)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE x, ARG2_TYPE##4 y) { \
|
||||
return (RET_TYPE##4)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
|
||||
return (RET_TYPE##4)(FUNCTION(x, y.s0), FUNCTION(x, y.s1), \
|
||||
FUNCTION(x, y.s2), FUNCTION(x, y.s3)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE x, ARG2_TYPE##8 y) { \
|
||||
return (RET_TYPE##8)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
|
||||
return (RET_TYPE##8)(FUNCTION(x, y.s0), FUNCTION(x, y.s1), \
|
||||
FUNCTION(x, y.s2), FUNCTION(x, y.s3), \
|
||||
FUNCTION(x, y.s4), FUNCTION(x, y.s5), \
|
||||
FUNCTION(x, y.s6), FUNCTION(x, y.s7)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE x, ARG2_TYPE##16 y) { \
|
||||
return (RET_TYPE##16)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
|
||||
return (RET_TYPE##16)( \
|
||||
FUNCTION(x, y.s0), FUNCTION(x, y.s1), FUNCTION(x, y.s2), \
|
||||
FUNCTION(x, y.s3), FUNCTION(x, y.s4), FUNCTION(x, y.s5), \
|
||||
FUNCTION(x, y.s6), FUNCTION(x, y.s7), FUNCTION(x, y.s8), \
|
||||
FUNCTION(x, y.s9), FUNCTION(x, y.sa), FUNCTION(x, y.sb), \
|
||||
FUNCTION(x, y.sc), FUNCTION(x, y.sd), FUNCTION(x, y.se), \
|
||||
FUNCTION(x, y.sf)); \
|
||||
}
|
||||
|
||||
/*
 * _CLC_TERNARY_VECTORIZE: defines three-argument FUNCTION overloads for 2-,
 * 3-, 4-, 8- and 16-element vectors. x, y and z are vectors of ARG1_TYPE,
 * ARG2_TYPE and ARG3_TYPE with the same width, and the result is the RET_TYPE
 * vector of that width. DECLSPEC is placed in front of every generated
 * definition.
 *
 * NOTE(review): this span is a rendered diff, so every generated body shows
 * two return statements. Presumably the first one (operands .x/.y/.z or
 * .lo/.hi) is the form being removed and the second one (FUNCTION applied to
 * matching .sN components of x, y and z) is its replacement - confirm against
 * the commit.
 */
#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \
|
||||
ARG2_TYPE, ARG3_TYPE) \
|
||||
DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \
|
||||
ARG3_TYPE##2 z) { \
|
||||
return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \
|
||||
return (RET_TYPE##2)(FUNCTION(x.s0, y.s0, z.s0), \
|
||||
FUNCTION(x.s1, y.s1, z.s1)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, \
|
||||
ARG3_TYPE##3 z) { \
|
||||
return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y), \
|
||||
FUNCTION(x.z, y.z, z.z)); \
|
||||
return (RET_TYPE##3)(FUNCTION(x.s0, y.s0, z.s0), \
|
||||
FUNCTION(x.s1, y.s1, z.s1), \
|
||||
FUNCTION(x.s2, y.s2, z.s2)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, \
|
||||
ARG3_TYPE##4 z) { \
|
||||
return (RET_TYPE##4)(FUNCTION(x.lo, y.lo, z.lo), \
|
||||
FUNCTION(x.hi, y.hi, z.hi)); \
|
||||
return (RET_TYPE##4)( \
|
||||
FUNCTION(x.s0, y.s0, z.s0), FUNCTION(x.s1, y.s1, z.s1), \
|
||||
FUNCTION(x.s2, y.s2, z.s2), FUNCTION(x.s3, y.s3, z.s3)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, \
|
||||
ARG3_TYPE##8 z) { \
|
||||
return (RET_TYPE##8)(FUNCTION(x.lo, y.lo, z.lo), \
|
||||
FUNCTION(x.hi, y.hi, z.hi)); \
|
||||
return (RET_TYPE##8)( \
|
||||
FUNCTION(x.s0, y.s0, z.s0), FUNCTION(x.s1, y.s1, z.s1), \
|
||||
FUNCTION(x.s2, y.s2, z.s2), FUNCTION(x.s3, y.s3, z.s3), \
|
||||
FUNCTION(x.s4, y.s4, z.s4), FUNCTION(x.s5, y.s5, z.s5), \
|
||||
FUNCTION(x.s6, y.s6, z.s6), FUNCTION(x.s7, y.s7, z.s7)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y, \
|
||||
ARG3_TYPE##16 z) { \
|
||||
return (RET_TYPE##16)(FUNCTION(x.lo, y.lo, z.lo), \
|
||||
FUNCTION(x.hi, y.hi, z.hi)); \
|
||||
return (RET_TYPE##16)( \
|
||||
FUNCTION(x.s0, y.s0, z.s0), FUNCTION(x.s1, y.s1, z.s1), \
|
||||
FUNCTION(x.s2, y.s2, z.s2), FUNCTION(x.s3, y.s3, z.s3), \
|
||||
FUNCTION(x.s4, y.s4, z.s4), FUNCTION(x.s5, y.s5, z.s5), \
|
||||
FUNCTION(x.s6, y.s6, z.s6), FUNCTION(x.s7, y.s7, z.s7), \
|
||||
FUNCTION(x.s8, y.s8, z.s8), FUNCTION(x.s9, y.s9, z.s9), \
|
||||
FUNCTION(x.sa, y.sa, z.sa), FUNCTION(x.sb, y.sb, z.sb), \
|
||||
FUNCTION(x.sc, y.sc, z.sc), FUNCTION(x.sd, y.sd, z.sd), \
|
||||
FUNCTION(x.se, y.se, z.se), FUNCTION(x.sf, y.sf, z.sf)); \
|
||||
}
|
||||
|
||||
#define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \
|
||||
@@ -115,48 +155,53 @@
|
||||
DECLSPEC __CLC_XCONCAT(RET_TYPE, 2) \
|
||||
FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 2) x, \
|
||||
ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 2) * y) { \
|
||||
return (__CLC_XCONCAT(RET_TYPE, 2))( \
|
||||
FUNCTION(x.x, (ADDR_SPACE ARG2_TYPE *)y), \
|
||||
FUNCTION(x.y, \
|
||||
(ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 1))); \
|
||||
ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \
|
||||
return (__CLC_XCONCAT(RET_TYPE, 2))(FUNCTION(x.s0, ptr), \
|
||||
FUNCTION(x.s1, ptr + 1)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC __CLC_XCONCAT(RET_TYPE, 3) \
|
||||
FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 3) x, \
|
||||
ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 3) * y) { \
|
||||
return (__CLC_XCONCAT(RET_TYPE, 3))( \
|
||||
FUNCTION(x.x, (ADDR_SPACE ARG2_TYPE *)y), \
|
||||
FUNCTION(x.y, \
|
||||
(ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 1)), \
|
||||
FUNCTION(x.z, \
|
||||
(ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 2))); \
|
||||
ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \
|
||||
return (__CLC_XCONCAT(RET_TYPE, 3))(FUNCTION(x.s0, ptr), \
|
||||
FUNCTION(x.s1, ptr + 1), \
|
||||
FUNCTION(x.s2, ptr + 2)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC __CLC_XCONCAT(RET_TYPE, 4) \
|
||||
FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 4) x, \
|
||||
ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 4) * y) { \
|
||||
ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \
|
||||
return (__CLC_XCONCAT(RET_TYPE, 4))( \
|
||||
FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 2) *)y), \
|
||||
FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT( \
|
||||
ARG2_TYPE, 2) *)((ADDR_SPACE ARG2_TYPE *)y + 2))); \
|
||||
FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \
|
||||
FUNCTION(x.s3, ptr + 3)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC __CLC_XCONCAT(RET_TYPE, 8) \
|
||||
FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 8) x, \
|
||||
ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 8) * y) { \
|
||||
ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \
|
||||
return (__CLC_XCONCAT(RET_TYPE, 8))( \
|
||||
FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 4) *)y), \
|
||||
FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT( \
|
||||
ARG2_TYPE, 4) *)((ADDR_SPACE ARG2_TYPE *)y + 4))); \
|
||||
FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \
|
||||
FUNCTION(x.s3, ptr + 3), FUNCTION(x.s4, ptr + 4), \
|
||||
FUNCTION(x.s5, ptr + 5), FUNCTION(x.s6, ptr + 6), \
|
||||
FUNCTION(x.s7, ptr + 7)); \
|
||||
} \
|
||||
\
|
||||
DECLSPEC __CLC_XCONCAT(RET_TYPE, 16) \
|
||||
FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 16) x, \
|
||||
ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 16) * y) { \
|
||||
ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \
|
||||
return (__CLC_XCONCAT(RET_TYPE, 16))( \
|
||||
FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 8) *)y), \
|
||||
FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT( \
|
||||
ARG2_TYPE, 8) *)((ADDR_SPACE ARG2_TYPE *)y + 8))); \
|
||||
FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \
|
||||
FUNCTION(x.s3, ptr + 3), FUNCTION(x.s4, ptr + 4), \
|
||||
FUNCTION(x.s5, ptr + 5), FUNCTION(x.s6, ptr + 6), \
|
||||
FUNCTION(x.s7, ptr + 7), FUNCTION(x.s8, ptr + 8), \
|
||||
FUNCTION(x.s9, ptr + 9), FUNCTION(x.sa, ptr + 10), \
|
||||
FUNCTION(x.sb, ptr + 11), FUNCTION(x.sc, ptr + 12), \
|
||||
FUNCTION(x.sd, ptr + 13), FUNCTION(x.se, ptr + 14), \
|
||||
FUNCTION(x.sf, ptr + 15)); \
|
||||
}
|
||||
|
||||
#define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, \
|
||||
|
||||
@@ -406,13 +406,13 @@ _CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_lgamma_r, float,
|
||||
#define v4 1.04222645593369134254e-01 /* 0x3FBAAE55, 0xD6537C88 */
|
||||
#define v5 3.21709242282423911810e-03 /* 0x3F6A5ABB, 0x57D0CF61 */
|
||||
|
||||
/*
 * Double-precision coefficients, listed twice by the diff rendering: first
 * under the names s0..s6, then with identical values under s0_d..s6_d.
 *
 * NOTE(review): presumably the _d suffix was added because object-like macros
 * named s0..s6 would be expanded by the preprocessor inside vector component
 * accessors such as x.s0 - confirm against the commit.
 */
#define s0 -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */
|
||||
#define s1 2.14982415960608852501e-01 /* 0x3FCB848B, 0x36E20878 */
|
||||
#define s2 3.25778796408930981787e-01 /* 0x3FD4D98F, 0x4F139F59 */
|
||||
#define s3 1.46350472652464452805e-01 /* 0x3FC2BB9C, 0xBEE5F2F7 */
|
||||
#define s4 2.66422703033638609560e-02 /* 0x3F9B481C, 0x7E939961 */
|
||||
#define s5 1.84028451407337715652e-03 /* 0x3F5E26B6, 0x7368F239 */
|
||||
#define s6 3.19475326584100867617e-05 /* 0x3F00BFEC, 0xDD17E945 */
|
||||
#define s0_d -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */
|
||||
#define s1_d 2.14982415960608852501e-01 /* 0x3FCB848B, 0x36E20878 */
|
||||
#define s2_d 3.25778796408930981787e-01 /* 0x3FD4D98F, 0x4F139F59 */
|
||||
#define s3_d 1.46350472652464452805e-01 /* 0x3FC2BB9C, 0xBEE5F2F7 */
|
||||
#define s4_d 2.66422703033638609560e-02 /* 0x3F9B481C, 0x7E939961 */
|
||||
#define s5_d 1.84028451407337715652e-03 /* 0x3F5E26B6, 0x7368F239 */
|
||||
#define s6_d 3.19475326584100867617e-05 /* 0x3F00BFEC, 0xDD17E945 */
|
||||
|
||||
#define r1 1.39200533467621045958e+00 /* 0x3FF645A7, 0x62C4AB74 */
|
||||
#define r2 7.21935547567138069525e-01 /* 0x3FE71A18, 0x93D3DCDC */
|
||||
@@ -530,10 +530,12 @@ _CLC_OVERLOAD _CLC_DEF double __clc_lgamma_r(double x, private int *ip) {
|
||||
__clc_fma(
|
||||
y,
|
||||
__clc_fma(
|
||||
y, __clc_fma(y, __clc_fma(y, __clc_fma(y, s6, s5), s4), s3),
|
||||
s2),
|
||||
s1),
|
||||
s0);
|
||||
y,
|
||||
__clc_fma(y, __clc_fma(y, __clc_fma(y, s6_d, s5_d), s4_d),
|
||||
s3_d),
|
||||
s2_d),
|
||||
s1_d),
|
||||
s0_d);
|
||||
double q = __clc_fma(
|
||||
y,
|
||||
__clc_fma(
|
||||
|
||||
Reference in New Issue
Block a user