[libclc] Optimize CLC vector relational builtins (#124537)

Clang knows how to perform relational operations on OpenCL vectors, so we don't need to use the Clang builtins. The builtins we were using didn't support vector types, so we were previously scalarizing. This commit generates the same LLVM fcmp operations as before, just without the scalarization.
2025-01-27 13:25:37 +00:00
parent ef54e0bbfb
commit 347fb208c1
8 changed files with 90 additions and 130 deletions
--- a/libclc/clc/include/clc/relational/relational.h
+++ b/libclc/clc/include/clc/relational/relational.h
@@ -142,4 +142,30 @@
  _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE,         \
                                        ARG1_TYPE)

+#define _CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(RET_TYPE, RET_TYPE_VEC, FUNCTION, \
+                                             ARG1_TYPE, ARG2_TYPE)             \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) {         \
+    return _CLC_RELATIONAL_OP(x, y);                                           \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##2 FUNCTION(ARG1_TYPE##2 x,              \
+                                                  ARG2_TYPE##2 y) {            \
+    return _CLC_RELATIONAL_OP(x, y);                                           \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##3 FUNCTION(ARG1_TYPE##3 x,              \
+                                                  ARG2_TYPE##3 y) {            \
+    return _CLC_RELATIONAL_OP(x, y);                                           \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##4 FUNCTION(ARG1_TYPE##4 x,              \
+                                                  ARG2_TYPE##4 y) {            \
+    return _CLC_RELATIONAL_OP(x, y);                                           \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##8 FUNCTION(ARG1_TYPE##8 x,              \
+                                                  ARG2_TYPE##8 y) {            \
+    return _CLC_RELATIONAL_OP(x, y);                                           \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##16 FUNCTION(ARG1_TYPE##16 x,            \
+                                                   ARG2_TYPE##16 y) {          \
+    return _CLC_RELATIONAL_OP(x, y);                                           \
+  }
+
 #endif // __CLC_RELATIONAL_RELATIONAL_H__
--- a/libclc/clc/lib/generic/relational/clc_isequal.cl
+++ b/libclc/clc/lib/generic/relational/clc_isequal.cl
@@ -1,44 +1,28 @@
 #include <clc/internal/clc.h>
+#include <clc/relational/relational.h>

-#define _CLC_DEFINE_ISEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)          \
-  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) {         \
-    return (x == y);                                                           \
-  }
+#define _CLC_RELATIONAL_OP(X, Y) (X) == (Y)

-_CLC_DEFINE_ISEQUAL(int, __clc_isequal, float, float)
-_CLC_DEFINE_ISEQUAL(int2, __clc_isequal, float2, float2)
-_CLC_DEFINE_ISEQUAL(int3, __clc_isequal, float3, float3)
-_CLC_DEFINE_ISEQUAL(int4, __clc_isequal, float4, float4)
-_CLC_DEFINE_ISEQUAL(int8, __clc_isequal, float8, float8)
-_CLC_DEFINE_ISEQUAL(int16, __clc_isequal, float16, float16)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isequal, float, float)

 #ifdef cl_khr_fp64

 #pragma OPENCL EXTENSION cl_khr_fp64 : enable

-// The scalar version of __clc_isequal(double) returns an int, but the vector
-// versions return long.
-_CLC_DEFINE_ISEQUAL(int, __clc_isequal, double, double)
-_CLC_DEFINE_ISEQUAL(long2, __clc_isequal, double2, double2)
-_CLC_DEFINE_ISEQUAL(long3, __clc_isequal, double3, double3)
-_CLC_DEFINE_ISEQUAL(long4, __clc_isequal, double4, double4)
-_CLC_DEFINE_ISEQUAL(long8, __clc_isequal, double8, double8)
-_CLC_DEFINE_ISEQUAL(long16, __clc_isequal, double16, double16)
+// The scalar version of __clc_isequal(double, double) returns an int, but the
+// vector versions return long.
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isequal, double, double)

 #endif
+
 #ifdef cl_khr_fp16

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

-// The scalar version of __clc_isequal(half) returns an int, but the vector
-// versions return short.
-_CLC_DEFINE_ISEQUAL(int, __clc_isequal, half, half)
-_CLC_DEFINE_ISEQUAL(short2, __clc_isequal, half2, half2)
-_CLC_DEFINE_ISEQUAL(short3, __clc_isequal, half3, half3)
-_CLC_DEFINE_ISEQUAL(short4, __clc_isequal, half4, half4)
-_CLC_DEFINE_ISEQUAL(short8, __clc_isequal, half8, half8)
-_CLC_DEFINE_ISEQUAL(short16, __clc_isequal, half16, half16)
+// The scalar version of __clc_isequal(half, half) returns an int, but the
+// vector versions return short.
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isequal, half, half)

 #endif

-#undef _CLC_DEFINE_ISEQUAL
+#undef _CLC_RELATIONAL_OP
--- a/libclc/clc/lib/generic/relational/clc_isgreater.cl
+++ b/libclc/clc/lib/generic/relational/clc_isgreater.cl
@@ -1,12 +1,9 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>

-// Note: It would be nice to use __builtin_isgreater with vector inputs, but it
-// seems to only take scalar values as input, which will produce incorrect
-// output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) (X) > (Y)

-_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreater, __builtin_isgreater, float,
-                              float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isgreater, float, float)

 #ifdef cl_khr_fp64

@@ -14,12 +11,7 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreater, __builtin_isgreater, float,

 // The scalar version of __clc_isgreater(double, double) returns an int, but the
 // vector versions return long.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isgreater(double x, double y) {
-  return __builtin_isgreater(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreater, double, double)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isgreater, double, double)

 #endif

@@ -29,11 +21,8 @@ _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreater, double, double)

 // The scalar version of __clc_isgreater(half, half) returns an int, but the
 // vector versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isgreater(half x, half y) {
-  return __builtin_isgreater(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isgreater, half, half)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isgreater, half, half)

 #endif
+
+#undef _CLC_RELATIONAL_OP
--- a/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl
+++ b/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl
@@ -1,12 +1,10 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>

-// Note: It would be nice to use __builtin_isgreaterequal with vector inputs,
-// but it seems to only take scalar values as input, which will produce
-// incorrect output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) (X) >= (Y)

-_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreaterequal,
-                              __builtin_isgreaterequal, float, float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isgreaterequal, float,
+                                     float)

 #ifdef cl_khr_fp64

@@ -14,26 +12,20 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreaterequal,

 // The scalar version of __clc_isgreaterequal(double, double) returns an int,
 // but the vector versions return long.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isgreaterequal(double x, double y) {
-  return __builtin_isgreaterequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreaterequal, double,
-                                      double)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isgreaterequal, double,
+                                     double)

 #endif
+
 #ifdef cl_khr_fp16

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

-// The scalar version of __clc_isgreaterequal(half, half) returns an int, but
+// The scalar version of __clc_isgreaterequal(half, half) returns an int, but
 // the vector versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isgreaterequal(half x, half y) {
-  return __builtin_isgreaterequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isgreaterequal, half, half)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isgreaterequal, half,
+                                     half)

 #endif
+
+#undef _CLC_RELATIONAL_OP
--- a/libclc/clc/lib/generic/relational/clc_isless.cl
+++ b/libclc/clc/lib/generic/relational/clc_isless.cl
@@ -1,37 +1,28 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>

-// Note: It would be nice to use __builtin_isless with vector inputs, but it
-// seems to only take scalar values as input, which will produce incorrect
-// output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) (X) < (Y)

-_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isless, __builtin_isless, float, float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isless, float, float)

 #ifdef cl_khr_fp64

 #pragma OPENCL EXTENSION cl_khr_fp64 : enable

-// The scalar version of __clc_isless(double, double) returns an int, but the
-// vector versions return long.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isless(double x, double y) {
-  return __builtin_isless(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isless, double, double)
+// The scalar version of __clc_isless(double, double) returns an int, but
+// the vector versions return long.
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isless, double, double)

 #endif
+
 #ifdef cl_khr_fp16

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

-// The scalar version of __clc_isless(half, half) returns an int, but the vector
-// versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isless(half x, half y) {
-  return __builtin_isless(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isless, half, half)
+// The scalar version of __clc_isless(half, half) returns an int, but the
+// vector versions return short.
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isless, half, half)

 #endif
+
+#undef _CLC_RELATIONAL_OP
--- a/libclc/clc/lib/generic/relational/clc_islessequal.cl
+++ b/libclc/clc/lib/generic/relational/clc_islessequal.cl
@@ -1,12 +1,9 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>

-// Note: It would be nice to use __builtin_islessequal with vector inputs, but
-// it seems to only take scalar values as input, which will produce incorrect
-// output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) (X) <= (Y)

-_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessequal, __builtin_islessequal,
-                              float, float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_islessequal, float, float)

 #ifdef cl_khr_fp64

@@ -14,12 +11,8 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessequal, __builtin_islessequal,

 // The scalar version of __clc_islessequal(double, double) returns an int, but
 // the vector versions return long.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_islessequal(double x, double y) {
-  return __builtin_islessequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessequal, double, double)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_islessequal, double,
+                                     double)

 #endif

@@ -29,11 +22,8 @@ _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessequal, double, double)

 // The scalar version of __clc_islessequal(half, half) returns an int, but the
 // vector versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_islessequal(half x, half y) {
-  return __builtin_islessequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_islessequal, half, half)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_islessequal, half, half)

 #endif
+
+#undef _CLC_RELATIONAL_OP
--- a/libclc/clc/lib/generic/relational/clc_islessgreater.cl
+++ b/libclc/clc/lib/generic/relational/clc_islessgreater.cl
@@ -1,12 +1,10 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>

-// Note: It would be nice to use __builtin_islessgreater with vector inputs, but
-// it seems to only take scalar values as input, which will produce incorrect
-// output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) ((X) < (Y)) || ((X) > (Y))

-_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessgreater, __builtin_islessgreater,
-                              float, float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_islessgreater, float,
+                                     float)

 #ifdef cl_khr_fp64

@@ -14,25 +12,20 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessgreater, __builtin_islessgreater,

 // The scalar version of __clc_islessgreater(double, double) returns an int, but
 // the vector versions return long.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_islessgreater(double x, double y) {
-  return __builtin_islessgreater(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessgreater, double, double)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_islessgreater, double,
+                                     double)

 #endif
+
 #ifdef cl_khr_fp16

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

 // The scalar version of __clc_islessgreater(half, half) returns an int, but the
 // vector versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_islessgreater(half x, half y) {
-  return __builtin_islessgreater(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_islessgreater, half, half)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_islessgreater, half,
+                                     half)

 #endif
+
+#undef _CLC_RELATIONAL_OP
--- a/libclc/clc/lib/generic/relational/clc_isnotequal.cl
+++ b/libclc/clc/lib/generic/relational/clc_isnotequal.cl
@@ -1,33 +1,28 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>

-#define _CLC_DEFINE_ISNOTEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)       \
-  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) {         \
-    return (x != y);                                                           \
-  }
+#define _CLC_RELATIONAL_OP(X, Y) (X) != (Y)

-_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, float, float)
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, __clc_isnotequal, float, float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isnotequal, float, float)

 #ifdef cl_khr_fp64
+
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable

 // The scalar version of __clc_isnotequal(double, double) returns an int, but
 // the vector versions return long.
-
-_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, double, double)
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isnotequal, double, double)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isnotequal, double, double)

 #endif
+
 #ifdef cl_khr_fp16
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

 // The scalar version of __clc_isnotequal(half, half) returns an int, but the
 // vector versions return short.
-
-_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, half, half)
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isnotequal, half, half)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isnotequal, half, half)

 #endif

-#undef _CLC_DEFINE_ISNOTEQUAL
+#undef _CLC_RELATIONAL_OP