[OpenMP] Remove use of '__AMDGCN_WAVEFRONT_SIZE' (#113156)

Summary: This is going to be deprecated in https://github.com/llvm/llvm-project/pull/112849. This patch ports it to use the builtin instead. This isn't a compile constant, so it could slightly negatively affect codegen. There really should be an IR pass to turn it into a constant if the function has known attributes. Using the builtin is correct when we just do it for knowing the size like we do here. Obviously guarding w32/w64 code with this check would be broken.
2024-11-25 07:38:28 -06:00
parent 52755ac253
commit 506ca19dc9
3 changed files with 34 additions and 19 deletions
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -25,7 +25,6 @@ namespace ompx {
 namespace impl {

 // Forward declarations defined to be defined for AMDGCN and NVPTX.
-const llvm::omp::GV &getGridValue();
 LaneMaskTy activemask();
 LaneMaskTy lanemaskLT();
 LaneMaskTy lanemaskGT();
@@ -37,15 +36,14 @@ uint32_t getBlockIdInKernel(int32_t Dim);
 uint32_t getNumberOfBlocksInKernel(int32_t Dim);
 uint32_t getWarpIdInBlock();
 uint32_t getNumberOfWarpsInBlock();
+uint32_t getWarpSize();

 /// AMDGCN Implementation
 ///
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})

-const llvm::omp::GV &getGridValue() {
-  return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
-}
+uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }

 uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
  switch (Dim) {
@@ -152,7 +150,7 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
  UNREACHABLE("Dim outside range!");
 }

-const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
+uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }

 LaneMaskTy activemask() { return __nvvm_activemask(); }

@@ -219,8 +217,6 @@ uint32_t getNumberOfWarpsInBlock() {
 #pragma omp end declare variant
 ///}

-uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
-
 } // namespace impl
 } // namespace ompx

--- a/offload/test/offloading/ompx_bare_ballot_sync.c
+++ b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -8,22 +8,33 @@
 #include <stdio.h>
 #include <stdlib.h>

+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
 int main(int argc, char *argv[]) {
  const int num_blocks = 1;
  const int block_size = 256;
  const int N = num_blocks * block_size;
  int *res = (int *)malloc(N * sizeof(int));

-#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) \
-        map(from: res[0:N])
+#pragma omp target teams ompx_bare num_teams(num_blocks)                       \
+    thread_limit(block_size) map(from : res[0 : N])
  {
    int tid = ompx_thread_id_x();
    uint64_t mask = ompx_ballot_sync(~0LU, tid & 0x1);
-#if defined __AMDGCN_WAVEFRONT_SIZE && __AMDGCN_WAVEFRONT_SIZE == 64
-    res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
-#else
-    res[tid] = mask == 0xaaaaaaaa;
-#endif
+    if (get_warp_size() == 64)
+      res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
+    else
+      res[tid] = mask == 0xaaaaaaaa;
  }

  for (int i = 0; i < N; ++i)
--- a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
+++ b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
@@ -10,6 +10,18 @@
 #include <ompx.h>
 #include <type_traits>

+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
 template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
 bool equal(T LHS, T RHS) {
  return LHS == RHS;
@@ -32,11 +44,7 @@ template <typename T> void test() {
  {
    int tid = ompx_thread_id_x();
    T val = ompx::shfl_down_sync(~0U, static_cast<T>(tid), 1);
-#ifdef __AMDGCN_WAVEFRONT_SIZE
-    int warp_size = __AMDGCN_WAVEFRONT_SIZE;
-#else
-    int warp_size = 32;
-#endif
+    int warp_size = get_warp_size();
    if ((tid & (warp_size - 1)) != warp_size - 1)
      res[tid] = equal(val, static_cast<T>(tid + 1));
    else