[libc][gpu] Add Atan2 Benchmarks (#104708)

This PR adds benchmarking for `atan2()`, `__nv_atan2()`, and `__ocml_atan2_f64()` using the same setup as `sin()`. This PR also adds support for throughput benchmarking for functions with 2 inputs.
2024-08-18 13:50:30 -04:00
parent 5c13f9aea2
commit deb6b45c32
6 changed files with 137 additions and 5 deletions
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -115,7 +115,7 @@ void print_results(Benchmark *b) {
  cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);

  LIBC_NAMESPACE::printf(
-      "%-20s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
+      "%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
      b->get_test_name().data(), result.cycles, result.min, result.max,
      result.total_iterations, result.total_time, time_unit,
      static_cast<uint64_t>(result.standard_deviation), num_threads);
@@ -127,7 +127,7 @@ void print_header() {
                         benchmarks[0]->get_suite_name().data());
  LIBC_NAMESPACE::printf("%s", RESET);
  cpp::string titles =
-      "Benchmark            |  Cycles |     Min |     Max | "
+      "Benchmark                |  Cycles |     Min |     Max | "
      "Iterations | Time / Iteration |   Stddev |  Threads |\n";
  LIBC_NAMESPACE::printf(titles.data());

--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -146,10 +146,8 @@ template <typename T> class MathPerf {
      cpp::numeric_limits<StorageType>::max();

 public:
-  typedef T Func(T);
-
  template <size_t N = 1>
-  static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) {
+  static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
    cpp::array<T, N> inputs;
    for (size_t i = 0; i < N; ++i)
      inputs[i] = get_rand_input<T>(min_exp, max_exp);
@@ -158,6 +156,23 @@ public:

    return total_time / N;
  }
+
+  // Two-input overload: throughput() time for N input pairs, divided by N.
+  template <size_t N = 1>
+  static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
+                                          int arg1_max_exp, int arg2_min_exp,
+                                          int arg2_max_exp) {
+    cpp::array<T, N> inputs1;
+    cpp::array<T, N> inputs2;
+    for (size_t i = 0; i < N; ++i) {
+      inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
+      inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
+    }
+
+    uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
+
+    return total_time / N; // integer average over the N calls of f
+  }
 };

 } // namespace benchmarks
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -43,3 +43,22 @@ add_benchmark(
  LOADER_ARGS
    --threads 64
 )
+
+add_benchmark(
+  atan2_benchmark
+  SUITE
+    libc-gpu-math-benchmarks
+  SRCS
+    atan2_benchmark.cpp
+  DEPENDS
+    libc.src.math.atan2
+    libc.src.stdlib.srand
+    libc.src.stdlib.rand
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.CPP.bit
+    libc.src.__support.CPP.array
+  COMPILE_OPTIONS
+    ${math_benchmark_flags}
+  LOADER_ARGS
+    --threads 64
+)
--- a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
@@ -0,0 +1,47 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/math/atan2.h"
+#include "src/stdlib/rand.h"
+
+#ifdef NVPTX_MATH_FOUND
+#include "src/math/nvptx/declarations.h"
+#endif
+
+#ifdef AMDGPU_MATH_FOUND
+#include "src/math/amdgpu/declarations.h"
+#endif
+
+#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N)                      \
+  []() {                                                                       \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<T>::run_throughput_in_range<   \
+        N>(Func, MIN_EXP, MAX_EXP, MIN_EXP, MAX_EXP);                          \
+  }
+
+#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP)                                 \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1,                   \
+                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1));    \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_128,                 \
+                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128));  \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1024,                \
+                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_4096,                \
+                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096))
+
+BENCH(double, Atan2, LIBC_NAMESPACE::atan2, -1023, 1023);
+BENCH(double, Atan2TwoPi, LIBC_NAMESPACE::atan2, -10, 3);
+BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30);
+BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000);
+
+#ifdef NVPTX_MATH_FOUND
+BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023);
+BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3);
+BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30);
+BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000);
+#endif
+
+#ifdef AMDGPU_MATH_FOUND
+BENCH(double, AmdAtan2, LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023);
+BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3);
+BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30);
+BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000);
+#endif
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -130,6 +130,31 @@ throughput(F f, const cpp::array<T, N> &inputs) {
  return stop - start;
 }

+// Throughput for 2-argument functions (e.g. atan2()): clock delta over N calls.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
+    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+  asm("" ::"v"(&inputs1), "v"(&inputs2)); // empty asm taking both array addresses
+
+  gpu::memory_fence();
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"s"(start)); // empty asm consuming the start reading
+
+  for (size_t i = 0; i < inputs1.size(); i++) {
+    auto result = f(inputs1[i], inputs2[i]); // i-th pair: (inputs1[i], inputs2[i])
+
+    asm("" ::"v"(result)); // presumably keeps the otherwise-unused result alive
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  asm("" ::"s"(stop)); // empty asm consuming the stop reading
+  gpu::memory_fence();
+
+  // Difference of the two processor_clock() readings around all N calls.
+  return stop - start;
+}
+
 } // namespace LIBC_NAMESPACE_DECL

 #endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -121,6 +121,32 @@ throughput(F f, const cpp::array<T, N> &inputs) {
  // Return the time elapsed.
  return stop - start;
 }
+
+// Throughput for 2-argument functions (e.g. atan2()): clock delta over N calls.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
+    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+  asm("" ::"r"(&inputs1), "r"(&inputs2)); // empty asm taking both array addresses
+
+  gpu::memory_fence();
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"llr"(start)); // empty asm consuming the start reading
+
+  uint64_t result; // NOTE(review): f's return value is converted to uint64_t
+  for (size_t i = 0; i < inputs1.size(); i++) {
+    result = f(inputs1[i], inputs2[i]); // i-th pair: (inputs1[i], inputs2[i])
+    asm("" ::"r"(result)); // presumably keeps the result from being discarded
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  gpu::memory_fence();
+  asm("" ::"r"(stop)); // empty asm consuming the stop reading
+  volatile auto output = result; // volatile copy of the last result; not read again
+
+  // Difference of the two processor_clock() readings around all N calls.
+  return stop - start;
+}
 } // namespace LIBC_NAMESPACE_DECL

 #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX