[libc][gpu] Add Atan2 Benchmarks (#104708)
This PR adds benchmarking for `atan2()`, `__nv_atan2()`, and `__ocml_atan2_f64()` using the same setup as `sin()`. This PR also adds support for throughput benchmarking for functions with 2 inputs.
This commit is contained in:
@@ -115,7 +115,7 @@ void print_results(Benchmark *b) {
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
|
||||
|
||||
LIBC_NAMESPACE::printf(
|
||||
"%-20s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
|
||||
"%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
|
||||
b->get_test_name().data(), result.cycles, result.min, result.max,
|
||||
result.total_iterations, result.total_time, time_unit,
|
||||
static_cast<uint64_t>(result.standard_deviation), num_threads);
|
||||
@@ -127,7 +127,7 @@ void print_header() {
|
||||
benchmarks[0]->get_suite_name().data());
|
||||
LIBC_NAMESPACE::printf("%s", RESET);
|
||||
cpp::string titles =
|
||||
"Benchmark | Cycles | Min | Max | "
|
||||
"Benchmark | Cycles | Min | Max | "
|
||||
"Iterations | Time / Iteration | Stddev | Threads |\n";
|
||||
LIBC_NAMESPACE::printf(titles.data());
|
||||
|
||||
|
||||
@@ -146,10 +146,8 @@ template <typename T> class MathPerf {
|
||||
cpp::numeric_limits<StorageType>::max();
|
||||
|
||||
public:
|
||||
typedef T Func(T);
|
||||
|
||||
template <size_t N = 1>
|
||||
static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) {
|
||||
static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
|
||||
cpp::array<T, N> inputs;
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
inputs[i] = get_rand_input<T>(min_exp, max_exp);
|
||||
@@ -158,6 +156,23 @@ public:
|
||||
|
||||
return total_time / N;
|
||||
}
|
||||
|
||||
// Throughput benchmarking for functions that take 2 inputs.
|
||||
template <size_t N = 1>
|
||||
static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
|
||||
int arg1_max_exp, int arg2_min_exp,
|
||||
int arg2_max_exp) {
|
||||
cpp::array<T, N> inputs1;
|
||||
cpp::array<T, N> inputs2;
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
|
||||
inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
|
||||
}
|
||||
|
||||
uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
|
||||
|
||||
return total_time / N;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace benchmarks
|
||||
|
||||
@@ -43,3 +43,22 @@ add_benchmark(
|
||||
LOADER_ARGS
|
||||
--threads 64
|
||||
)
|
||||
|
||||
# Registers the atan2_benchmark target in the libc-gpu-math-benchmarks suite.
# It is built from atan2_benchmark.cpp with ${math_benchmark_flags}, depends on
# the libc atan2, srand/rand, fp_bits, bit and array targets listed below, and
# passes "--threads 64" as loader arguments.
add_benchmark(
|
||||
atan2_benchmark
|
||||
SUITE
|
||||
libc-gpu-math-benchmarks
|
||||
SRCS
|
||||
atan2_benchmark.cpp
|
||||
DEPENDS
|
||||
libc.src.math.atan2
|
||||
libc.src.stdlib.srand
|
||||
libc.src.stdlib.rand
|
||||
libc.src.__support.FPUtil.fp_bits
|
||||
libc.src.__support.CPP.bit
|
||||
libc.src.__support.CPP.array
|
||||
COMPILE_OPTIONS
|
||||
${math_benchmark_flags}
|
||||
LOADER_ARGS
|
||||
--threads 64
|
||||
)
|
||||
|
||||
47
libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
Normal file
47
libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
Normal file
@@ -0,0 +1,47 @@
|
||||
#include "benchmarks/gpu/LibcGpuBenchmark.h"
|
||||
|
||||
#include "src/math/atan2.h"
|
||||
#include "src/stdlib/rand.h"
|
||||
|
||||
#ifdef NVPTX_MATH_FOUND
|
||||
#include "src/math/nvptx/declarations.h"
|
||||
#endif
|
||||
|
||||
#ifdef AMDGPU_MATH_FOUND
|
||||
#include "src/math/amdgpu/declarations.h"
|
||||
#endif
|
||||
|
||||
// Expands to a captureless lambda that returns
// MathPerf<T>::run_throughput_in_range<N>(Func, MIN_EXP, MAX_EXP, MIN_EXP,
// MAX_EXP). The same MIN_EXP/MAX_EXP pair is passed for both arguments of
// Func, so both inputs share one set of bounds.
#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \
|
||||
[]() { \
|
||||
return LIBC_NAMESPACE::benchmarks::MathPerf<T>::run_throughput_in_range< \
|
||||
N>(Func, MIN_EXP, MAX_EXP, MIN_EXP, MAX_EXP); \
|
||||
}
|
||||
|
||||
// Registers four SINGLE_WAVE_BENCHMARK entries in the
// LlvmLibcAtan2GpuBenchmark suite for one function/bounds combination. The
// entries are named Name_1, Name_128, Name_1024 and Name_4096 and use
// BM_TWO_RANDOM_INPUT with N = 1, 128, 1024 and 4096 respectively.
#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP) \
|
||||
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1, \
|
||||
BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1)); \
|
||||
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_128, \
|
||||
BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128)); \
|
||||
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1024, \
|
||||
BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \
|
||||
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_4096, \
|
||||
BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096))
|
||||
|
||||
// LIBC_NAMESPACE::atan2 on double, registered with four (MIN_EXP, MAX_EXP)
// pairs. NOTE(review): the bounds are presumably binary exponents handed to
// get_rand_input -- confirm against its definition.
BENCH(double, Atan2, LIBC_NAMESPACE::atan2, -1023, 1023);
|
||||
BENCH(double, Atan2TwoPi, LIBC_NAMESPACE::atan2, -10, 3);
|
||||
BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30);
|
||||
BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000);
|
||||
|
||||
// Same four bounds for __nv_atan2, only when NVPTX_MATH_FOUND is defined.
#ifdef NVPTX_MATH_FOUND
|
||||
BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023);
|
||||
BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3);
|
||||
BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30);
|
||||
BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000);
|
||||
#endif
|
||||
|
||||
// Same four bounds for __ocml_atan2_f64, only when AMDGPU_MATH_FOUND is
// defined.
#ifdef AMDGPU_MATH_FOUND
|
||||
BENCH(double, AmdAtan2, LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023);
|
||||
BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3);
|
||||
BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30);
|
||||
BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000);
|
||||
#endif
|
||||
@@ -130,6 +130,31 @@ throughput(F f, const cpp::array<T, N> &inputs) {
|
||||
return stop - start;
|
||||
}
|
||||
|
||||
// Provides throughput benchmarking for 2 arguments (e.g. atan2())
//
// Calls f(inputs1[i], inputs2[i]) for every index of inputs1 and returns the
// difference between gpu::processor_clock() read after the loop and the value
// read before it. Both arrays have the same extent N by construction of the
// signature. The empty asm statements only list values as input operands;
// presumably they keep the compiler from eliding or moving the computations
// they name.
|
||||
template <typename F, typename T, size_t N>
|
||||
[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
|
||||
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
|
||||
// Pass the addresses of both input arrays to an empty asm statement.
asm("" ::"v"(&inputs1), "v"(&inputs2));
|
||||
|
||||
gpu::memory_fence();
|
||||
// Clock value taken before the measured loop.
uint64_t start = gpu::processor_clock();
|
||||
|
||||
asm("" ::"s"(start));
|
||||
|
||||
for (size_t i = 0; i < inputs1.size(); i++) {
|
||||
auto result = f(inputs1[i], inputs2[i]);
|
||||
|
||||
// Feed each result to an empty asm statement so it counts as used.
asm("" ::"v"(result));
|
||||
}
|
||||
|
||||
// Clock value taken after the measured loop.
uint64_t stop = gpu::processor_clock();
|
||||
asm("" ::"s"(stop));
|
||||
gpu::memory_fence();
|
||||
|
||||
// Return the time elapsed.
|
||||
return stop - start;
|
||||
}
|
||||
|
||||
} // namespace LIBC_NAMESPACE_DECL
|
||||
|
||||
#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
|
||||
|
||||
@@ -121,6 +121,32 @@ throughput(F f, const cpp::array<T, N> &inputs) {
|
||||
// Return the time elapsed.
|
||||
return stop - start;
|
||||
}
|
||||
|
||||
// Provides throughput benchmarking for 2 arguments (e.g. atan2())
//
// Calls f(inputs1[i], inputs2[i]) for every index of inputs1 and returns the
// difference between gpu::processor_clock() read after the loop and the value
// read before it. Both arrays have the same extent N by construction of the
// signature. The empty asm statements only list values as input operands;
// presumably they keep the compiler from eliding or moving the computations
// they name.
|
||||
template <typename F, typename T, size_t N>
|
||||
[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
|
||||
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
|
||||
// Pass the addresses of both input arrays to an empty asm statement.
asm("" ::"r"(&inputs1), "r"(&inputs2));
|
||||
|
||||
gpu::memory_fence();
|
||||
// Clock value taken before the measured loop.
uint64_t start = gpu::processor_clock();
|
||||
|
||||
asm("" ::"llr"(start));
|
||||
|
||||
// NOTE(review): result is a uint64_t, so whatever f returns is converted to
// an unsigned integer on every iteration. If f returns a floating-point type
// (e.g. atan2 on double), negative results would be out of range for that
// conversion, and the conversion itself runs inside the timed loop -- confirm
// this is intended. result is also left uninitialized if the loop never runs.
uint64_t result;
|
||||
for (size_t i = 0; i < inputs1.size(); i++) {
|
||||
result = f(inputs1[i], inputs2[i]);
|
||||
asm("" ::"r"(result));
|
||||
}
|
||||
|
||||
// Clock value taken after the measured loop.
uint64_t stop = gpu::processor_clock();
|
||||
gpu::memory_fence();
|
||||
asm("" ::"r"(stop));
|
||||
// Copy the last result into a volatile local; presumably this keeps the
// computed value from being treated as unused.
volatile auto output = result;
|
||||
|
||||
// Return the time elapsed.
|
||||
return stop - start;
|
||||
}
|
||||
} // namespace LIBC_NAMESPACE_DECL
|
||||
|
||||
#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
|
||||
|
||||
Reference in New Issue
Block a user