clang-p2996/offload/test/offloading/default_thread_limit.c

Commit 3c8efd7928 by Johannes Doerfert, 2024-07-23 09:02:47 -07:00:
[OpenMP] Ensure the actual kernel is annotated with launch bounds (#99927)

In debug mode there is a wrapper (the kernel) around the function in
which we generate the kernel code. We previously worked around this to
recover the correct kernel name; now the two are properly distinguished
so that the launch bounds are attached to the kernel itself, not the
inner function.

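As a minimal standalone sketch of what the test below checks (our own
illustration, not part of the commit; it assumes an AMDGPU offload build
of clang with --offload-arch=native support, and the file name
bounds_demo.c is ours), a compile-time-constant thread_limit should be
reported both as the runtime's thread limit and as the kernel's
MaxFlatWorkGroupSize, with and without -g:

  // bounds_demo.c -- illustrative sketch only, not part of the test suite.
  // Build: clang -fopenmp --offload-arch=native bounds_demo.c -o bounds_demo
  // Run:   env LIBOMPTARGET_INFO=16 ./bounds_demo   (then again with -g)
  #include <stdio.h>

  int main(void) {
    int sum = 0;
    // The constant limit lets the compiler annotate the kernel, so the
    // launch diagnostics should show 64 for both values.
  #pragma omp target teams distribute parallel for thread_limit(64) reduction(+ : sum)
    for (int i = 0; i < (1 << 20); ++i)
      sum += 1;
    printf("sum = %d\n", sum);
    return 0;
  }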

// clang-format off
// RUN: %libomptarget-compile-generic
// RUN: env LIBOMPTARGET_INFO=16 \
// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
// RUN: %libomptarget-compile-generic -g
// RUN: env LIBOMPTARGET_INFO=16 \
// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT

// REQUIRES: amdgpu
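
// optnone() always returns 1, but the call cannot be folded away, so
// expressions like optnone() * 42 below are not compile-time constants.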
__attribute__((optnone)) int optnone() { return 1; }
int main() {
  int N = optnone() * 4098 * 32;

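// With no explicit limit, the runtime picks the target's default team
// size and the kernel carries a matching flat work-group size bound.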
// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
#pragma omp target teams distribute parallel for simd
  for (int i = 0; i < N; ++i) {
    optnone();
  }
// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
#pragma omp target teams distribute parallel for simd
  for (int i = 0; i < N; ++i) {
    optnone();
  }
// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
#pragma omp target teams distribute parallel for simd
  for (int i = 0; i < N; ++i) {
    optnone();
  }
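// The same default applies when target and teams are split across two
// directives.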
// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
#pragma omp target
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
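// thread_limit is not a compile-time constant here, so the runtime caps
// the team size at 42 while the kernel keeps the default 1024 bound.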
// DEFAULT: 42 (MaxFlatWorkGroupSize: 1024
#pragma omp target thread_limit(optnone() * 42)
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
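// ompx_attribute pins the kernel's flat work-group size to 42, matching
// the runtime thread limit.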
// DEFAULT: 42 (MaxFlatWorkGroupSize: 42
#pragma omp target thread_limit(optnone() * 42) ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
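// The attribute alone is enough: the runtime derives the 42 limit from
// the kernel's bound.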
// DEFAULT: 42 (MaxFlatWorkGroupSize: 42
#pragma omp target ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
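// Non-constant num_threads/thread_limit values leave the kernel at the
// default 1024 bound.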
// DEFAULT: MaxFlatWorkGroupSize: 1024
#pragma omp target
#pragma omp teams distribute parallel for num_threads(optnone() * 42)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
// DEFAULT: MaxFlatWorkGroupSize: 1024
#pragma omp target teams distribute parallel for thread_limit(optnone() * 42)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
// DEFAULT: MaxFlatWorkGroupSize: 1024
#pragma omp target teams distribute parallel for num_threads(optnone() * 42)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
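// Constant num_threads/thread_limit values, on split or combined
// constructs, propagate into the kernel's launch bounds.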
// DEFAULT: 9 (MaxFlatWorkGroupSize: 9
#pragma omp target
#pragma omp teams distribute parallel for num_threads(9)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
// DEFAULT: 4 (MaxFlatWorkGroupSize: 4
#pragma omp target thread_limit(4)
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
// DEFAULT: 4 (MaxFlatWorkGroupSize: 4
#pragma omp target
#pragma omp teams distribute parallel for thread_limit(4)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
// DEFAULT: 9 (MaxFlatWorkGroupSize: 9
#pragma omp target teams distribute parallel for num_threads(9)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
// DEFAULT: 4 (MaxFlatWorkGroupSize: 4
#pragma omp target teams distribute parallel for simd thread_limit(4)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
}