From 66fca0674d83254c70af4a6289496b8acc4377df Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 1 Apr 2025 10:29:08 +0100 Subject: [PATCH] [OpenMP] Fix num_iters in __kmpc_*_loop DeviceRTL functions (#133435) This patch removes the addition of 1 to the number of iterations when calling the following DeviceRTL functions: - `__kmpc_distribute_for_static_loop*` - `__kmpc_distribute_static_loop*` - `__kmpc_for_static_loop*` Calls to these functions are currently only produced by the OMPIRBuilder from flang, which already passes the correct number of iterations to these functions. By adding 1 to the received `num_iters` variable, worksharing can produce incorrect results. This impacts flang OpenMP offloading of `do`, `distribute` and `distribute parallel do` constructs. Requiring the application to pass `tripcount - 1` as the argument would be surprising as well, so rather than updating flang I think it makes more sense to update the runtime. --- offload/DeviceRTL/src/Workshare.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index 861b9ca371cc..a8759307b42b 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -911,19 +911,19 @@ public: IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ TY num_threads, TY block_chunk, TY thread_chunk) { \ ompx::StaticLoopChunker::DistributeFor( \ - loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk); \ + loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk); \ } \ [[gnu::flatten, clang::always_inline]] void \ __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \ void *arg, TY num_iters, \ TY block_chunk) { \ - ompx::StaticLoopChunker::Distribute(loc, fn, arg, num_iters + 1, \ + ompx::StaticLoopChunker::Distribute(loc, fn, arg, num_iters, \ block_chunk); \ } \ [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \ 
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ TY num_threads, TY thread_chunk) { \ - ompx::StaticLoopChunker::For(loc, fn, arg, num_iters + 1, num_threads, \ + ompx::StaticLoopChunker::For(loc, fn, arg, num_iters, num_threads, \ thread_chunk); \ }