Files
clang-p2996/llvm/test/Transforms/OpenMP/deduplication_soundness.ll
Matt 88e31f64a0 [OpenMP][FIX] Remove unsound omp_get_thread_limit deduplication (#79524)
The deduplication of the calls to `omp_get_thread_limit` used to be
legal when originally added in
<e28936f613 (diff-de101c82aff66b2bda2d1f53fde3dde7b0d370f14f1ff37b7919ce38531230dfR123)>,
as the result (thread_limit) was immutable.

However, now that we have `thread_limit` clause, we no longer have
immutability; therefore `omp_get_thread_limit()` is not a deduplicable
runtime call.

Thus, removing `omp_get_thread_limit` from the
`DeduplicableRuntimeCallIDs` array.

Here's a simple example:
```
#include <omp.h>
#include <stdio.h>

int main()
{
#pragma omp target thread_limit(4)
{
printf("\n1:target thread_limit: %d\n", omp_get_thread_limit());
}

#pragma omp target thread_limit(3)
{
printf("\n2:target thread_limit: %d\n", omp_get_thread_limit());
}
return 0;
}
```

GCC-compiled binary execution: https://gcc.godbolt.org/z/Pjv3TWoTq
```
1:target thread_limit: 4
2:target thread_limit: 3
```

Clang/LLVM-compiled binary execution:
https://clang.godbolt.org/z/zdPbrdMPn
```
1:target thread_limit: 4
2:target thread_limit: 4
```

By my reading of the OpenMP spec GCC does the right thing here; cf.
<https://www.openmp.org/spec-html/5.2/openmpse12.html#x34-330002.4>:
> If a target construct with a thread_limit clause is encountered, the
thread-limit-var ICV from the data environment of the generated initial
task is instead set to an implementation defined value between one and
the value specified in the clause.

The common subexpression elimination (CSE) of the second call to
`omp_get_thread_limit` by LLVM does not seem to be correct, as it's not
an available expression at any program point(s) (in the scope of the
clause in question) after the second target construct with a
`thread_limit` clause is encountered.

Compiling with `-Rpass=openmp-opt -Rpass-analysis=openmp-opt
-Rpass-missed=openmp-opt` we have:
https://clang.godbolt.org/z/G7dfhP7jh
```
<source>:8:42: remark: OpenMP runtime call omp_get_thread_limit deduplicated. [OMP170] [-Rpass=openmp-opt]
8 | printf("\n1:target thread_limit: %d\n",omp_get_thread_limit());
| ^
```

OMP170 has the following explanation:
https://openmp.llvm.org/remarks/OMP170.html

> This optimization remark indicates that a call to an OpenMP runtime
call was replaced with the result of an existing one. This occurs when
the compiler knows that the result of a runtime call is immutable.
Removing duplicate calls is done by replacing all calls to that function
with the result of the first call. This cannot be done automatically by
the compiler because the implementations of the OpenMP runtime calls
live in a separate library the compiler cannot see.
This optimization will trigger for known OpenMP runtime calls whose
return value will not change.

At the same time I do not believe we have an analysis checking whether
this precondition holds here: "This occurs when the compiler knows that
the result of a runtime call is immutable."

AFAICT, such analysis doesn't appear to exist in the original patch
introducing deduplication, either:

-
9548b74a83
- https://reviews.llvm.org/D69930

The fix is to remove it from `DeduplicableRuntimeCallIDs`, effectively
reverting the addition in this commit (noting that `omp_get_max_threads`
is not present in `DeduplicableRuntimeCallIDs`, so it's possible this
addition was incorrect in the first place):

- [OpenMP][Opt] Annotate known runtime functions and deduplicate more,
-
e28936f613 (diff-de101c82aff66b2bda2d1f53fde3dde7b0d370f14f1ff37b7919ce38531230dfR123)

As a result, we're no longer unsoundly deduplicating the OpenMP runtime
call `omp_get_thread_limit` as illustrated by the test case: Note the
(correctly) repeated `call i32 @omp_get_thread_limit()`.

---------

Co-authored-by: Joseph Huber <huberjn@outlook.com>
2024-02-22 08:13:41 -06:00

60 lines
2.7 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function main --scrub-attributes --filter "@omp_get_thread_limit|@use" --version 4
; RUN: opt -passes=openmp-opt-cgscc -S < %s | FileCheck %s
declare void @use(i32 noundef)
declare i32 @omp_get_thread_limit()
declare void @__kmpc_set_thread_limit(ptr, i32, i32)
declare i32 @__kmpc_global_thread_num(ptr)
declare noalias ptr @__kmpc_omp_task_alloc(ptr, i32, i32, i64, i64, ptr)
declare void @__kmpc_omp_task_complete_if0(ptr, i32, ptr)
declare void @__kmpc_omp_task_begin_if0(ptr, i32, ptr)
%struct.ident_t = type { i32, i32, i32, i32, ptr }
@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
define i32 @main() local_unnamed_addr {
; CHECK-LABEL: define i32 @main() local_unnamed_addr {
; CHECK: [[CALL_I_I_I:%.*]] = call i32 @omp_get_thread_limit()
; CHECK: call void @use(i32 noundef [[CALL_I_I_I]])
; CHECK: [[CALL_I_I_I2:%.*]] = call i32 @omp_get_thread_limit()
; CHECK: call void @use(i32 noundef [[CALL_I_I_I2]])
;
entry:
%0 = call i32 @__kmpc_global_thread_num(ptr nonnull @1)
%1 = call ptr @__kmpc_omp_task_alloc(ptr nonnull @1, i32 %0, i32 1, i64 40, i64 0, ptr nonnull @.omp_task_entry.)
call void @__kmpc_omp_task_begin_if0(ptr nonnull @1, i32 %0, ptr %1)
call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 4)
%call.i.i.i = call i32 @omp_get_thread_limit()
call void @use(i32 noundef %call.i.i.i)
call void @__kmpc_omp_task_complete_if0(ptr nonnull @1, i32 %0, ptr %1)
%2 = call ptr @__kmpc_omp_task_alloc(ptr nonnull @1, i32 %0, i32 1, i64 40, i64 0, ptr nonnull @.omp_task_entry..2)
call void @__kmpc_omp_task_begin_if0(ptr nonnull @1, i32 %0, ptr %2)
call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 3)
%call.i.i.i2 = call i32 @omp_get_thread_limit()
call void @use(i32 noundef %call.i.i.i2)
call void @__kmpc_omp_task_complete_if0(ptr nonnull @1, i32 %0, ptr %2)
ret i32 0
}
define internal noundef i32 @.omp_task_entry.(i32 noundef %0, ptr noalias nocapture noundef readonly %1) {
entry:
tail call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 4)
%call.i.i = tail call i32 @omp_get_thread_limit()
tail call void @use(i32 noundef %call.i.i)
ret i32 0
}
define internal noundef i32 @.omp_task_entry..2(i32 noundef %0, ptr noalias nocapture noundef readonly %1) {
entry:
tail call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 3)
%call.i.i = tail call i32 @omp_get_thread_limit()
tail call void @use(i32 noundef %call.i.i)
ret i32 0
}
!llvm.module.flags = !{!0}
!0 = !{i32 7, !"openmp", i32 51}