clang-p2996/clang/test/OpenMP/for_private_reduction_codegen.cpp
CHANDRA GHALE afbcf9529a [OpenMP 6.0] Codegen for Reduction over private variables with reduction clause (#134709)
Codegen support for reduction over private variables with the reduction
clause (Section 7.6.10 of the OpenMP 6.0 spec).
- An internal shared copy is initialized with an initializer value.
- The shared copy is updated by combining its value with the values from
the private copies created by the clause.
- Once an encountering thread verifies that all updates are complete,
its original list item is updated by merging its value with that of the
shared copy; the result is then broadcast to all threads.

Sample test case from the OpenMP 6.0 Examples:
```
#include <assert.h>
#include <omp.h>
#define N 10

void do_red(int n, int *v, int &sum_v)
{
    sum_v = 0; // sum_v is private
    #pragma omp for reduction(original(private),+: sum_v)
    for (int i = 0; i < n; i++) 
    {
        sum_v += v[i];
    }
}

int main(void)
{
    int v[N];
    for (int i = 0; i < N; i++)
        v[i] = i;
    #pragma omp parallel num_threads(4)
    {
        int s_v; // s_v is private
        do_red(N, v, s_v);
        assert(s_v == 45);
    }
    return 0;
}
```
Expected Codegen:
```
 // A shared global/static variable is introduced for the reduction result,
 // e.g., .omp.reduction.internal_private_var

 // Initialization block (executed by thread 0 only), using memset or a
 // UDR initializer,
 // e.g., call void @llvm.memset.p0.i64(...) or call @udr_initializer(...)

 // Barrier before any thread performs combination
  call void @__kmpc_barrier(...)

  call void @__kmpc_critical(...)
    // Inside the critical section:
    // Load the current value from the shared variable
    // Load the thread-local private variable's value
    // Perform the reduction operation
    // Store the result back to the shared variable
  call void @__kmpc_end_critical(...)

  // Barrier after all threads complete their combinations
  call void @__kmpc_barrier(...)

  // Broadcast phase:
  // Load the final result from the shared variable
  // Store the final result into the original private variable in each thread

  // Final barrier after broadcast
  call void @__kmpc_barrier(...)
```
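
For reference, here is a minimal source-level sketch of the same protocol,
written with plain OpenMP directives instead of the `__kmpc_*` runtime calls
the compiler actually emits. `internal_result` and `combine_private` are
hypothetical names standing in for the compiler-generated shared copy and
the inlined reduction epilogue:
```
#include <omp.h>

// Hypothetical stand-in for the compiler-generated shared copy
// (.omp.reduction.internal_private_var in the IR sketch above).
static int internal_result;

// Source-level equivalent of the emitted sequence for a '+' reduction.
// Every thread of the team calls this with 'partial' holding its private
// accumulation and 'orig' referring to its original (private) list item.
void combine_private(int &orig, int partial)
{
    if (omp_get_thread_num() == 0)
        internal_result = 0;   // initializer value ('+' identity)
    #pragma omp barrier        // initialization visible to all threads

    #pragma omp critical       // combination phase: fold each private copy in
    internal_result += partial;
    #pragma omp barrier        // all updates complete

    // Merge/broadcast phase: each thread folds the shared result into its
    // original list item (the generated code also guards this merge with
    // the critical section).
    #pragma omp critical
    orig += internal_result;
    #pragma omp barrier        // final barrier after broadcast
}
```
For the do_red() sample above, the four threads' partial sums combine to
45 in the shared copy, so every private s_v ends up as 0 + 45 = 45.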

---------

Co-authored-by: Chandra Ghale <ghale@pe31.hpc.amslabs.hpecorp.net>
2025-06-11 14:01:31 +05:30


// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs --replace-value-regex "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --global-value-regex ".omp.reduction..internal[a-zA-Z_0-9.]+"
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s
// expected-no-diagnostics
#define N 10
class Sum {
  int val;

public:
  Sum(int v = 0) : val(v) {}
  Sum operator+(const Sum &rhs) const { return Sum(val + rhs.val); }
  Sum &operator+=(const Sum &rhs) {
    val += rhs.val;
    return *this;
  }
};
#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in) \
    initializer(omp_priv = Sum(0))
void func_red() {
  Sum result(0);
  Sum array[N];
  for (int i = 0; i < N; i++) {
    array[i] = Sum(i);
  }
#pragma omp parallel private(result) num_threads(4)
  {
#pragma omp for reduction(sum_reduction : result)
    for (int i = 0; i < N; i++) {
      result = result + array[i];
    }
  }
}
void do_red(int n, int *v, int &sum_v) {
  sum_v = 0;
#pragma omp for reduction(original(private), + : sum_v)
  for (int i = 0; i < n; i++) {
    sum_v += v[i];
  }
}
void do_red_extended(int n, int *v, int &sum_v, int &prod_v) {
  sum_v = 0;
  prod_v = 1;
#pragma omp for reduction(original(private), + : sum_v) \
    reduction(original(private), * : prod_v)
  for (int i = 0; i < n; i++) {
    sum_v += v[i];
    prod_v *= v[i];
  }
}
int main(void) {
  int v[N];
  for (int i = 0; i < N; i++)
    v[i] = i;
#pragma omp parallel num_threads(4)
  {
    int s_v;
    do_red(N, v, s_v);
  }
  int sum_v_ext = 0, prod_v_ext = 1;
#pragma omp parallel num_threads(4)
  {
    do_red_extended(N, v, sum_v_ext, prod_v_ext);
  }
  return 0;
}
//.
// CHECK: @.omp.reduction..internal_pivate_.result.result_996 = common global %class.Sum zeroinitializer, align 4
// CHECK: @.omp.reduction..internal_pivate_.sum_v.sum_v_1188 = common global i32 0, align 4
// CHECK: @.omp.reduction..internal_pivate_.sum_v.sum_v_1392 = common global i32 0, align 4
// CHECK: @.omp.reduction..internal_pivate_.prod_v.prod_v_1461 = common global i32 0, align 4
//.
// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv
// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
// CHECK-NEXT: [[ARRAY:%.*]] = alloca [10 x %class.Sum], align 16
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0)
// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAY_BEGIN]], i64 10
// CHECK-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
// CHECK: arrayctor.loop:
// CHECK-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]], i32 noundef 0)
// CHECK-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAYCTOR_CUR]], i64 1
// CHECK-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
// CHECK-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
// CHECK: arrayctor.cont:
// CHECK-NEXT: store i32 0, ptr [[I]], align 4
// CHECK-NEXT: br label [[FOR_COND:%.*]]
// CHECK: for.cond:
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4
// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
// CHECK: for.body:
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[REF_TMP]], i32 noundef [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4
// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]]
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX]], ptr align 4 [[REF_TMP]], i64 4, i1 false)
// CHECK-NEXT: br label [[FOR_INC:%.*]]
// CHECK: for.inc:
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4
// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
// CHECK: for.end:
// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z8func_redv.omp_outlined, ptr [[ARRAY]])
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@_ZN3SumC1Ei
// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4
// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4
// CHECK-NEXT: call void @_ZN3SumC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv.omp_outlined
// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[ARRAY:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[ARRAY_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[RESULT1:%.*]] = alloca [[CLASS_SUM]], align 4
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4
// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8
// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
// CHECK-NEXT: store ptr [[ARRAY]], ptr [[ARRAY_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAY_ADDR]], align 8
// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0)
// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
// CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
// CHECK-NEXT: call void @.omp_initializer.(ptr noundef [[RESULT1]], ptr noundef [[RESULT]])
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK: cond.true:
// CHECK-NEXT: br label [[COND_END:%.*]]
// CHECK: cond.false:
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: br label [[COND_END]]
// CHECK: cond.end:
// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
// CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK: omp.inner.for.cond:
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
// CHECK-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
// CHECK: omp.inner.for.body:
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
// CHECK-NEXT: store i32 [[ADD]], ptr [[I]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4
// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYIDX]])
// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0
// CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT1]], ptr align 4 [[REF_TMP]], i64 4, i1 false)
// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK: omp.body.continue:
// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK: omp.inner.for.inc:
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
// CHECK-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
// CHECK: omp.inner.for.end:
// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
// CHECK: omp.loop.exit:
// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
// CHECK-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: switch i32 [[TMP11]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
// CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
// CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
// CHECK-NEXT: ]
// CHECK: .omp.reduction.case1:
// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
// CHECK: .omp.reduction.case2:
// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
// CHECK: .omp.reduction.default:
// CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP2]], 0
// CHECK-NEXT: br i1 [[TMP12]], label [[INIT:%.*]], label [[INIT_END:%.*]]
// CHECK: init:
// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_pivate_.result.result_996, i32 noundef 0)
// CHECK-NEXT: br label [[INIT_END]]
// CHECK: init.end:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_pivate_.result.result_996, ptr noundef [[RESULT1]])
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
// CHECK-NEXT: [[TMP13:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_pivate_.result.result_996, align 4
// CHECK-NEXT: store [[CLASS_SUM]] [[TMP13]], ptr [[RESULT1]], align 4
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]])
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4:[0-9]+]], i32 [[TMP2]])
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@.omp_combiner.
// CHECK-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN3SumpLERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP2]])
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@_ZN3SumpLERKS_
// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
// CHECK-NEXT: store ptr [[RHS]], ptr [[RHS_ADDR]], align 8
// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8
// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[TMP0]], i32 0, i32 0
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VAL]], align 4
// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4
// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
// CHECK-NEXT: store i32 [[ADD]], ptr [[VAL2]], align 4
// CHECK-NEXT: ret ptr [[THIS1]]
//
//
// CHECK-LABEL: define {{[^@]+}}@.omp_initializer.
// CHECK-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], i32 noundef 0)
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@_ZNK3SumplERKS_
// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
// CHECK-NEXT: store ptr [[RHS]], ptr [[RHS_ADDR]], align 8
// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VAL]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8
// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[TMP1]], i32 0, i32 0
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4
// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP2]]
// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RETVAL]], i32 noundef [[ADD]])
// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[RETVAL]], i32 0, i32 0
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4
// CHECK-NEXT: ret i32 [[TMP3]]
//
//
// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv.omp_outlined.omp.reduction.reduction_func
// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@_ZN3SumC2Ei
// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4
// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[THIS1]], i32 0, i32 0
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4
// CHECK-NEXT: store i32 [[TMP0]], ptr [[VAL]], align 4
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@_Z6do_rediPiRi
// CHECK-SAME: (i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[SUM_V4:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
// CHECK-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
// CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
// CHECK-NEXT: store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK-NEXT: store i32 0, ptr [[I]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
// CHECK: omp.precond.then:
// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8
// CHECK-NEXT: store i32 0, ptr [[SUM_V4]], align 4
// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[_TMP5]], align 8
// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
// CHECK-NEXT: br i1 [[CMP7]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK: cond.true:
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK-NEXT: br label [[COND_END:%.*]]
// CHECK: cond.false:
// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: br label [[COND_END]]
// CHECK: cond.end:
// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
// CHECK-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK: omp.inner.for.cond:
// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
// CHECK-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
// CHECK: omp.inner.for.body:
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
// CHECK-NEXT: store i32 [[ADD]], ptr [[I6]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[V_ADDR]], align 8
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I6]], align 4
// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[IDXPROM]]
// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP5]], align 8
// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
// CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP18]]
// CHECK-NEXT: store i32 [[ADD9]], ptr [[TMP19]], align 4
// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK: omp.body.continue:
// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK: omp.inner.for.inc:
// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP21]], 1
// CHECK-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
// CHECK: omp.inner.for.end:
// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
// CHECK: omp.loop.exit:
// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
// CHECK-NEXT: [[TMP22:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: switch i32 [[TMP22]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
// CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
// CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
// CHECK-NEXT: ]
// CHECK: .omp.reduction.case1:
// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
// CHECK: .omp.reduction.case2:
// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
// CHECK: .omp.reduction.default:
// CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP0]], 0
// CHECK-NEXT: br i1 [[TMP23]], label [[INIT:%.*]], label [[INIT_END:%.*]]
// CHECK: init:
// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4
// CHECK-NEXT: br label [[INIT_END]]
// CHECK: init.end:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4
// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4
// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
// CHECK-NEXT: store i32 [[ADD11]], ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4
// CHECK-NEXT: store i32 [[TMP26]], ptr [[SUM_V4]], align 4
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[SUM_V4]], align 4
// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
// CHECK-NEXT: store i32 [[ADD12]], ptr [[TMP7]], align 4
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: br label [[OMP_PRECOND_END]]
// CHECK: omp.precond.end:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]])
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@_Z6do_rediPiRi.omp.reduction.reduction_func
// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@_Z15do_red_extendediPiRiS0_
// CHECK-SAME: (i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[PROD_V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[PROD_V_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[SUM_V5:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[_TMP6:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[PROD_V7:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[_TMP8:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[I9:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8
// CHECK-NEXT: store ptr [[PROD_V]], ptr [[PROD_V_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8
// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
// CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8
// CHECK-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8
// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[N_ADDR]], align 4
// CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
// CHECK-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1
// CHECK-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_3]], align 4
// CHECK-NEXT: store i32 0, ptr [[I]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
// CHECK: omp.precond.then:
// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP]], align 8
// CHECK-NEXT: store i32 0, ptr [[SUM_V5]], align 4
// CHECK-NEXT: store ptr [[SUM_V5]], ptr [[_TMP6]], align 8
// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[_TMP1]], align 8
// CHECK-NEXT: store i32 1, ptr [[PROD_V7]], align 4
// CHECK-NEXT: store ptr [[PROD_V7]], ptr [[_TMP8]], align 8
// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
// CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
// CHECK-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK: cond.true:
// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
// CHECK-NEXT: br label [[COND_END:%.*]]
// CHECK: cond.false:
// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: br label [[COND_END]]
// CHECK: cond.end:
// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ]
// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
// CHECK-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK: omp.inner.for.cond:
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
// CHECK-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
// CHECK: omp.inner.for.body:
// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
// CHECK-NEXT: store i32 [[ADD]], ptr [[I9]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[V_ADDR]], align 8
// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[I9]], align 4
// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP20]] to i64
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[IDXPROM]]
// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
// CHECK-NEXT: [[TMP22:%.*]] = load ptr, ptr [[_TMP6]], align 8
// CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP23]], [[TMP21]]
// CHECK-NEXT: store i32 [[ADD12]], ptr [[TMP22]], align 4
// CHECK-NEXT: [[TMP24:%.*]] = load ptr, ptr [[V_ADDR]], align 8
// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[I9]], align 4
// CHECK-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP25]] to i64
// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[IDXPROM13]]
// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4
// CHECK-NEXT: [[TMP27:%.*]] = load ptr, ptr [[_TMP8]], align 8
// CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
// CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 [[TMP28]], [[TMP26]]
// CHECK-NEXT: store i32 [[MUL15]], ptr [[TMP27]], align 4
// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK: omp.body.continue:
// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK: omp.inner.for.inc:
// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP29]], 1
// CHECK-NEXT: store i32 [[ADD16]], ptr [[DOTOMP_IV]], align 4
// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
// CHECK: omp.inner.for.end:
// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
// CHECK: omp.loop.exit:
// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
// CHECK-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: switch i32 [[TMP30]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
// CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
// CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
// CHECK-NEXT: ]
// CHECK: .omp.reduction.case1:
// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
// CHECK: .omp.reduction.case2:
// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
// CHECK: .omp.reduction.default:
// CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP0]], 0
// CHECK-NEXT: br i1 [[TMP31]], label [[INIT:%.*]], label [[INIT_END:%.*]]
// CHECK: init:
// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
// CHECK-NEXT: br label [[INIT_END]]
// CHECK: init.end:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
// CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[SUM_V5]], align 4
// CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
// CHECK-NEXT: store i32 [[ADD17]], ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
// CHECK-NEXT: store i32 [[TMP34]], ptr [[SUM_V5]], align 4
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP9]], align 4
// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[SUM_V5]], align 4
// CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
// CHECK-NEXT: store i32 [[ADD18]], ptr [[TMP9]], align 4
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP0]], 0
// CHECK-NEXT: br i1 [[TMP37]], label [[INIT19:%.*]], label [[INIT_END20:%.*]]
// CHECK: init19:
// CHECK-NEXT: store i32 1, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
// CHECK-NEXT: br label [[INIT_END20]]
// CHECK: init.end20:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[PROD_V7]], align 4
// CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[TMP38]], [[TMP39]]
// CHECK-NEXT: store i32 [[MUL21]], ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
// CHECK-NEXT: store i32 [[TMP40]], ptr [[PROD_V7]], align 4
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP10]], align 4
// CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[PROD_V7]], align 4
// CHECK-NEXT: [[MUL22:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
// CHECK-NEXT: store i32 [[MUL22]], ptr [[TMP10]], align 4
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: br label [[OMP_PRECOND_END]]
// CHECK: omp.precond.end:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]])
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func
// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@main
// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[SUM_V_EXT:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[PROD_V_EXT:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4
// CHECK-NEXT: store i32 0, ptr [[I]], align 4
// CHECK-NEXT: br label [[FOR_COND:%.*]]
// CHECK: for.cond:
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4
// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
// CHECK: for.body:
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4
// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[V]], i64 0, i64 [[IDXPROM]]
// CHECK-NEXT: store i32 [[TMP2]], ptr [[ARRAYIDX]], align 4
// CHECK-NEXT: br label [[FOR_INC:%.*]]
// CHECK: for.inc:
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4
// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
// CHECK: for.end:
// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined, ptr [[V]])
// CHECK-NEXT: store i32 0, ptr [[SUM_V_EXT]], align 4
// CHECK-NEXT: store i32 1, ptr [[PROD_V_EXT]], align 4
// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @main.omp_outlined.1, ptr [[V]], ptr [[SUM_V_EXT]], ptr [[PROD_V_EXT]])
// CHECK-NEXT: ret i32 0