The device runtime contains several calls to __kmpc_get_hardware_num_threads_in_block and __kmpc_get_hardware_num_blocks. If the thread_limit and the num_teams are constant, these calls can be folded to the constant value. In commit D106033 we have the optimization phase. This commit adds the attributes to the outlined function for the grid size. the two attributes are `omp_target_num_teams` and `omp_target_thread_limit`. These values are added as long as they are constant. Two functions are created `getNumThreadsExprForTargetDirective` and `getNumTeamsExprForTargetDirective`. The original functions `emitNumTeamsForTargetDirective` and `emitNumThreadsForTargetDirective` identify the expresion and emit the code. However, for the Device version of the outlined function, we cannot emit anything. Therefore, this is a first attempt to separate emision of code from deduction of the values. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D106298
710 lines
49 KiB
C++
710 lines
49 KiB
C++
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
|
|
// Test target codegen - host bc file has to be created first.
|
|
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
|
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
|
|
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
|
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK2
|
|
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK3
|
|
|
|
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
|
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK4
|
|
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
|
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK5
|
|
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK6
|
|
|
|
// expected-no-diagnostics
|
|
#ifndef HEADER
|
|
#define HEADER
|
|
|
|
template<typename tx>
|
|
tx ftemplate(int n) {
|
|
tx a = 0;
|
|
short aa = 0;
|
|
tx b[10];
|
|
|
|
#pragma omp target parallel map(tofrom: aa) num_threads(1024)
|
|
{
|
|
aa += 1;
|
|
}
|
|
|
|
#pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40) num_threads(n)
|
|
{
|
|
a += 1;
|
|
aa += 1;
|
|
b[2] += 1;
|
|
}
|
|
|
|
return a;
|
|
}
|
|
|
|
int bar(int n){
|
|
int a = 0;
|
|
|
|
a += ftemplate<int>(n);
|
|
|
|
return a;
|
|
}
|
|
|
|
#endif
|
|
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l25
|
|
// CHECK1-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8
|
|
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
|
|
// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true)
|
|
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
|
|
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK1: user_code.entry:
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1)
|
|
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
|
|
// CHECK1-NEXT: ret void
|
|
// CHECK1: worker.exit:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8
|
|
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2
|
|
// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
|
|
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1
|
|
// CHECK1-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
|
|
// CHECK1-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
|
|
// CHECK1-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i64 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8
|
|
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
|
|
// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
|
|
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
|
|
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
|
|
// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8
|
|
// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
|
|
// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
|
|
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32*
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true)
|
|
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1
|
|
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK1: user_code.entry:
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP1]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i64 3)
|
|
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
|
|
// CHECK1-NEXT: ret void
|
|
// CHECK1: worker.exit:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
|
|
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8
|
|
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
|
|
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
|
|
// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8
|
|
// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
|
|
// CHECK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32
|
|
// CHECK1-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
|
|
// CHECK1-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
|
|
// CHECK1-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2
|
|
// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i64 0, i64 2
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
|
|
// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1
|
|
// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l25
|
|
// CHECK2-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true)
|
|
// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
|
|
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK2: user_code.entry:
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
|
|
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
|
|
// CHECK2-NEXT: ret void
|
|
// CHECK2: worker.exit:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2
|
|
// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
|
|
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1
|
|
// CHECK2-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
|
|
// CHECK2-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
|
|
// CHECK2-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
|
|
// CHECK2-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
|
|
// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true)
|
|
// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1
|
|
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK2: user_code.entry:
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP1]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3)
|
|
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
|
|
// CHECK2-NEXT: ret void
|
|
// CHECK2: worker.exit:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
|
|
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
|
|
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
|
|
// CHECK2-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32
|
|
// CHECK2-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
|
|
// CHECK2-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
|
|
// CHECK2-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2
|
|
// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
|
|
// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1
|
|
// CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l25
|
|
// CHECK3-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true)
|
|
// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
|
|
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK3: user_code.entry:
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
|
|
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
|
|
// CHECK3-NEXT: ret void
|
|
// CHECK3: worker.exit:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2
|
|
// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
|
|
// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1
|
|
// CHECK3-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
|
|
// CHECK3-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
|
|
// CHECK3-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
|
|
// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
|
|
// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true)
|
|
// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1
|
|
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK3: user_code.entry:
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP1]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3)
|
|
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
|
|
// CHECK3-NEXT: ret void
|
|
// CHECK3: worker.exit:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
|
|
// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
|
|
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
|
|
// CHECK3-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32
|
|
// CHECK3-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
|
|
// CHECK3-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
|
|
// CHECK3-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2
|
|
// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
|
|
// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1
|
|
// CHECK3-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l25
|
|
// CHECK4-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8
|
|
// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
|
|
// CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
|
|
// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true)
|
|
// CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
|
|
// CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK4: user_code.entry:
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1)
|
|
// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4: worker.exit:
|
|
// CHECK4-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8
|
|
// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
|
|
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
|
|
// CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
|
|
// CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2
|
|
// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
|
|
// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1
|
|
// CHECK4-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
|
|
// CHECK4-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2
|
|
// CHECK4-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
|
|
// CHECK4-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i64 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8
|
|
// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
|
|
// CHECK4-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
|
|
// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
|
|
// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
|
|
// CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8
|
|
// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
|
|
// CHECK4-NEXT: store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
|
|
// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
|
|
// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32*
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true)
|
|
// CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1
|
|
// CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK4: user_code.entry:
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP1]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i64 3)
|
|
// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4: worker.exit:
|
|
// CHECK4-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1
|
|
// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8
|
|
// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
|
|
// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
|
|
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
|
|
// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
|
|
// CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8
|
|
// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
|
|
// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
|
|
// CHECK4-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32
|
|
// CHECK4-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
|
|
// CHECK4-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
|
|
// CHECK4-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2
|
|
// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i64 0, i64 2
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
|
|
// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1
|
|
// CHECK4-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
|
|
// CHECK4-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l25
|
|
// CHECK5-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true)
|
|
// CHECK5-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
|
|
// CHECK5-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK5: user_code.entry:
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
|
|
// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5: worker.exit:
|
|
// CHECK5-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2
|
|
// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
|
|
// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1
|
|
// CHECK5-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
|
|
// CHECK5-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2
|
|
// CHECK5-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
|
|
// CHECK5-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
|
|
// CHECK5-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
|
|
// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true)
|
|
// CHECK5-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1
|
|
// CHECK5-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK5: user_code.entry:
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP1]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3)
|
|
// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5: worker.exit:
|
|
// CHECK5-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1
|
|
// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
|
|
// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
|
|
// CHECK5-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32
|
|
// CHECK5-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
|
|
// CHECK5-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
|
|
// CHECK5-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2
|
|
// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
|
|
// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1
|
|
// CHECK5-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
|
|
// CHECK5-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l25
|
|
// CHECK6-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true)
|
|
// CHECK6-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
|
|
// CHECK6-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK6: user_code.entry:
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
|
|
// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6: worker.exit:
|
|
// CHECK6-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2
|
|
// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
|
|
// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1
|
|
// CHECK6-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
|
|
// CHECK6-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2
|
|
// CHECK6-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
|
|
// CHECK6-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
|
|
// CHECK6-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
|
|
// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true)
|
|
// CHECK6-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1
|
|
// CHECK6-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK6: user_code.entry:
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP1]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3)
|
|
// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6: worker.exit:
|
|
// CHECK6-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1
|
|
// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
|
|
// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
|
|
// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
|
|
// CHECK6-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32
|
|
// CHECK6-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
|
|
// CHECK6-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
|
|
// CHECK6-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2
|
|
// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
|
|
// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1
|
|
// CHECK6-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
|
|
// CHECK6-NEXT: ret void
|
|
//
|