This patch changes the default alignment from 8 to 16, and encodes this information in the `__kmpc_alloc_shared` runtime call to communicate it to the HeapToStack pass. The previous alignment of 8 was not sufficient for the maximum size of primitive types on 64-bit systems, and needs to be increased. This reduces the amount of space available in the data sharing stack, so this implementation will need to be improved later to include the alignment requirements in the allocation call, and use it properly in the data sharing stack in the runtime. Depends on D115888 Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D115971
8165 lines
565 KiB
C++
8165 lines
565 KiB
C++
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
|
|
// Test target codegen - host bc file has to be created first.
|
|
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
|
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1
|
|
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
|
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2
|
|
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3
|
|
// expected-no-diagnostics
|
|
#ifndef HEADER
|
|
#define HEADER
|
|
|
|
template<typename tx> // tx is the element type under test; bar() instantiates with char.
|
|
tx ftemplate(int n) { // Exercises teams/parallel reductions on a mix of scalar types; NOTE: comments here are trailing-only so source line numbers stay stable for the autogenerated CHECK lines (kernel names encode them, e.g. ..._l23).
|
|
int a;
|
|
short b;
|
|
tx c;
|
|
float d;
|
|
double e;
|
|
|
|
#pragma omp target // Region 1: single teams reduction over a double.
|
|
#pragma omp teams reduction(+: e)
|
|
{
|
|
e += 5;
|
|
}
|
|
|
|
#pragma omp target // Region 2: two reduction clauses (xor on tx, multiply on float) in one teams construct.
|
|
#pragma omp teams reduction(^: c) reduction(*: d)
|
|
{
|
|
c ^= 2;
|
|
d *= 33;
|
|
}
|
|
|
|
#pragma omp target // Region 3: nested teams + parallel reductions sharing the same variables.
|
|
#pragma omp teams reduction(|: a) reduction(max: b)
|
|
#pragma omp parallel reduction(|: a) reduction(max: b)
|
|
{
|
|
a |= 1;
|
|
b = 99 > b ? 99 : b; // Hand-written max so the max reduction has a visible contribution.
|
|
}
|
|
|
|
return a+b+c+d+e; // Sum of all reduced values; implicit conversion to tx on return.
|
|
}
|
|
|
|
int bar(int n){ // Host-side driver: instantiates the template with char and returns the accumulated result.
|
|
int a = 0;
|
|
|
|
a += ftemplate<char>(n); // char instantiation makes tx a 1-byte type in the reduction codegen.
|
|
|
|
return a;
|
|
}
|
|
|
|
#endif
|
|
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
|
|
// CHECK4-SAME: () #[[ATTR0:[0-9]+]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
|
|
// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4
|
|
// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
|
|
// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]]
|
|
// CHECK4: .await.work:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
|
|
// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
|
|
// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
|
|
// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
|
|
// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
|
|
// CHECK4: .select.workers:
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
|
|
// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
|
|
// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
|
|
// CHECK4: .execute.parallel:
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
|
|
// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
|
|
// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
|
|
// CHECK4: .terminate.parallel:
|
|
// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel()
|
|
// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]]
|
|
// CHECK4: .barrier.parallel:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK4-NEXT: br label [[DOTAWAIT_WORK]]
|
|
// CHECK4: .exit:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
|
|
// CHECK4-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
|
|
// CHECK4-NEXT: [[E7:%.*]] = alloca double, align 8
|
|
// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK4-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
|
|
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
|
|
// CHECK4-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
|
|
// CHECK4-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
|
|
// CHECK4: .worker:
|
|
// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
|
|
// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
|
|
// CHECK4: .mastercheck:
|
|
// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
|
|
// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
|
|
// CHECK4-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
|
|
// CHECK4: .master:
|
|
// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
|
|
// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
|
|
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
|
|
// CHECK4-NEXT: store double [[TMP7]], double* [[E7]], align 8
|
|
// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
|
|
// CHECK4: .termination.notifier:
|
|
// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1)
|
|
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK4-NEXT: br label [[DOTEXIT]]
|
|
// CHECK4: .exit:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK4-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK4-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
|
|
// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
|
|
// CHECK4-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
|
|
// CHECK4-NEXT: store double 0.000000e+00, double* [[E1]], align 8
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = load double, double* [[E1]], align 8
|
|
// CHECK4-NEXT: [[ADD:%.*]] = fadd double [[TMP6]], 5.000000e+00
|
|
// CHECK4-NEXT: store double [[ADD]], double* [[E1]], align 8
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast double* [[E1]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
|
|
// CHECK4-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK4: .omp.reduction.then:
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = load double, double* [[TMP0]], align 8
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = load double, double* [[E1]], align 8
|
|
// CHECK4-NEXT: [[ADD2:%.*]] = fadd double [[TMP15]], [[TMP16]]
|
|
// CHECK4-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
|
|
// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]])
|
|
// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK4: .omp.reduction.done:
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
|
|
// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]])
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
|
|
// CHECK4-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
|
|
// CHECK4-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8
|
|
// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
|
|
// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
|
|
// CHECK4-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4
|
|
// CHECK4-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK4-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK4-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
|
|
// CHECK4-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK4-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK4-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
|
|
// CHECK4-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
|
|
// CHECK4-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK4-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
|
|
// CHECK4-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
|
|
// CHECK4-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
|
|
// CHECK4-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK4: then:
|
|
// CHECK4-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
|
|
// CHECK4-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK4: else:
|
|
// CHECK4-NEXT: br label [[IFCONT]]
|
|
// CHECK4: ifcont:
|
|
// CHECK4-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK4-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
|
|
// CHECK4-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK4: then4:
|
|
// CHECK4-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
|
|
// CHECK4-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
|
|
// CHECK4-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
|
|
// CHECK4-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
|
|
// CHECK4-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
|
|
// CHECK4-NEXT: store double [[TMP46]], double* [[TMP45]], align 8
|
|
// CHECK4-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK4: else5:
|
|
// CHECK4-NEXT: br label [[IFCONT6]]
|
|
// CHECK4: ifcont6:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
|
|
// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK4-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK4-NEXT: br label [[PRECOND:%.*]]
|
|
// CHECK4: precond:
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
|
|
// CHECK4-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
|
|
// CHECK4: body:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK4: then:
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK4-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK4-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK4: else:
|
|
// CHECK4-NEXT: br label [[IFCONT]]
|
|
// CHECK4: ifcont:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
|
|
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK4: then4:
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
|
|
// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
|
|
// CHECK4-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4
|
|
// CHECK4-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK4: else5:
|
|
// CHECK4-NEXT: br label [[IFCONT6]]
|
|
// CHECK4: ifcont6:
|
|
// CHECK4-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
|
|
// CHECK4-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK4-NEXT: br label [[PRECOND]]
|
|
// CHECK4: exit:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
|
|
// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
|
|
// CHECK4-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
|
|
// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
|
|
// CHECK4-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
|
|
// CHECK4-SAME: () #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
|
|
// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4
|
|
// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
|
|
// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]]
|
|
// CHECK4: .await.work:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
|
|
// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
|
|
// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
|
|
// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
|
|
// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
|
|
// CHECK4: .select.workers:
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
|
|
// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
|
|
// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
|
|
// CHECK4: .execute.parallel:
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
|
|
// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
|
|
// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
|
|
// CHECK4: .terminate.parallel:
|
|
// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel()
|
|
// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]]
|
|
// CHECK4: .barrier.parallel:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK4-NEXT: br label [[DOTAWAIT_WORK]]
|
|
// CHECK4: .exit:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
|
|
// CHECK4-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
|
|
// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
|
|
// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
|
|
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
|
|
// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
|
|
// CHECK4: .worker:
|
|
// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
|
|
// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
|
|
// CHECK4: .mastercheck:
|
|
// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
|
|
// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
|
|
// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
|
|
// CHECK4: .master:
|
|
// CHECK4-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
|
|
// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
|
|
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4
|
|
// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.1*
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = load i8, i8* [[CONV]], align 4
|
|
// CHECK4-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 1
|
|
// CHECK4-NEXT: store i8 [[TMP10]], i8* [[C8]], align 4
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = load float, float* [[CONV1]], align 4
|
|
// CHECK4-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 0
|
|
// CHECK4-NEXT: store float [[TMP11]], float* [[D9]], align 4
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK4-NEXT: store i32 [[TMP12]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK4-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
|
|
// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP13]])
|
|
// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
|
|
// CHECK4: .termination.notifier:
|
|
// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1)
|
|
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK4-NEXT: br label [[DOTEXIT]]
|
|
// CHECK4: .exit:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3
|
|
// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK4-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
|
|
// CHECK4-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 8
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.2*
|
|
// CHECK4-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: store i8 0, i8* [[C1]], align 4
|
|
// CHECK4-NEXT: store float 1.000000e+00, float* [[D2]], align 4
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i8, i8* [[C1]], align 4
|
|
// CHECK4-NEXT: [[CONV:%.*]] = sext i8 [[TMP5]] to i32
|
|
// CHECK4-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
|
|
// CHECK4-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
|
|
// CHECK4-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = load float, float* [[D2]], align 4
|
|
// CHECK4-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 3.300000e+01
|
|
// CHECK4-NEXT: store float [[MUL]], float* [[D2]], align 4
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: store i8* [[C1]], i8** [[TMP9]], align 4
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast float* [[D2]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP13]], i32 2048, i8* [[TMP12]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func8, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func9, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func10)
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1
|
|
// CHECK4-NEXT: br i1 [[TMP15]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK4: .omp.reduction.then:
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = load i8, i8* [[TMP0]], align 1
|
|
// CHECK4-NEXT: [[CONV4:%.*]] = sext i8 [[TMP16]] to i32
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = load i8, i8* [[C1]], align 4
|
|
// CHECK4-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32
|
|
// CHECK4-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
|
|
// CHECK4-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
|
|
// CHECK4-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
|
|
// CHECK4-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4
|
|
// CHECK4-NEXT: [[TMP19:%.*]] = load float, float* [[D2]], align 4
|
|
// CHECK4-NEXT: [[MUL8:%.*]] = fmul float [[TMP18]], [[TMP19]]
|
|
// CHECK4-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
|
|
// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]])
|
|
// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK4: .omp.reduction.done:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
|
|
// CHECK4-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
|
|
// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
|
|
// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK4-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
|
|
// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4
|
|
// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
|
|
// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1
|
|
// CHECK4-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
|
|
// CHECK4-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
|
|
// CHECK4-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
|
|
// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
|
|
// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
|
|
// CHECK4-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4
|
|
// CHECK4-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1
|
|
// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
|
|
// CHECK4-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4
|
|
// CHECK4-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK4-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
|
|
// CHECK4-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK4-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK4-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
|
|
// CHECK4-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
|
|
// CHECK4-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK4-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
|
|
// CHECK4-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
|
|
// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
|
|
// CHECK4-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK4: then:
|
|
// CHECK4-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK4: else:
|
|
// CHECK4-NEXT: br label [[IFCONT]]
|
|
// CHECK4: ifcont:
|
|
// CHECK4-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK4-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
|
|
// CHECK4-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK4: then6:
|
|
// CHECK4-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4
|
|
// CHECK4-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
|
|
// CHECK4-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
|
|
// CHECK4-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1
|
|
// CHECK4-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
|
|
// CHECK4-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
|
|
// CHECK4-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
|
|
// CHECK4-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
|
|
// CHECK4-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
|
|
// CHECK4-NEXT: store float [[TMP62]], float* [[TMP61]], align 4
|
|
// CHECK4-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK4: else7:
|
|
// CHECK4-NEXT: br label [[IFCONT8]]
|
|
// CHECK4: ifcont8:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
|
|
// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK4: then:
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
|
|
// CHECK4-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
|
|
// CHECK4-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK4: else:
|
|
// CHECK4-NEXT: br label [[IFCONT]]
|
|
// CHECK4: ifcont:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
|
|
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK4: then4:
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
|
|
// CHECK4-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1
|
|
// CHECK4-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK4: else5:
|
|
// CHECK4-NEXT: br label [[IFCONT6]]
|
|
// CHECK4: ifcont6:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
|
|
// CHECK4: then8:
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
|
|
// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
|
|
// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
|
|
// CHECK4-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
|
|
// CHECK4-NEXT: br label [[IFCONT10:%.*]]
|
|
// CHECK4: else9:
|
|
// CHECK4-NEXT: br label [[IFCONT10]]
|
|
// CHECK4: ifcont10:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
|
|
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
|
|
// CHECK4: then12:
|
|
// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4
|
|
// CHECK4-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
|
|
// CHECK4-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4
|
|
// CHECK4-NEXT: br label [[IFCONT14:%.*]]
|
|
// CHECK4: else13:
|
|
// CHECK4-NEXT: br label [[IFCONT14]]
|
|
// CHECK4: ifcont14:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
|
|
// CHECK4-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
|
|
// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
|
|
// CHECK4-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
|
|
// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
|
|
// CHECK4-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
|
|
// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
|
|
// CHECK4-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
|
|
// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
|
|
// CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
|
|
// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
|
|
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
|
|
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
|
|
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
|
|
// CHECK4: .execute:
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
|
|
// CHECK4-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK4-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
|
|
// CHECK4: .omp.deinit:
|
|
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
|
|
// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
|
|
// CHECK4: .exit:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11
|
|
// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK4-NEXT: [[A1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[B2:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK4-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
|
|
// CHECK4-NEXT: store i32 0, i32* [[A1]], align 4
|
|
// CHECK4-NEXT: store i16 -32768, i16* [[B2]], align 2
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP8]], i32 2)
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func17, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func18, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func20, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func21, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func22)
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
|
|
// CHECK4-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK4: .omp.reduction.then:
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK4-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
|
|
// CHECK4-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
|
|
// CHECK4-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
|
|
// CHECK4-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK4-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
|
|
// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
|
|
// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK4: cond.true:
|
|
// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK4-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK4: cond.false:
|
|
// CHECK4-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK4-NEXT: br label [[COND_END]]
|
|
// CHECK4: cond.end:
|
|
// CHECK4-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
|
|
// CHECK4-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
|
|
// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
|
|
// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK4: .omp.reduction.done:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__12
|
|
// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK4-NEXT: [[A1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[B2:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK4-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
|
|
// CHECK4-NEXT: store i32 0, i32* [[A1]], align 4
|
|
// CHECK4-NEXT: store i16 -32768, i16* [[B2]], align 2
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK4-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
|
|
// CHECK4-NEXT: store i32 [[OR]], i32* [[A1]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
|
|
// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
|
|
// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK4: cond.true:
|
|
// CHECK4-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK4: cond.false:
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK4-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
|
|
// CHECK4-NEXT: br label [[COND_END]]
|
|
// CHECK4: cond.end:
|
|
// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
|
|
// CHECK4-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
|
|
// CHECK4-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func14, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func15)
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
|
|
// CHECK4-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK4: .omp.reduction.then:
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK4-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
|
|
// CHECK4-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK4-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK4-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
|
|
// CHECK4-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
|
|
// CHECK4-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
|
|
// CHECK4: cond.true9:
|
|
// CHECK4-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK4-NEXT: br label [[COND_END11:%.*]]
|
|
// CHECK4: cond.false10:
|
|
// CHECK4-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK4-NEXT: br label [[COND_END11]]
|
|
// CHECK4: cond.end11:
|
|
// CHECK4-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
|
|
// CHECK4-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
|
|
// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
|
|
// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK4: .omp.reduction.done:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
|
|
// CHECK4-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
|
|
// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
|
|
// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
|
|
// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
|
|
// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
|
|
// CHECK4-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
|
|
// CHECK4-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
|
|
// CHECK4-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
|
|
// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
|
|
// CHECK4-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK4-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
|
|
// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
|
|
// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
|
|
// CHECK4-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
|
|
// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK4-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK4-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
|
|
// CHECK4-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK4-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK4-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
|
|
// CHECK4-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
|
|
// CHECK4-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK4-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
|
|
// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
|
|
// CHECK4-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
|
|
// CHECK4-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK4: then:
|
|
// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK4-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func13"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK4: else:
|
|
// CHECK4-NEXT: br label [[IFCONT]]
|
|
// CHECK4: ifcont:
|
|
// CHECK4-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK4-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
|
|
// CHECK4-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK4: then6:
|
|
// CHECK4-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
|
|
// CHECK4-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
|
|
// CHECK4-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
|
|
// CHECK4-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
|
|
// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
|
|
// CHECK4-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
|
|
// CHECK4-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
|
|
// CHECK4-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
|
|
// CHECK4-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
|
|
// CHECK4-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
|
|
// CHECK4-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK4: else7:
|
|
// CHECK4-NEXT: br label [[IFCONT8]]
|
|
// CHECK4: ifcont8:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func15
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
|
|
// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK4: then:
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
|
|
// CHECK4-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
|
|
// CHECK4-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK4: else:
|
|
// CHECK4-NEXT: br label [[IFCONT]]
|
|
// CHECK4: ifcont:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
|
|
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK4: then4:
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
|
|
// CHECK4-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK4: else5:
|
|
// CHECK4-NEXT: br label [[IFCONT6]]
|
|
// CHECK4: ifcont6:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
|
|
// CHECK4: then8:
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
|
|
// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
|
|
// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
|
|
// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
|
|
// CHECK4-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
|
|
// CHECK4-NEXT: br label [[IFCONT10:%.*]]
|
|
// CHECK4: else9:
|
|
// CHECK4-NEXT: br label [[IFCONT10]]
|
|
// CHECK4: ifcont10:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
|
|
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
|
|
// CHECK4: then12:
|
|
// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
|
|
// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
|
|
// CHECK4-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
|
|
// CHECK4-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
|
|
// CHECK4-NEXT: br label [[IFCONT14:%.*]]
|
|
// CHECK4: else13:
|
|
// CHECK4-NEXT: br label [[IFCONT14]]
|
|
// CHECK4: ifcont14:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
|
|
// CHECK4-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
|
|
// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
|
|
// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
|
|
// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
|
|
// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
|
|
// CHECK4-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
|
|
// CHECK4-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
|
|
// CHECK4-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
|
|
// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
|
|
// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
|
|
// CHECK4-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK4-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
|
|
// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
|
|
// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
|
|
// CHECK4-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
|
|
// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK4-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK4-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
|
|
// CHECK4-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK4-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK4-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
|
|
// CHECK4-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
|
|
// CHECK4-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK4-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
|
|
// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
|
|
// CHECK4-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
|
|
// CHECK4-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK4: then:
|
|
// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK4-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK4: else:
|
|
// CHECK4-NEXT: br label [[IFCONT]]
|
|
// CHECK4: ifcont:
|
|
// CHECK4-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK4-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
|
|
// CHECK4-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK4: then6:
|
|
// CHECK4-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
|
|
// CHECK4-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
|
|
// CHECK4-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
|
|
// CHECK4-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
|
|
// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
|
|
// CHECK4-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
|
|
// CHECK4-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
|
|
// CHECK4-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
|
|
// CHECK4-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
|
|
// CHECK4-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
|
|
// CHECK4-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK4: else7:
|
|
// CHECK4-NEXT: br label [[IFCONT8]]
|
|
// CHECK4: ifcont8:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
|
|
// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK4: then:
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
|
|
// CHECK4-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
|
|
// CHECK4-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK4: else:
|
|
// CHECK4-NEXT: br label [[IFCONT]]
|
|
// CHECK4: ifcont:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
|
|
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK4: then4:
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
|
|
// CHECK4-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK4: else5:
|
|
// CHECK4-NEXT: br label [[IFCONT6]]
|
|
// CHECK4: ifcont6:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
|
|
// CHECK4: then8:
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
|
|
// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
|
|
// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
|
|
// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
|
|
// CHECK4-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
|
|
// CHECK4-NEXT: br label [[IFCONT10:%.*]]
|
|
// CHECK4: else9:
|
|
// CHECK4-NEXT: br label [[IFCONT10]]
|
|
// CHECK4: ifcont10:
|
|
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
|
|
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
|
|
// CHECK4: then12:
|
|
// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
|
|
// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
|
|
// CHECK4-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
|
|
// CHECK4-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
|
|
// CHECK4-NEXT: br label [[IFCONT14:%.*]]
|
|
// CHECK4: else13:
|
|
// CHECK4-NEXT: br label [[IFCONT14]]
|
|
// CHECK4: ifcont14:
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
|
|
// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
|
|
// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
|
|
// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
|
|
// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
|
|
// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
|
|
// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22
|
|
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK4-NEXT: entry:
|
|
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
|
|
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
|
|
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
|
|
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
|
|
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
|
|
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
|
|
// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
|
|
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
|
|
// CHECK4-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
|
|
// CHECK5-SAME: () #[[ATTR0:[0-9]+]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
|
|
// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
|
|
// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
|
|
// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
|
|
// CHECK5: .await.work:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
|
|
// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
|
|
// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
|
|
// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
|
|
// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
|
|
// CHECK5: .select.workers:
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
|
|
// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
|
|
// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
|
|
// CHECK5: .execute.parallel:
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
|
|
// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
|
|
// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
|
|
// CHECK5: .terminate.parallel:
|
|
// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
|
|
// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
|
|
// CHECK5: .barrier.parallel:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
|
|
// CHECK5: .exit:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
|
|
// CHECK5-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
|
|
// CHECK5-NEXT: [[E7:%.*]] = alloca double, align 8
|
|
// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
|
|
// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
|
|
// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
|
|
// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
|
|
// CHECK5: .worker:
|
|
// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
|
|
// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
|
|
// CHECK5: .mastercheck:
|
|
// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
|
|
// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
|
|
// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
|
|
// CHECK5: .master:
|
|
// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
|
|
// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
|
|
// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
|
|
// CHECK5-NEXT: store double [[TMP7]], double* [[E7]], align 8
|
|
// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
|
|
// CHECK5: .termination.notifier:
|
|
// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
|
|
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK5-NEXT: br label [[DOTEXIT]]
|
|
// CHECK5: .exit:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
|
|
// CHECK5-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
|
|
// CHECK5-NEXT: store double 0.000000e+00, double* [[E1]], align 8
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8
|
|
// CHECK5-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00
|
|
// CHECK5-NEXT: store double [[ADD]], double* [[E1]], align 8
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 1024, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
|
|
// CHECK5-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK5: .omp.reduction.then:
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8
|
|
// CHECK5-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]]
|
|
// CHECK5-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
|
|
// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
|
|
// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK5: .omp.reduction.done:
|
|
// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
|
|
// CHECK5-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
|
|
// CHECK5-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8
|
|
// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
|
|
// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
|
|
// CHECK5-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4
|
|
// CHECK5-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK5-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK5-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
|
|
// CHECK5-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK5-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK5-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
|
|
// CHECK5-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
|
|
// CHECK5-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK5-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
|
|
// CHECK5-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
|
|
// CHECK5-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
|
|
// CHECK5-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK5: then:
|
|
// CHECK5-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
|
|
// CHECK5-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK5: else:
|
|
// CHECK5-NEXT: br label [[IFCONT]]
|
|
// CHECK5: ifcont:
|
|
// CHECK5-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK5-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
|
|
// CHECK5-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK5: then4:
|
|
// CHECK5-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
|
|
// CHECK5-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
|
|
// CHECK5-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
|
|
// CHECK5-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
|
|
// CHECK5-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
|
|
// CHECK5-NEXT: store double [[TMP46]], double* [[TMP45]], align 8
|
|
// CHECK5-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK5: else5:
|
|
// CHECK5-NEXT: br label [[IFCONT6]]
|
|
// CHECK5: ifcont6:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
|
|
// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK5-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK5-NEXT: br label [[PRECOND:%.*]]
|
|
// CHECK5: precond:
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
|
|
// CHECK5-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
|
|
// CHECK5: body:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK5: then:
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK5-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK5-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK5: else:
|
|
// CHECK5-NEXT: br label [[IFCONT]]
|
|
// CHECK5: ifcont:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
|
|
// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK5: then4:
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
|
|
// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
|
|
// CHECK5-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4
|
|
// CHECK5-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK5: else5:
|
|
// CHECK5-NEXT: br label [[IFCONT6]]
|
|
// CHECK5: ifcont6:
|
|
// CHECK5-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
|
|
// CHECK5-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK5-NEXT: br label [[PRECOND]]
|
|
// CHECK5: exit:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
|
|
// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
|
|
// CHECK5-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
|
|
// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
|
|
// CHECK5-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
|
|
// CHECK5-SAME: () #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
|
|
// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
|
|
// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
|
|
// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
|
|
// CHECK5: .await.work:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
|
|
// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
|
|
// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
|
|
// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
|
|
// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
|
|
// CHECK5: .select.workers:
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
|
|
// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
|
|
// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
|
|
// CHECK5: .execute.parallel:
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
|
|
// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
|
|
// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
|
|
// CHECK5: .terminate.parallel:
|
|
// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
|
|
// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
|
|
// CHECK5: .barrier.parallel:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
|
|
// CHECK5: .exit:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
|
|
// CHECK5-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
|
|
// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
|
|
// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
|
|
// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
|
|
// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
|
|
// CHECK5: .worker:
|
|
// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
|
|
// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
|
|
// CHECK5: .mastercheck:
|
|
// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
|
|
// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
|
|
// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
|
|
// CHECK5: .master:
|
|
// CHECK5-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
|
|
// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
|
|
// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4
|
|
// CHECK5-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
|
|
// CHECK5-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4
|
|
// CHECK5-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
|
|
// CHECK5-NEXT: store float [[TMP8]], float* [[D9]], align 4
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
|
|
// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
|
|
// CHECK5: .termination.notifier:
|
|
// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
|
|
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK5-NEXT: br label [[DOTEXIT]]
|
|
// CHECK5: .exit:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1
|
|
// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
|
|
// CHECK5-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2*
|
|
// CHECK5-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0
|
|
// CHECK5-NEXT: store i8 0, i8* [[C1]], align 4
|
|
// CHECK5-NEXT: store float 1.000000e+00, float* [[D2]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4
|
|
// CHECK5-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32
|
|
// CHECK5-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
|
|
// CHECK5-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
|
|
// CHECK5-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4
|
|
// CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01
|
|
// CHECK5-NEXT: store float [[MUL]], float* [[D2]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
|
|
// CHECK5-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK5: .omp.reduction.then:
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1
|
|
// CHECK5-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4
|
|
// CHECK5-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32
|
|
// CHECK5-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
|
|
// CHECK5-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
|
|
// CHECK5-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4
|
|
// CHECK5-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4
|
|
// CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]]
|
|
// CHECK5-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
|
|
// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
|
|
// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK5: .omp.reduction.done:
|
|
// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]])
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
|
|
// CHECK5-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
|
|
// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
|
|
// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK5-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
|
|
// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4
|
|
// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
|
|
// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1
|
|
// CHECK5-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
|
|
// CHECK5-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
|
|
// CHECK5-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
|
|
// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
|
|
// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
|
|
// CHECK5-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4
|
|
// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1
|
|
// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
|
|
// CHECK5-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4
|
|
// CHECK5-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK5-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
|
|
// CHECK5-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK5-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK5-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
|
|
// CHECK5-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
|
|
// CHECK5-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK5-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
|
|
// CHECK5-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
|
|
// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
|
|
// CHECK5-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK5: then:
|
|
// CHECK5-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK5: else:
|
|
// CHECK5-NEXT: br label [[IFCONT]]
|
|
// CHECK5: ifcont:
|
|
// CHECK5-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK5-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
|
|
// CHECK5-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK5: then6:
|
|
// CHECK5-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4
|
|
// CHECK5-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
|
|
// CHECK5-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
|
|
// CHECK5-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1
|
|
// CHECK5-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
|
|
// CHECK5-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
|
|
// CHECK5-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
|
|
// CHECK5-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
|
|
// CHECK5-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
|
|
// CHECK5-NEXT: store float [[TMP62]], float* [[TMP61]], align 4
|
|
// CHECK5-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK5: else7:
|
|
// CHECK5-NEXT: br label [[IFCONT8]]
|
|
// CHECK5: ifcont8:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
|
|
// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK5: then:
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
|
|
// CHECK5-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
|
|
// CHECK5-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK5: else:
|
|
// CHECK5-NEXT: br label [[IFCONT]]
|
|
// CHECK5: ifcont:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
|
|
// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK5: then4:
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
|
|
// CHECK5-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1
|
|
// CHECK5-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK5: else5:
|
|
// CHECK5-NEXT: br label [[IFCONT6]]
|
|
// CHECK5: ifcont6:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
|
|
// CHECK5: then8:
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
|
|
// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
|
|
// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
|
|
// CHECK5-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
|
|
// CHECK5-NEXT: br label [[IFCONT10:%.*]]
|
|
// CHECK5: else9:
|
|
// CHECK5-NEXT: br label [[IFCONT10]]
|
|
// CHECK5: ifcont10:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
|
|
// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
|
|
// CHECK5: then12:
|
|
// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4
|
|
// CHECK5-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
|
|
// CHECK5-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4
|
|
// CHECK5-NEXT: br label [[IFCONT14:%.*]]
|
|
// CHECK5: else13:
|
|
// CHECK5-NEXT: br label [[IFCONT14]]
|
|
// CHECK5: ifcont14:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
|
|
// CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
|
|
// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
|
|
// CHECK5-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
|
|
// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
|
|
// CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
|
|
// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
|
|
// CHECK5-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
|
|
// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
|
|
// CHECK5-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
|
|
// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
|
|
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
|
|
// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
|
|
// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
|
|
// CHECK5: .execute:
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
|
|
// CHECK5-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK5-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]]
|
|
// CHECK5: .omp.deinit:
|
|
// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
|
|
// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
|
|
// CHECK5: .exit:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__9
|
|
// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
|
|
// CHECK5-NEXT: store i32 0, i32* [[A1]], align 4
|
|
// CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
|
|
// CHECK5-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK5: .omp.reduction.then:
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
|
|
// CHECK5-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
|
|
// CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
|
|
// CHECK5-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
|
|
// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
|
|
// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK5: cond.true:
|
|
// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK5-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK5: cond.false:
|
|
// CHECK5-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK5-NEXT: br label [[COND_END]]
|
|
// CHECK5: cond.end:
|
|
// CHECK5-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
|
|
// CHECK5-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
|
|
// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
|
|
// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK5: .omp.reduction.done:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__10
|
|
// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
|
|
// CHECK5-NEXT: store i32 0, i32* [[A1]], align 4
|
|
// CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
|
|
// CHECK5-NEXT: store i32 [[OR]], i32* [[A1]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
|
|
// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
|
|
// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK5: cond.true:
|
|
// CHECK5-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK5: cond.false:
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
|
|
// CHECK5-NEXT: br label [[COND_END]]
|
|
// CHECK5: cond.end:
|
|
// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
|
|
// CHECK5-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
|
|
// CHECK5-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
|
|
// CHECK5-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK5: .omp.reduction.then:
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK5-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
|
|
// CHECK5-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK5-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
|
|
// CHECK5-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
|
|
// CHECK5-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
|
|
// CHECK5: cond.true9:
|
|
// CHECK5-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK5-NEXT: br label [[COND_END11:%.*]]
|
|
// CHECK5: cond.false10:
|
|
// CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK5-NEXT: br label [[COND_END11]]
|
|
// CHECK5: cond.end11:
|
|
// CHECK5-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
|
|
// CHECK5-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
|
|
// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
|
|
// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK5: .omp.reduction.done:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
|
|
// CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
|
|
// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
|
|
// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
|
|
// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
|
|
// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
|
|
// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
|
|
// CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
|
|
// CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
|
|
// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
|
|
// CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
|
|
// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
|
|
// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
|
|
// CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
|
|
// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
|
|
// CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
|
|
// CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
|
|
// CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
|
|
// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
|
|
// CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
|
|
// CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK5: then:
|
|
// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK5-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK5: else:
|
|
// CHECK5-NEXT: br label [[IFCONT]]
|
|
// CHECK5: ifcont:
|
|
// CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
|
|
// CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK5: then6:
|
|
// CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
|
|
// CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
|
|
// CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
|
|
// CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
|
|
// CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
|
|
// CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
|
|
// CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
|
|
// CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
|
|
// CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
|
|
// CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
|
|
// CHECK5-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK5: else7:
|
|
// CHECK5-NEXT: br label [[IFCONT8]]
|
|
// CHECK5: ifcont8:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
|
|
// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK5: then:
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
|
|
// CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
|
|
// CHECK5-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK5: else:
|
|
// CHECK5-NEXT: br label [[IFCONT]]
|
|
// CHECK5: ifcont:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
|
|
// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK5: then4:
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
|
|
// CHECK5-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK5: else5:
|
|
// CHECK5-NEXT: br label [[IFCONT6]]
|
|
// CHECK5: ifcont6:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
|
|
// CHECK5: then8:
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
|
|
// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
|
|
// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
|
|
// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
|
|
// CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
|
|
// CHECK5-NEXT: br label [[IFCONT10:%.*]]
|
|
// CHECK5: else9:
|
|
// CHECK5-NEXT: br label [[IFCONT10]]
|
|
// CHECK5: ifcont10:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
|
|
// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
|
|
// CHECK5: then12:
|
|
// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
|
|
// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
|
|
// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
|
|
// CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
|
|
// CHECK5-NEXT: br label [[IFCONT14:%.*]]
|
|
// CHECK5: else13:
|
|
// CHECK5-NEXT: br label [[IFCONT14]]
|
|
// CHECK5: ifcont14:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
|
|
// CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
|
|
// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
|
|
// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
|
|
// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
|
|
// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
|
|
// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
|
|
// CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
|
|
// CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
|
|
// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
|
|
// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
|
|
// CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
|
|
// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
|
|
// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
|
|
// CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
|
|
// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
|
|
// CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
|
|
// CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
|
|
// CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
|
|
// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
|
|
// CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
|
|
// CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK5: then:
|
|
// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK5: else:
|
|
// CHECK5-NEXT: br label [[IFCONT]]
|
|
// CHECK5: ifcont:
|
|
// CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
|
|
// CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK5: then6:
|
|
// CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
|
|
// CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
|
|
// CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
|
|
// CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
|
|
// CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
|
|
// CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
|
|
// CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
|
|
// CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
|
|
// CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
|
|
// CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
|
|
// CHECK5-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK5: else7:
|
|
// CHECK5-NEXT: br label [[IFCONT8]]
|
|
// CHECK5: ifcont8:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
|
|
// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK5: then:
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
|
|
// CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
|
|
// CHECK5-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK5: else:
|
|
// CHECK5-NEXT: br label [[IFCONT]]
|
|
// CHECK5: ifcont:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
|
|
// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK5: then4:
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
|
|
// CHECK5-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK5: else5:
|
|
// CHECK5-NEXT: br label [[IFCONT6]]
|
|
// CHECK5: ifcont6:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
|
|
// CHECK5: then8:
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
|
|
// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
|
|
// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
|
|
// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
|
|
// CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
|
|
// CHECK5-NEXT: br label [[IFCONT10:%.*]]
|
|
// CHECK5: else9:
|
|
// CHECK5-NEXT: br label [[IFCONT10]]
|
|
// CHECK5: ifcont10:
|
|
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
|
|
// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
|
|
// CHECK5: then12:
|
|
// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
|
|
// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
|
|
// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
|
|
// CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
|
|
// CHECK5-NEXT: br label [[IFCONT14:%.*]]
|
|
// CHECK5: else13:
|
|
// CHECK5-NEXT: br label [[IFCONT14]]
|
|
// CHECK5: ifcont14:
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
|
|
// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
|
|
// CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
|
|
// CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
|
|
// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
|
|
// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
|
|
// CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
|
|
// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK5-NEXT: entry:
|
|
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
|
|
// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
|
|
// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
|
|
// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
|
|
// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
|
|
// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
|
|
// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
|
|
// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
|
|
// CHECK5-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
|
|
// CHECK6-SAME: () #[[ATTR0:[0-9]+]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
|
|
// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4
|
|
// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
|
|
// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]]
|
|
// CHECK6: .await.work:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
|
|
// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
|
|
// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
|
|
// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
|
|
// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
|
|
// CHECK6: .select.workers:
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
|
|
// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
|
|
// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
|
|
// CHECK6: .execute.parallel:
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
|
|
// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
|
|
// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
|
|
// CHECK6: .terminate.parallel:
|
|
// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel()
|
|
// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]]
|
|
// CHECK6: .barrier.parallel:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK6-NEXT: br label [[DOTAWAIT_WORK]]
|
|
// CHECK6: .exit:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
|
|
// CHECK6-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
|
|
// CHECK6-NEXT: [[E7:%.*]] = alloca double, align 8
|
|
// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
|
|
// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
|
|
// CHECK6-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
|
|
// CHECK6-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
|
|
// CHECK6: .worker:
|
|
// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
|
|
// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
|
|
// CHECK6: .mastercheck:
|
|
// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
|
|
// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
|
|
// CHECK6-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
|
|
// CHECK6: .master:
|
|
// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
|
|
// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
|
|
// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack()
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
|
|
// CHECK6-NEXT: store double [[TMP7]], double* [[E7]], align 8
|
|
// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
|
|
// CHECK6: .termination.notifier:
|
|
// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1)
|
|
// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK6-NEXT: br label [[DOTEXIT]]
|
|
// CHECK6: .exit:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
|
|
// CHECK6-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
|
|
// CHECK6-NEXT: store double 0.000000e+00, double* [[E1]], align 8
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8
|
|
// CHECK6-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00
|
|
// CHECK6-NEXT: store double [[ADD]], double* [[E1]], align 8
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 2048, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
|
|
// CHECK6-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK6: .omp.reduction.then:
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8
|
|
// CHECK6-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]]
|
|
// CHECK6-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
|
|
// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
|
|
// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK6: .omp.reduction.done:
|
|
// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
|
|
// CHECK6-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
|
|
// CHECK6-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8
|
|
// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
|
|
// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
|
|
// CHECK6-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4
|
|
// CHECK6-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK6-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK6-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
|
|
// CHECK6-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK6-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK6-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
|
|
// CHECK6-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
|
|
// CHECK6-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK6-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
|
|
// CHECK6-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
|
|
// CHECK6-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
|
|
// CHECK6-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK6: then:
|
|
// CHECK6-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
|
|
// CHECK6-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK6: else:
|
|
// CHECK6-NEXT: br label [[IFCONT]]
|
|
// CHECK6: ifcont:
|
|
// CHECK6-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK6-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
|
|
// CHECK6-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK6: then4:
|
|
// CHECK6-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
|
|
// CHECK6-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
|
|
// CHECK6-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
|
|
// CHECK6-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
|
|
// CHECK6-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
|
|
// CHECK6-NEXT: store double [[TMP46]], double* [[TMP45]], align 8
|
|
// CHECK6-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK6: else5:
|
|
// CHECK6-NEXT: br label [[IFCONT6]]
|
|
// CHECK6: ifcont6:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
|
|
// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK6-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK6-NEXT: br label [[PRECOND:%.*]]
|
|
// CHECK6: precond:
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
|
|
// CHECK6-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
|
|
// CHECK6: body:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK6: then:
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK6-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK6-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK6: else:
|
|
// CHECK6-NEXT: br label [[IFCONT]]
|
|
// CHECK6: ifcont:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
|
|
// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK6: then4:
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
|
|
// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
|
|
// CHECK6-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4
|
|
// CHECK6-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK6: else5:
|
|
// CHECK6-NEXT: br label [[IFCONT6]]
|
|
// CHECK6: ifcont6:
|
|
// CHECK6-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
|
|
// CHECK6-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK6-NEXT: br label [[PRECOND]]
|
|
// CHECK6: exit:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
|
|
// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
|
|
// CHECK6-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
|
|
// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
|
|
// CHECK6-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
|
|
// CHECK6-SAME: () #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
|
|
// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4
|
|
// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
|
|
// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]]
|
|
// CHECK6: .await.work:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
|
|
// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
|
|
// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
|
|
// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
|
|
// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
|
|
// CHECK6: .select.workers:
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
|
|
// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
|
|
// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
|
|
// CHECK6: .execute.parallel:
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
|
|
// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
|
|
// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
|
|
// CHECK6: .terminate.parallel:
|
|
// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel()
|
|
// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]]
|
|
// CHECK6: .barrier.parallel:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK6-NEXT: br label [[DOTAWAIT_WORK]]
|
|
// CHECK6: .exit:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
|
|
// CHECK6-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
|
|
// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
|
|
// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
|
|
// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
|
|
// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
|
|
// CHECK6: .worker:
|
|
// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
|
|
// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
|
|
// CHECK6: .mastercheck:
|
|
// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
|
|
// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
|
|
// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
|
|
// CHECK6: .master:
|
|
// CHECK6-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
|
|
// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
|
|
// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack()
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4
|
|
// CHECK6-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
|
|
// CHECK6-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4
|
|
// CHECK6-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
|
|
// CHECK6-NEXT: store float [[TMP8]], float* [[D9]], align 4
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
|
|
// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
|
|
// CHECK6: .termination.notifier:
|
|
// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1)
|
|
// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
|
// CHECK6-NEXT: br label [[DOTEXIT]]
|
|
// CHECK6: .exit:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1
|
|
// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
|
|
// CHECK6-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2*
|
|
// CHECK6-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0
|
|
// CHECK6-NEXT: store i8 0, i8* [[C1]], align 4
|
|
// CHECK6-NEXT: store float 1.000000e+00, float* [[D2]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4
|
|
// CHECK6-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32
|
|
// CHECK6-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
|
|
// CHECK6-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
|
|
// CHECK6-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4
|
|
// CHECK6-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01
|
|
// CHECK6-NEXT: store float [[MUL]], float* [[D2]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
|
|
// CHECK6-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK6: .omp.reduction.then:
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1
|
|
// CHECK6-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4
|
|
// CHECK6-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32
|
|
// CHECK6-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
|
|
// CHECK6-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
|
|
// CHECK6-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4
|
|
// CHECK6-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4
|
|
// CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]]
|
|
// CHECK6-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
|
|
// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
|
|
// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK6: .omp.reduction.done:
|
|
// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]])
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
|
|
// CHECK6-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
|
|
// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
|
|
// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK6-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
|
|
// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4
|
|
// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
|
|
// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1
|
|
// CHECK6-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
|
|
// CHECK6-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
|
|
// CHECK6-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
|
|
// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
|
|
// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
|
|
// CHECK6-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4
|
|
// CHECK6-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1
|
|
// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
|
|
// CHECK6-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4
|
|
// CHECK6-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK6-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
|
|
// CHECK6-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK6-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK6-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
|
|
// CHECK6-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
|
|
// CHECK6-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK6-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
|
|
// CHECK6-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
|
|
// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
|
|
// CHECK6-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK6: then:
|
|
// CHECK6-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK6: else:
|
|
// CHECK6-NEXT: br label [[IFCONT]]
|
|
// CHECK6: ifcont:
|
|
// CHECK6-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK6-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
|
|
// CHECK6-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK6: then6:
|
|
// CHECK6-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4
|
|
// CHECK6-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
|
|
// CHECK6-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
|
|
// CHECK6-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1
|
|
// CHECK6-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
|
|
// CHECK6-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
|
|
// CHECK6-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
|
|
// CHECK6-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
|
|
// CHECK6-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
|
|
// CHECK6-NEXT: store float [[TMP62]], float* [[TMP61]], align 4
|
|
// CHECK6-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK6: else7:
|
|
// CHECK6-NEXT: br label [[IFCONT8]]
|
|
// CHECK6: ifcont8:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
|
|
// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK6: then:
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
|
|
// CHECK6-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
|
|
// CHECK6-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK6: else:
|
|
// CHECK6-NEXT: br label [[IFCONT]]
|
|
// CHECK6: ifcont:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
|
|
// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK6: then4:
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
|
|
// CHECK6-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1
|
|
// CHECK6-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK6: else5:
|
|
// CHECK6-NEXT: br label [[IFCONT6]]
|
|
// CHECK6: ifcont6:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
|
|
// CHECK6: then8:
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
|
|
// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
|
|
// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
|
|
// CHECK6-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
|
|
// CHECK6-NEXT: br label [[IFCONT10:%.*]]
|
|
// CHECK6: else9:
|
|
// CHECK6-NEXT: br label [[IFCONT10]]
|
|
// CHECK6: ifcont10:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
|
|
// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
|
|
// CHECK6: then12:
|
|
// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4
|
|
// CHECK6-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
|
|
// CHECK6-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4
|
|
// CHECK6-NEXT: br label [[IFCONT14:%.*]]
|
|
// CHECK6: else13:
|
|
// CHECK6-NEXT: br label [[IFCONT14]]
|
|
// CHECK6: ifcont14:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
|
|
// CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
|
|
// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
|
|
// CHECK6-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
|
|
// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
|
|
// CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
|
|
// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
|
|
// CHECK6-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
|
|
// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
|
|
// CHECK6-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
|
|
// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
|
|
// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
|
|
// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
|
|
// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
|
|
// CHECK6: .execute:
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
|
|
// CHECK6-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK6-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]]
|
|
// CHECK6: .omp.deinit:
|
|
// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
|
|
// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
|
|
// CHECK6: .exit:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__9
|
|
// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
|
|
// CHECK6-NEXT: store i32 0, i32* [[A1]], align 4
|
|
// CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
|
|
// CHECK6-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK6: .omp.reduction.then:
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
|
|
// CHECK6-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
|
|
// CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
|
|
// CHECK6-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
|
|
// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
|
|
// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK6: cond.true:
|
|
// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK6-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK6: cond.false:
|
|
// CHECK6-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK6-NEXT: br label [[COND_END]]
|
|
// CHECK6: cond.end:
|
|
// CHECK6-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
|
|
// CHECK6-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
|
|
// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
|
|
// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK6: .omp.reduction.done:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__10
|
|
// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
|
|
// CHECK6-NEXT: store i32 0, i32* [[A1]], align 4
|
|
// CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
|
|
// CHECK6-NEXT: store i32 [[OR]], i32* [[A1]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
|
|
// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
|
|
// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK6: cond.true:
|
|
// CHECK6-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK6: cond.false:
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
|
|
// CHECK6-NEXT: br label [[COND_END]]
|
|
// CHECK6: cond.end:
|
|
// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
|
|
// CHECK6-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
|
|
// CHECK6-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
|
|
// CHECK6-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK6: .omp.reduction.then:
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK6-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
|
|
// CHECK6-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK6-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK6-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
|
|
// CHECK6-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
|
|
// CHECK6-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
|
|
// CHECK6: cond.true9:
|
|
// CHECK6-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK6-NEXT: br label [[COND_END11:%.*]]
|
|
// CHECK6: cond.false10:
|
|
// CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK6-NEXT: br label [[COND_END11]]
|
|
// CHECK6: cond.end11:
|
|
// CHECK6-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
|
|
// CHECK6-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
|
|
// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
|
|
// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK6: .omp.reduction.done:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
|
|
// CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
|
|
// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
|
|
// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
|
|
// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
|
|
// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
|
|
// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
|
|
// CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
|
|
// CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
|
|
// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
|
|
// CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
|
|
// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
|
|
// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
|
|
// CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
|
|
// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
|
|
// CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
|
|
// CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
|
|
// CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
|
|
// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
|
|
// CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
|
|
// CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK6: then:
|
|
// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK6-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK6: else:
|
|
// CHECK6-NEXT: br label [[IFCONT]]
|
|
// CHECK6: ifcont:
|
|
// CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
|
|
// CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK6: then6:
|
|
// CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
|
|
// CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
|
|
// CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
|
|
// CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
|
|
// CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
|
|
// CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
|
|
// CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
|
|
// CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
|
|
// CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
|
|
// CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
|
|
// CHECK6-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK6: else7:
|
|
// CHECK6-NEXT: br label [[IFCONT8]]
|
|
// CHECK6: ifcont8:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
|
|
// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK6: then:
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
|
|
// CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
|
|
// CHECK6-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK6: else:
|
|
// CHECK6-NEXT: br label [[IFCONT]]
|
|
// CHECK6: ifcont:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
|
|
// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK6: then4:
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
|
|
// CHECK6-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK6: else5:
|
|
// CHECK6-NEXT: br label [[IFCONT6]]
|
|
// CHECK6: ifcont6:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
|
|
// CHECK6: then8:
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
|
|
// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
|
|
// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
|
|
// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
|
|
// CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
|
|
// CHECK6-NEXT: br label [[IFCONT10:%.*]]
|
|
// CHECK6: else9:
|
|
// CHECK6-NEXT: br label [[IFCONT10]]
|
|
// CHECK6: ifcont10:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
|
|
// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
|
|
// CHECK6: then12:
|
|
// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
|
|
// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
|
|
// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
|
|
// CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
|
|
// CHECK6-NEXT: br label [[IFCONT14:%.*]]
|
|
// CHECK6: else13:
|
|
// CHECK6-NEXT: br label [[IFCONT14]]
|
|
// CHECK6: ifcont14:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
|
|
// CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
|
|
// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
|
|
// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
|
|
// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
|
|
// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
|
|
// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
|
|
// CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
|
|
// CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
|
|
// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
|
|
// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
|
|
// CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
|
|
// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
|
|
// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
|
|
// CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
|
|
// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
|
|
// CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
|
|
// CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
|
|
// CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
|
|
// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
|
|
// CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
|
|
// CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK6: then:
|
|
// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK6: else:
|
|
// CHECK6-NEXT: br label [[IFCONT]]
|
|
// CHECK6: ifcont:
|
|
// CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
|
|
// CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK6: then6:
|
|
// CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
|
|
// CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
|
|
// CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
|
|
// CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
|
|
// CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
|
|
// CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
|
|
// CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
|
|
// CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
|
|
// CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
|
|
// CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
|
|
// CHECK6-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK6: else7:
|
|
// CHECK6-NEXT: br label [[IFCONT8]]
|
|
// CHECK6: ifcont8:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
|
|
// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK6: then:
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
|
|
// CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
|
|
// CHECK6-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK6: else:
|
|
// CHECK6-NEXT: br label [[IFCONT]]
|
|
// CHECK6: ifcont:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
|
|
// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK6: then4:
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
|
|
// CHECK6-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK6: else5:
|
|
// CHECK6-NEXT: br label [[IFCONT6]]
|
|
// CHECK6: ifcont6:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
|
|
// CHECK6: then8:
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
|
|
// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
|
|
// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
|
|
// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
|
|
// CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
|
|
// CHECK6-NEXT: br label [[IFCONT10:%.*]]
|
|
// CHECK6: else9:
|
|
// CHECK6-NEXT: br label [[IFCONT10]]
|
|
// CHECK6: ifcont10:
|
|
// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
|
|
// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
|
|
// CHECK6: then12:
|
|
// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
|
|
// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
|
|
// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
|
|
// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
|
|
// CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
|
|
// CHECK6-NEXT: br label [[IFCONT14:%.*]]
|
|
// CHECK6: else13:
|
|
// CHECK6-NEXT: br label [[IFCONT14]]
|
|
// CHECK6: ifcont14:
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
|
|
// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
|
|
// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
|
|
// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
|
|
// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
|
|
// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
|
|
// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
|
|
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
|
|
// CHECK6-NEXT: entry:
|
|
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
|
|
// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
|
|
// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
|
|
// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
|
|
// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
|
|
// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
|
|
// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
|
|
// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
|
|
// CHECK6-NEXT: ret void
|
|
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20
|
|
// CHECK1-SAME: (i64 [[E:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca i64, align 8
|
|
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: store i64 [[E]], i64* [[E_ADDR]], align 8
|
|
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[E_ADDR]] to double*
|
|
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
|
|
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
|
|
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK1: user_code.entry:
|
|
// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[CONV]], align 8
|
|
// CHECK1-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8)
|
|
// CHECK1-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
|
|
// CHECK1-NEXT: store double [[TMP1]], double* [[E_ON_STACK]], align 8
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E_ON_STACK]]) #[[ATTR3:[0-9]+]]
|
|
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[E1]], i64 8)
|
|
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
|
|
// CHECK1-NEXT: ret void
|
|
// CHECK1: worker.exit:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca double*, align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
|
|
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: store double* [[E]], double** [[E_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8
|
|
// CHECK1-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8)
|
|
// CHECK1-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
|
|
// CHECK1-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8
|
|
// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8
|
|
// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
|
|
// CHECK1-NEXT: store double [[ADD]], double* [[E_ON_STACK]], align 8
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 1024, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
|
|
// CHECK1-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK1: .omp.reduction.then:
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = load double, double* [[TMP0]], align 8
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8
|
|
// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]]
|
|
// CHECK1-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
|
|
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
|
|
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK1: .omp.reduction.done:
|
|
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[E1]], i64 8)
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
|
|
// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
|
|
// CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK1-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
|
|
// CHECK1-NEXT: [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
|
|
// CHECK1-NEXT: store i64 [[TMP20]], i64* [[TMP16]], align 8
|
|
// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
|
|
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
|
|
// CHECK1-NEXT: [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP11]], align 8
|
|
// CHECK1-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK1-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK1-NEXT: [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
|
|
// CHECK1-NEXT: [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK1-NEXT: [[TMP29:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK1-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
|
|
// CHECK1-NEXT: [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
|
|
// CHECK1-NEXT: [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK1-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
|
|
// CHECK1-NEXT: [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
|
|
// CHECK1-NEXT: [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
|
|
// CHECK1-NEXT: br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK1: then:
|
|
// CHECK1-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
|
|
// CHECK1-NEXT: [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR3]]
|
|
// CHECK1-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK1: else:
|
|
// CHECK1-NEXT: br label [[IFCONT]]
|
|
// CHECK1: ifcont:
|
|
// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK1-NEXT: [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
|
|
// CHECK1-NEXT: br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK1: then4:
|
|
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 8
|
|
// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 8
|
|
// CHECK1-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
|
|
// CHECK1-NEXT: [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
|
|
// CHECK1-NEXT: [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
|
|
// CHECK1-NEXT: store double [[TMP47]], double* [[TMP46]], align 8
|
|
// CHECK1-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK1: else5:
|
|
// CHECK1-NEXT: br label [[IFCONT6]]
|
|
// CHECK1: ifcont6:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
|
|
// CHECK1-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK1-NEXT: br label [[PRECOND:%.*]]
|
|
// CHECK1: precond:
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2
|
|
// CHECK1-NEXT: br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
|
|
// CHECK1: body:
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK1: then:
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 8
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
|
|
// CHECK1-NEXT: store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
|
|
// CHECK1-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK1: else:
|
|
// CHECK1-NEXT: br label [[IFCONT]]
|
|
// CHECK1: ifcont:
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
|
|
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
|
|
// CHECK1: then2:
|
|
// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 8
|
|
// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
|
|
// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
|
|
// CHECK1-NEXT: [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4
|
|
// CHECK1-NEXT: store i32 [[TMP22]], i32* [[TMP21]], align 4
|
|
// CHECK1-NEXT: br label [[IFCONT4:%.*]]
|
|
// CHECK1: else3:
|
|
// CHECK1-NEXT: br label [[IFCONT4]]
|
|
// CHECK1: ifcont4:
|
|
// CHECK1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
|
|
// CHECK1-NEXT: store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK1-NEXT: br label [[PRECOND]]
|
|
// CHECK1: exit:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
|
|
// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
|
|
// CHECK1-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
|
|
// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
|
|
// CHECK1-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
|
|
// CHECK1-SAME: (i64 [[C:%.*]], i64 [[D:%.*]]) #[[ATTR0]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i64, align 8
|
|
// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8
|
|
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: store i64 [[C]], i64* [[C_ADDR]], align 8
|
|
// CHECK1-NEXT: store i64 [[D]], i64* [[D_ADDR]], align 8
|
|
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[C_ADDR]] to i8*
|
|
// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[D_ADDR]] to float*
|
|
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
|
|
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
|
|
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK1: user_code.entry:
|
|
// CHECK1-NEXT: [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1
|
|
// CHECK1-NEXT: [[C2:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 1)
|
|
// CHECK1-NEXT: store i8 [[TMP1]], i8* [[C2]], align 1
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = load float, float* [[CONV1]], align 4
|
|
// CHECK1-NEXT: [[D3:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4)
|
|
// CHECK1-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float*
|
|
// CHECK1-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK1-NEXT: store i32 [[TMP3]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR3]]
|
|
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[D3]], i64 4)
|
|
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[C2]], i64 1)
|
|
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
|
|
// CHECK1-NEXT: ret void
|
|
// CHECK1: worker.exit:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
|
|
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
|
|
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 8
|
|
// CHECK1-NEXT: store float* [[D]], float** [[D_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8
|
|
// CHECK1-NEXT: [[C1:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 1)
|
|
// CHECK1-NEXT: [[D2:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4)
|
|
// CHECK1-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
|
|
// CHECK1-NEXT: store i8 0, i8* [[C1]], align 1
|
|
// CHECK1-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
|
|
// CHECK1-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32
|
|
// CHECK1-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
|
|
// CHECK1-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
|
|
// CHECK1-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4
|
|
// CHECK1-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
|
|
// CHECK1-NEXT: store float [[MUL]], float* [[D_ON_STACK]], align 4
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: store i8* [[C1]], i8** [[TMP6]], align 8
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1
|
|
// CHECK1-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK1: .omp.reduction.then:
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1
|
|
// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP13]] to i32
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = load i8, i8* [[C1]], align 1
|
|
// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP14]] to i32
|
|
// CHECK1-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
|
|
// CHECK1-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
|
|
// CHECK1-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = load float, float* [[TMP1]], align 4
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4
|
|
// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]]
|
|
// CHECK1-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
|
|
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
|
|
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK1: .omp.reduction.done:
|
|
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[D2]], i64 4)
|
|
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[C1]], i64 1)
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16
|
|
// CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP16]])
|
|
// CHECK1-NEXT: [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
|
|
// CHECK1-NEXT: store i8 [[TMP18]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
|
|
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1
|
|
// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
|
|
// CHECK1-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8
|
|
// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
|
|
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to float*
|
|
// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[TMP24]], i64 1
|
|
// CHECK1-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
|
|
// CHECK1-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP24]] to i32*
|
|
// CHECK1-NEXT: [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
|
|
// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
|
|
// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK1-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
|
|
// CHECK1-NEXT: store i32 [[TMP32]], i32* [[TMP28]], align 4
|
|
// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1
|
|
// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i64 1
|
|
// CHECK1-NEXT: [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP35]], i8** [[TMP23]], align 8
|
|
// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK1-NEXT: [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
|
|
// CHECK1-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK1-NEXT: [[TMP41:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK1-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP41]], 0
|
|
// CHECK1-NEXT: [[TMP43:%.*]] = and i1 [[TMP40]], [[TMP42]]
|
|
// CHECK1-NEXT: [[TMP44:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK1-NEXT: [[TMP45:%.*]] = and i1 [[TMP43]], [[TMP44]]
|
|
// CHECK1-NEXT: [[TMP46:%.*]] = or i1 [[TMP36]], [[TMP39]]
|
|
// CHECK1-NEXT: [[TMP47:%.*]] = or i1 [[TMP46]], [[TMP45]]
|
|
// CHECK1-NEXT: br i1 [[TMP47]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK1: then:
|
|
// CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK1-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP48]], i8* [[TMP49]]) #[[ATTR3]]
|
|
// CHECK1-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK1: else:
|
|
// CHECK1-NEXT: br label [[IFCONT]]
|
|
// CHECK1: ifcont:
|
|
// CHECK1-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK1-NEXT: [[TMP51:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP52:%.*]] = and i1 [[TMP50]], [[TMP51]]
|
|
// CHECK1-NEXT: br i1 [[TMP52]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
|
|
// CHECK1: then5:
|
|
// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8
|
|
// CHECK1-NEXT: [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP56:%.*]] = load i8*, i8** [[TMP55]], align 8
|
|
// CHECK1-NEXT: [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
|
|
// CHECK1-NEXT: store i8 [[TMP57]], i8* [[TMP56]], align 1
|
|
// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8
|
|
// CHECK1-NEXT: [[TMP60:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP61:%.*]] = load i8*, i8** [[TMP60]], align 8
|
|
// CHECK1-NEXT: [[TMP62:%.*]] = bitcast i8* [[TMP59]] to float*
|
|
// CHECK1-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP61]] to float*
|
|
// CHECK1-NEXT: [[TMP64:%.*]] = load float, float* [[TMP62]], align 4
|
|
// CHECK1-NEXT: store float [[TMP64]], float* [[TMP63]], align 4
|
|
// CHECK1-NEXT: br label [[IFCONT7:%.*]]
|
|
// CHECK1: else6:
|
|
// CHECK1-NEXT: br label [[IFCONT7]]
|
|
// CHECK1: ifcont7:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK1: then:
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(3)* [[TMP10]] to i8 addrspace(3)*
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP9]], align 1
|
|
// CHECK1-NEXT: store volatile i8 [[TMP12]], i8 addrspace(3)* [[TMP11]], align 1
|
|
// CHECK1-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK1: else:
|
|
// CHECK1-NEXT: br label [[IFCONT]]
|
|
// CHECK1: ifcont:
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
|
|
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
|
|
// CHECK1: then2:
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(3)* [[TMP14]] to i8 addrspace(3)*
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
|
|
// CHECK1-NEXT: [[TMP18:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP15]], align 1
|
|
// CHECK1-NEXT: store i8 [[TMP18]], i8* [[TMP17]], align 1
|
|
// CHECK1-NEXT: br label [[IFCONT4:%.*]]
|
|
// CHECK1: else3:
|
|
// CHECK1-NEXT: br label [[IFCONT4]]
|
|
// CHECK1: ifcont4:
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK1-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK1: then6:
|
|
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 8
|
|
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i32*
|
|
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
|
|
// CHECK1-NEXT: store volatile i32 [[TMP23]], i32 addrspace(3)* [[TMP22]], align 4
|
|
// CHECK1-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK1: else7:
|
|
// CHECK1-NEXT: br label [[IFCONT8]]
|
|
// CHECK1: ifcont8:
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP24]]
|
|
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
|
|
// CHECK1: then10:
|
|
// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP27:%.*]] = load i8*, i8** [[TMP26]], align 8
|
|
// CHECK1-NEXT: [[TMP28:%.*]] = bitcast i8* [[TMP27]] to i32*
|
|
// CHECK1-NEXT: [[TMP29:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP25]], align 4
|
|
// CHECK1-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 4
|
|
// CHECK1-NEXT: br label [[IFCONT12:%.*]]
|
|
// CHECK1: else11:
|
|
// CHECK1-NEXT: br label [[IFCONT12]]
|
|
// CHECK1: ifcont12:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
|
|
// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
|
|
// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
|
|
// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
|
|
// CHECK1-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
|
|
// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
|
|
// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
|
|
// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
|
|
// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
|
|
// CHECK1-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
|
|
// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
|
|
// CHECK1-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
|
|
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
|
|
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
|
|
// CHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8
|
|
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
|
|
// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16*
|
|
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true)
|
|
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
|
|
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK1: user_code.entry:
|
|
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
|
|
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK1-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR3]]
|
|
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
|
|
// CHECK1-NEXT: ret void
|
|
// CHECK1: worker.exit:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__9
|
|
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8
|
|
// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
|
|
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
|
|
// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
|
|
// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4
|
|
// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i64 2)
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
|
|
// CHECK1-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK1: .omp.reduction.then:
|
|
// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
|
|
// CHECK1-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
|
|
// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
|
|
// CHECK1-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
|
|
// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
|
|
// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK1: cond.true:
|
|
// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK1-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK1: cond.false:
|
|
// CHECK1-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK1-NEXT: br label [[COND_END]]
|
|
// CHECK1: cond.end:
|
|
// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
|
|
// CHECK1-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
|
|
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
|
|
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK1: .omp.reduction.done:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__10
|
|
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8
|
|
// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
|
|
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
|
|
// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
|
|
// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4
|
|
// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
|
|
// CHECK1-NEXT: store i32 [[OR]], i32* [[A1]], align 4
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
|
|
// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
|
|
// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK1: cond.true:
|
|
// CHECK1-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK1: cond.false:
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
|
|
// CHECK1-NEXT: br label [[COND_END]]
|
|
// CHECK1: cond.end:
|
|
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
|
|
// CHECK1-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
|
|
// CHECK1-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
|
|
// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK1: .omp.reduction.then:
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK1-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
|
|
// CHECK1-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK1-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
|
|
// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
|
|
// CHECK1-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
|
|
// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
|
|
// CHECK1: cond.true9:
|
|
// CHECK1-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK1-NEXT: br label [[COND_END11:%.*]]
|
|
// CHECK1: cond.false10:
|
|
// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK1-NEXT: br label [[COND_END11]]
|
|
// CHECK1: cond.end11:
|
|
// CHECK1-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
|
|
// CHECK1-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
|
|
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
|
|
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK1: .omp.reduction.done:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK1-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
|
|
// CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
|
|
// CHECK1-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
|
|
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
|
|
// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
|
|
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP11]], align 8
|
|
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 8
|
|
// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
|
|
// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
|
|
// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
|
|
// CHECK1-NEXT: [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
|
|
// CHECK1-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
|
|
// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK1-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
|
|
// CHECK1-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
|
|
// CHECK1-NEXT: store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
|
|
// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
|
|
// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
|
|
// CHECK1-NEXT: [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP36]], i8** [[TMP24]], align 8
|
|
// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK1-NEXT: [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
|
|
// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK1-NEXT: [[TMP42:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK1-NEXT: [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
|
|
// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
|
|
// CHECK1-NEXT: [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK1-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
|
|
// CHECK1-NEXT: [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
|
|
// CHECK1-NEXT: [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
|
|
// CHECK1-NEXT: br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK1: then:
|
|
// CHECK1-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK1-NEXT: [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
|
|
// CHECK1-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK1: else:
|
|
// CHECK1-NEXT: br label [[IFCONT]]
|
|
// CHECK1: ifcont:
|
|
// CHECK1-NEXT: [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK1-NEXT: [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
|
|
// CHECK1-NEXT: br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
|
|
// CHECK1: then5:
|
|
// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
|
|
// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8
|
|
// CHECK1-NEXT: [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
|
|
// CHECK1-NEXT: [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
|
|
// CHECK1-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
|
|
// CHECK1-NEXT: store i32 [[TMP60]], i32* [[TMP59]], align 4
|
|
// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
|
|
// CHECK1-NEXT: [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 8
|
|
// CHECK1-NEXT: [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
|
|
// CHECK1-NEXT: [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
|
|
// CHECK1-NEXT: [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP67]], i16* [[TMP66]], align 2
|
|
// CHECK1-NEXT: br label [[IFCONT7:%.*]]
|
|
// CHECK1: else6:
|
|
// CHECK1-NEXT: br label [[IFCONT7]]
|
|
// CHECK1: ifcont7:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK1: then:
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK1-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK1-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK1: else:
|
|
// CHECK1-NEXT: br label [[IFCONT]]
|
|
// CHECK1: ifcont:
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
|
|
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
|
|
// CHECK1: then2:
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8
|
|
// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
|
|
// CHECK1-NEXT: [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
|
|
// CHECK1-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 4
|
|
// CHECK1-NEXT: br label [[IFCONT4:%.*]]
|
|
// CHECK1: else3:
|
|
// CHECK1-NEXT: br label [[IFCONT4]]
|
|
// CHECK1: ifcont4:
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK1-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK1: then6:
|
|
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 8
|
|
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
|
|
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
|
|
// CHECK1-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
|
|
// CHECK1-NEXT: store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
|
|
// CHECK1-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK1: else7:
|
|
// CHECK1-NEXT: br label [[IFCONT8]]
|
|
// CHECK1: ifcont8:
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
|
|
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
|
|
// CHECK1: then10:
|
|
// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
|
|
// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 8
|
|
// CHECK1-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
|
|
// CHECK1-NEXT: [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP31]], i16* [[TMP30]], align 2
|
|
// CHECK1-NEXT: br label [[IFCONT12:%.*]]
|
|
// CHECK1: else11:
|
|
// CHECK1-NEXT: br label [[IFCONT12]]
|
|
// CHECK1: ifcont12:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK1-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
|
|
// CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
|
|
// CHECK1-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
|
|
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
|
|
// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
|
|
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP11]], align 8
|
|
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 8
|
|
// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
|
|
// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
|
|
// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
|
|
// CHECK1-NEXT: [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
|
|
// CHECK1-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
|
|
// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK1-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
|
|
// CHECK1-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
|
|
// CHECK1-NEXT: store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
|
|
// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
|
|
// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
|
|
// CHECK1-NEXT: [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP36]], i8** [[TMP24]], align 8
|
|
// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK1-NEXT: [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
|
|
// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK1-NEXT: [[TMP42:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK1-NEXT: [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
|
|
// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
|
|
// CHECK1-NEXT: [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK1-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
|
|
// CHECK1-NEXT: [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
|
|
// CHECK1-NEXT: [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
|
|
// CHECK1-NEXT: br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK1: then:
|
|
// CHECK1-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK1-NEXT: [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
|
|
// CHECK1-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK1: else:
|
|
// CHECK1-NEXT: br label [[IFCONT]]
|
|
// CHECK1: ifcont:
|
|
// CHECK1-NEXT: [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK1-NEXT: [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
|
|
// CHECK1-NEXT: br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
|
|
// CHECK1: then5:
|
|
// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
|
|
// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8
|
|
// CHECK1-NEXT: [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
|
|
// CHECK1-NEXT: [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
|
|
// CHECK1-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
|
|
// CHECK1-NEXT: store i32 [[TMP60]], i32* [[TMP59]], align 4
|
|
// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
|
|
// CHECK1-NEXT: [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 8
|
|
// CHECK1-NEXT: [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
|
|
// CHECK1-NEXT: [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
|
|
// CHECK1-NEXT: [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP67]], i16* [[TMP66]], align 2
|
|
// CHECK1-NEXT: br label [[IFCONT7:%.*]]
|
|
// CHECK1: else6:
|
|
// CHECK1-NEXT: br label [[IFCONT7]]
|
|
// CHECK1: ifcont7:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK1: then:
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK1-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK1-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK1: else:
|
|
// CHECK1-NEXT: br label [[IFCONT]]
|
|
// CHECK1: ifcont:
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
|
|
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
|
|
// CHECK1: then2:
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8
|
|
// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
|
|
// CHECK1-NEXT: [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
|
|
// CHECK1-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 4
|
|
// CHECK1-NEXT: br label [[IFCONT4:%.*]]
|
|
// CHECK1: else3:
|
|
// CHECK1-NEXT: br label [[IFCONT4]]
|
|
// CHECK1: ifcont4:
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK1-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK1: then6:
|
|
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 8
|
|
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
|
|
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
|
|
// CHECK1-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
|
|
// CHECK1-NEXT: store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
|
|
// CHECK1-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK1: else7:
|
|
// CHECK1-NEXT: br label [[IFCONT8]]
|
|
// CHECK1: ifcont8:
|
|
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
|
|
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
|
|
// CHECK1: then10:
|
|
// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
|
|
// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 8
|
|
// CHECK1-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
|
|
// CHECK1-NEXT: [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP31]], i16* [[TMP30]], align 2
|
|
// CHECK1-NEXT: br label [[IFCONT12:%.*]]
|
|
// CHECK1: else11:
|
|
// CHECK1-NEXT: br label [[IFCONT12]]
|
|
// CHECK1: ifcont12:
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
|
|
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
|
|
// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
|
|
// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
|
|
// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
|
|
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
|
|
// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
|
|
// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
|
|
// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
|
|
// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK1-NEXT: entry:
|
|
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
|
|
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
|
|
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
|
|
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
|
|
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
|
|
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
|
|
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
|
|
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
|
|
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
|
|
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
|
|
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
|
|
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
|
|
// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8
|
|
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
|
|
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
|
|
// CHECK1-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20
|
|
// CHECK2-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
|
|
// CHECK2-NEXT: [[E1:%.*]] = alloca double, align 8
|
|
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
|
|
// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
|
|
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK2: user_code.entry:
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[TMP0]], align 8
|
|
// CHECK2-NEXT: store double [[TMP3]], double* [[E1]], align 8
|
|
// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E1]]) #[[ATTR3:[0-9]+]]
|
|
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
|
|
// CHECK2-NEXT: ret void
|
|
// CHECK2: worker.exit:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
|
|
// CHECK2-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 8)
|
|
// CHECK2-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
|
|
// CHECK2-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8
|
|
// CHECK2-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8
|
|
// CHECK2-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
|
|
// CHECK2-NEXT: store double [[ADD]], double* [[E_ON_STACK]], align 8
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 1024, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
|
|
// CHECK2-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK2: .omp.reduction.then:
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = load double, double* [[TMP0]], align 8
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8
|
|
// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]]
|
|
// CHECK2-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
|
|
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
|
|
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK2: .omp.reduction.done:
|
|
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[E1]], i32 8)
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
|
|
// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
|
|
// CHECK2-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK2-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
|
|
// CHECK2-NEXT: [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
|
|
// CHECK2-NEXT: store i64 [[TMP20]], i64* [[TMP16]], align 8
|
|
// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
|
|
// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
|
|
// CHECK2-NEXT: [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP11]], align 4
|
|
// CHECK2-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK2-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK2-NEXT: [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
|
|
// CHECK2-NEXT: [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK2-NEXT: [[TMP29:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK2-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
|
|
// CHECK2-NEXT: [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
|
|
// CHECK2-NEXT: [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK2-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
|
|
// CHECK2-NEXT: [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
|
|
// CHECK2-NEXT: [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
|
|
// CHECK2-NEXT: br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK2: then:
|
|
// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
|
|
// CHECK2-NEXT: [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR3]]
|
|
// CHECK2-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK2: else:
|
|
// CHECK2-NEXT: br label [[IFCONT]]
|
|
// CHECK2: ifcont:
|
|
// CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK2-NEXT: [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
|
|
// CHECK2-NEXT: br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK2: then4:
|
|
// CHECK2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 4
|
|
// CHECK2-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 4
|
|
// CHECK2-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
|
|
// CHECK2-NEXT: [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
|
|
// CHECK2-NEXT: [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
|
|
// CHECK2-NEXT: store double [[TMP47]], double* [[TMP46]], align 8
|
|
// CHECK2-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK2: else5:
|
|
// CHECK2-NEXT: br label [[IFCONT6]]
|
|
// CHECK2: ifcont6:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
|
|
// CHECK2-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK2-NEXT: br label [[PRECOND:%.*]]
|
|
// CHECK2: precond:
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2
|
|
// CHECK2-NEXT: br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
|
|
// CHECK2: body:
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK2: then:
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 4
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
|
|
// CHECK2-NEXT: store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
|
|
// CHECK2-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK2: else:
|
|
// CHECK2-NEXT: br label [[IFCONT]]
|
|
// CHECK2: ifcont:
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
|
|
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
|
|
// CHECK2: then2:
|
|
// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 4
|
|
// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
|
|
// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
|
|
// CHECK2-NEXT: [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP22]], i32* [[TMP21]], align 4
|
|
// CHECK2-NEXT: br label [[IFCONT4:%.*]]
|
|
// CHECK2: else3:
|
|
// CHECK2-NEXT: br label [[IFCONT4]]
|
|
// CHECK2: ifcont4:
|
|
// CHECK2-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
|
|
// CHECK2-NEXT: store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK2-NEXT: br label [[PRECOND]]
|
|
// CHECK2: exit:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
|
|
// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
|
|
// CHECK2-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
|
|
// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
|
|
// CHECK2-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
|
|
// CHECK2-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
|
|
// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
|
|
// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
|
|
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
|
|
// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
|
|
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK2: user_code.entry:
|
|
// CHECK2-NEXT: [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1
|
|
// CHECK2-NEXT:    [[C2:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i32 1)
|
|
// CHECK2-NEXT: store i8 [[TMP1]], i8* [[C2]], align 1
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = load float, float* [[CONV1]], align 4
|
|
// CHECK2-NEXT:    [[D3:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i32 4)
|
|
// CHECK2-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float*
|
|
// CHECK2-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP3]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR3]]
|
|
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[D3]], i32 4)
|
|
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[C2]], i32 1)
|
|
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
|
|
// CHECK2-NEXT: ret void
|
|
// CHECK2: worker.exit:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
|
|
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
|
|
// CHECK2-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
|
|
// CHECK2-NEXT:    [[C1:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i32 1)
|
|
// CHECK2-NEXT:    [[D2:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i32 4)
|
|
// CHECK2-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
|
|
// CHECK2-NEXT: store i8 0, i8* [[C1]], align 1
|
|
// CHECK2-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
|
|
// CHECK2-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32
|
|
// CHECK2-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
|
|
// CHECK2-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
|
|
// CHECK2-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4
|
|
// CHECK2-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
|
|
// CHECK2-NEXT: store float [[MUL]], float* [[D_ON_STACK]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: store i8* [[C1]], i8** [[TMP6]], align 4
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1
|
|
// CHECK2-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK2: .omp.reduction.then:
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1
|
|
// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP13]] to i32
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = load i8, i8* [[C1]], align 1
|
|
// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP14]] to i32
|
|
// CHECK2-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
|
|
// CHECK2-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
|
|
// CHECK2-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = load float, float* [[TMP1]], align 4
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4
|
|
// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]]
|
|
// CHECK2-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
|
|
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
|
|
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK2: .omp.reduction.done:
|
|
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[D2]], i32 4)
|
|
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[C1]], i32 1)
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16
|
|
// CHECK2-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP16]])
|
|
// CHECK2-NEXT: [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
|
|
// CHECK2-NEXT: store i8 [[TMP18]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
|
|
// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
|
|
// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK2-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
|
|
// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
|
|
// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to float*
|
|
// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[TMP24]], i32 1
|
|
// CHECK2-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
|
|
// CHECK2-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP24]] to i32*
|
|
// CHECK2-NEXT: [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
|
|
// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
|
|
// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK2-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
|
|
// CHECK2-NEXT: store i32 [[TMP32]], i32* [[TMP28]], align 4
|
|
// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
|
|
// CHECK2-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i32 1
|
|
// CHECK2-NEXT: [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP35]], i8** [[TMP23]], align 4
|
|
// CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK2-NEXT: [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
|
|
// CHECK2-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK2-NEXT: [[TMP41:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK2-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP41]], 0
|
|
// CHECK2-NEXT: [[TMP43:%.*]] = and i1 [[TMP40]], [[TMP42]]
|
|
// CHECK2-NEXT: [[TMP44:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK2-NEXT: [[TMP45:%.*]] = and i1 [[TMP43]], [[TMP44]]
|
|
// CHECK2-NEXT: [[TMP46:%.*]] = or i1 [[TMP36]], [[TMP39]]
|
|
// CHECK2-NEXT: [[TMP47:%.*]] = or i1 [[TMP46]], [[TMP45]]
|
|
// CHECK2-NEXT: br i1 [[TMP47]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK2: then:
|
|
// CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK2-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP48]], i8* [[TMP49]]) #[[ATTR3]]
|
|
// CHECK2-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK2: else:
|
|
// CHECK2-NEXT: br label [[IFCONT]]
|
|
// CHECK2: ifcont:
|
|
// CHECK2-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK2-NEXT: [[TMP51:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP52:%.*]] = and i1 [[TMP50]], [[TMP51]]
|
|
// CHECK2-NEXT: br i1 [[TMP52]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
|
|
// CHECK2: then5:
|
|
// CHECK2-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
|
|
// CHECK2-NEXT: [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP56:%.*]] = load i8*, i8** [[TMP55]], align 4
|
|
// CHECK2-NEXT: [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
|
|
// CHECK2-NEXT: store i8 [[TMP57]], i8* [[TMP56]], align 1
|
|
// CHECK2-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
|
|
// CHECK2-NEXT: [[TMP60:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP61:%.*]] = load i8*, i8** [[TMP60]], align 4
|
|
// CHECK2-NEXT: [[TMP62:%.*]] = bitcast i8* [[TMP59]] to float*
|
|
// CHECK2-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP61]] to float*
|
|
// CHECK2-NEXT: [[TMP64:%.*]] = load float, float* [[TMP62]], align 4
|
|
// CHECK2-NEXT: store float [[TMP64]], float* [[TMP63]], align 4
|
|
// CHECK2-NEXT: br label [[IFCONT7:%.*]]
|
|
// CHECK2: else6:
|
|
// CHECK2-NEXT: br label [[IFCONT7]]
|
|
// CHECK2: ifcont7:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK2: then:
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(3)* [[TMP10]] to i8 addrspace(3)*
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP9]], align 1
|
|
// CHECK2-NEXT: store volatile i8 [[TMP12]], i8 addrspace(3)* [[TMP11]], align 1
|
|
// CHECK2-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK2: else:
|
|
// CHECK2-NEXT: br label [[IFCONT]]
|
|
// CHECK2: ifcont:
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
|
|
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
|
|
// CHECK2: then2:
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(3)* [[TMP14]] to i8 addrspace(3)*
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
|
|
// CHECK2-NEXT: [[TMP18:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP15]], align 1
|
|
// CHECK2-NEXT: store i8 [[TMP18]], i8* [[TMP17]], align 1
|
|
// CHECK2-NEXT: br label [[IFCONT4:%.*]]
|
|
// CHECK2: else3:
|
|
// CHECK2-NEXT: br label [[IFCONT4]]
|
|
// CHECK2: ifcont4:
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK2-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK2: then6:
|
|
// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
|
|
// CHECK2-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i32*
|
|
// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
|
|
// CHECK2-NEXT: store volatile i32 [[TMP23]], i32 addrspace(3)* [[TMP22]], align 4
|
|
// CHECK2-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK2: else7:
|
|
// CHECK2-NEXT: br label [[IFCONT8]]
|
|
// CHECK2: ifcont8:
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP24]]
|
|
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
|
|
// CHECK2: then10:
|
|
// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP27:%.*]] = load i8*, i8** [[TMP26]], align 4
|
|
// CHECK2-NEXT: [[TMP28:%.*]] = bitcast i8* [[TMP27]] to i32*
|
|
// CHECK2-NEXT: [[TMP29:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP25]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 4
|
|
// CHECK2-NEXT: br label [[IFCONT12:%.*]]
|
|
// CHECK2: else11:
|
|
// CHECK2-NEXT: br label [[IFCONT12]]
|
|
// CHECK2: ifcont12:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
|
|
// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
|
|
// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
|
|
// CHECK2-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
|
|
// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
|
|
// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
|
|
// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
|
|
// CHECK2-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
|
|
// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
|
|
// CHECK2-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
|
|
// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
|
|
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true)
|
|
// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
|
|
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK2: user_code.entry:
|
|
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
|
|
// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK2-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
|
|
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
|
|
// CHECK2-NEXT: ret void
|
|
// CHECK2: worker.exit:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9
|
|
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
|
|
// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4
|
|
// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
|
|
// CHECK2-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK2: .omp.reduction.then:
|
|
// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
|
|
// CHECK2-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
|
|
// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
|
|
// CHECK2-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
|
|
// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
|
|
// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK2: cond.true:
|
|
// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK2-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK2: cond.false:
|
|
// CHECK2-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK2-NEXT: br label [[COND_END]]
|
|
// CHECK2: cond.end:
|
|
// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
|
|
// CHECK2-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
|
|
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
|
|
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK2: .omp.reduction.done:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10
|
|
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
|
|
// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4
|
|
// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
|
|
// CHECK2-NEXT: store i32 [[OR]], i32* [[A1]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
|
|
// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
|
|
// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK2: cond.true:
|
|
// CHECK2-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK2: cond.false:
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
|
|
// CHECK2-NEXT: br label [[COND_END]]
|
|
// CHECK2: cond.end:
|
|
// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
|
|
// CHECK2-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
|
|
// CHECK2-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
|
|
// CHECK2-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK2: .omp.reduction.then:
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK2-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
|
|
// CHECK2-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK2-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
|
|
// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
|
|
// CHECK2-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
|
|
// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
|
|
// CHECK2: cond.true9:
|
|
// CHECK2-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK2-NEXT: br label [[COND_END11:%.*]]
|
|
// CHECK2: cond.false10:
|
|
// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK2-NEXT: br label [[COND_END11]]
|
|
// CHECK2: cond.end11:
|
|
// CHECK2-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
|
|
// CHECK2-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
|
|
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
|
|
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK2: .omp.reduction.done:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK2-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
|
|
// CHECK2-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
|
|
// CHECK2-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
|
|
// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK2-NEXT: [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP11]], align 4
|
|
// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
|
|
// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
|
|
// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
|
|
// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
|
|
// CHECK2-NEXT: [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
|
|
// CHECK2-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
|
|
// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK2-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
|
|
// CHECK2-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
|
|
// CHECK2-NEXT: store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
|
|
// CHECK2-NEXT: [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
|
|
// CHECK2-NEXT: [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
|
|
// CHECK2-NEXT: [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP36]], i8** [[TMP24]], align 4
|
|
// CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK2-NEXT: [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
|
|
// CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK2-NEXT: [[TMP42:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK2-NEXT: [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
|
|
// CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
|
|
// CHECK2-NEXT: [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK2-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
|
|
// CHECK2-NEXT: [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
|
|
// CHECK2-NEXT: [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
|
|
// CHECK2-NEXT: br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK2: then:
|
|
// CHECK2-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK2-NEXT: [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
|
|
// CHECK2-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK2: else:
|
|
// CHECK2-NEXT: br label [[IFCONT]]
|
|
// CHECK2: ifcont:
|
|
// CHECK2-NEXT: [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK2-NEXT: [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
|
|
// CHECK2-NEXT: br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
|
|
// CHECK2: then5:
|
|
// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
|
|
// CHECK2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
|
|
// CHECK2-NEXT: [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
|
|
// CHECK2-NEXT: [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
|
|
// CHECK2-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP60]], i32* [[TMP59]], align 4
|
|
// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
|
|
// CHECK2-NEXT: [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
|
|
// CHECK2-NEXT: [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
|
|
// CHECK2-NEXT: [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
|
|
// CHECK2-NEXT: [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP67]], i16* [[TMP66]], align 2
|
|
// CHECK2-NEXT: br label [[IFCONT7:%.*]]
|
|
// CHECK2: else6:
|
|
// CHECK2-NEXT: br label [[IFCONT7]]
|
|
// CHECK2: ifcont7:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK2: then:
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK2-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK2-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK2: else:
|
|
// CHECK2-NEXT: br label [[IFCONT]]
|
|
// CHECK2: ifcont:
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
|
|
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
|
|
// CHECK2: then2:
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
|
|
// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
|
|
// CHECK2-NEXT: [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 4
|
|
// CHECK2-NEXT: br label [[IFCONT4:%.*]]
|
|
// CHECK2: else3:
|
|
// CHECK2-NEXT: br label [[IFCONT4]]
|
|
// CHECK2: ifcont4:
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK2-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK2: then6:
|
|
// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
|
|
// CHECK2-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
|
|
// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK2-NEXT: [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
|
|
// CHECK2-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
|
|
// CHECK2-NEXT: store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
|
|
// CHECK2-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK2: else7:
|
|
// CHECK2-NEXT: br label [[IFCONT8]]
|
|
// CHECK2: ifcont8:
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
|
|
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
|
|
// CHECK2: then10:
|
|
// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
|
|
// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
|
|
// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
|
|
// CHECK2-NEXT: [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP31]], i16* [[TMP30]], align 2
|
|
// CHECK2-NEXT: br label [[IFCONT12:%.*]]
|
|
// CHECK2: else11:
|
|
// CHECK2-NEXT: br label [[IFCONT12]]
|
|
// CHECK2: ifcont12:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK2-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
|
|
// CHECK2-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
|
|
// CHECK2-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
|
|
// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK2-NEXT: [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP11]], align 4
|
|
// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
|
|
// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
|
|
// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
|
|
// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
|
|
// CHECK2-NEXT: [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
|
|
// CHECK2-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
|
|
// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK2-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
|
|
// CHECK2-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
|
|
// CHECK2-NEXT: store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
|
|
// CHECK2-NEXT: [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
|
|
// CHECK2-NEXT: [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
|
|
// CHECK2-NEXT: [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP36]], i8** [[TMP24]], align 4
|
|
// CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK2-NEXT: [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
|
|
// CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK2-NEXT: [[TMP42:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK2-NEXT: [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
|
|
// CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
|
|
// CHECK2-NEXT: [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK2-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
|
|
// CHECK2-NEXT: [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
|
|
// CHECK2-NEXT: [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
|
|
// CHECK2-NEXT: br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK2: then:
|
|
// CHECK2-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK2-NEXT: [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
|
|
// CHECK2-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK2: else:
|
|
// CHECK2-NEXT: br label [[IFCONT]]
|
|
// CHECK2: ifcont:
|
|
// CHECK2-NEXT: [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK2-NEXT: [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
|
|
// CHECK2-NEXT: br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
|
|
// CHECK2: then5:
|
|
// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
|
|
// CHECK2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
|
|
// CHECK2-NEXT: [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
|
|
// CHECK2-NEXT: [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
|
|
// CHECK2-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP60]], i32* [[TMP59]], align 4
|
|
// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
|
|
// CHECK2-NEXT: [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
|
|
// CHECK2-NEXT: [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
|
|
// CHECK2-NEXT: [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
|
|
// CHECK2-NEXT: [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP67]], i16* [[TMP66]], align 2
|
|
// CHECK2-NEXT: br label [[IFCONT7:%.*]]
|
|
// CHECK2: else6:
|
|
// CHECK2-NEXT: br label [[IFCONT7]]
|
|
// CHECK2: ifcont7:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK2: then:
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK2-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK2-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK2: else:
|
|
// CHECK2-NEXT: br label [[IFCONT]]
|
|
// CHECK2: ifcont:
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
|
|
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
|
|
// CHECK2: then2:
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
|
|
// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
|
|
// CHECK2-NEXT: [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 4
|
|
// CHECK2-NEXT: br label [[IFCONT4:%.*]]
|
|
// CHECK2: else3:
|
|
// CHECK2-NEXT: br label [[IFCONT4]]
|
|
// CHECK2: ifcont4:
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK2-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK2: then6:
|
|
// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
|
|
// CHECK2-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
|
|
// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK2-NEXT: [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
|
|
// CHECK2-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
|
|
// CHECK2-NEXT: store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
|
|
// CHECK2-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK2: else7:
|
|
// CHECK2-NEXT: br label [[IFCONT8]]
|
|
// CHECK2: ifcont8:
|
|
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
|
|
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
|
|
// CHECK2: then10:
|
|
// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
|
|
// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
|
|
// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
|
|
// CHECK2-NEXT: [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP31]], i16* [[TMP30]], align 2
|
|
// CHECK2-NEXT: br label [[IFCONT12:%.*]]
|
|
// CHECK2: else11:
|
|
// CHECK2-NEXT: br label [[IFCONT12]]
|
|
// CHECK2: ifcont12:
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
|
|
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
|
|
// CHECK2-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
|
|
// CHECK2-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
|
|
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
|
|
// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
|
|
// CHECK2-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
|
|
// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK2-NEXT: entry:
|
|
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
|
|
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
|
|
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
|
|
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
|
|
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
|
|
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
|
|
// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
|
|
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
|
|
// CHECK2-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20
|
|
// CHECK3-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
|
|
// CHECK3-NEXT: [[E1:%.*]] = alloca double, align 8
|
|
// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
|
|
// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
|
|
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK3: user_code.entry:
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load double, double* [[TMP0]], align 8
|
|
// CHECK3-NEXT: store double [[TMP3]], double* [[E1]], align 8
|
|
// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E1]]) #[[ATTR3:[0-9]+]]
|
|
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
|
|
// CHECK3-NEXT: ret void
|
|
// CHECK3: worker.exit:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
|
|
// CHECK3-NEXT:    [[E1:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i32 8)
|
|
// CHECK3-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
|
|
// CHECK3-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8
|
|
// CHECK3-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8
|
|
// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
|
|
// CHECK3-NEXT: store double [[ADD]], double* [[E_ON_STACK]], align 8
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 2048, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
|
|
// CHECK3-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK3: .omp.reduction.then:
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = load double, double* [[TMP0]], align 8
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8
|
|
// CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]]
|
|
// CHECK3-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
|
|
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
|
|
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK3: .omp.reduction.done:
|
|
// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[E1]], i32 8)
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
|
|
// CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
|
|
// CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK3-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
|
|
// CHECK3-NEXT: [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
|
|
// CHECK3-NEXT: store i64 [[TMP20]], i64* [[TMP16]], align 8
|
|
// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
|
|
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
|
|
// CHECK3-NEXT: [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP11]], align 4
|
|
// CHECK3-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK3-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK3-NEXT: [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
|
|
// CHECK3-NEXT: [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK3-NEXT: [[TMP29:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK3-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
|
|
// CHECK3-NEXT: [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
|
|
// CHECK3-NEXT: [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK3-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
|
|
// CHECK3-NEXT: [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
|
|
// CHECK3-NEXT: [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
|
|
// CHECK3-NEXT: br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK3: then:
|
|
// CHECK3-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
|
|
// CHECK3-NEXT: [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR3]]
|
|
// CHECK3-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK3: else:
|
|
// CHECK3-NEXT: br label [[IFCONT]]
|
|
// CHECK3: ifcont:
|
|
// CHECK3-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK3-NEXT: [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
|
|
// CHECK3-NEXT: br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
|
|
// CHECK3: then4:
|
|
// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 4
|
|
// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 4
|
|
// CHECK3-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
|
|
// CHECK3-NEXT: [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
|
|
// CHECK3-NEXT: [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
|
|
// CHECK3-NEXT: store double [[TMP47]], double* [[TMP46]], align 8
|
|
// CHECK3-NEXT: br label [[IFCONT6:%.*]]
|
|
// CHECK3: else5:
|
|
// CHECK3-NEXT: br label [[IFCONT6]]
|
|
// CHECK3: ifcont6:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
|
|
// CHECK3-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK3-NEXT: br label [[PRECOND:%.*]]
|
|
// CHECK3: precond:
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2
|
|
// CHECK3-NEXT: br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
|
|
// CHECK3: body:
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK3: then:
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 4
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
|
|
// CHECK3-NEXT: store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
|
|
// CHECK3-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK3: else:
|
|
// CHECK3-NEXT: br label [[IFCONT]]
|
|
// CHECK3: ifcont:
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
|
|
// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
|
|
// CHECK3: then2:
|
|
// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 4
|
|
// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
|
|
// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
|
|
// CHECK3-NEXT: [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP22]], i32* [[TMP21]], align 4
|
|
// CHECK3-NEXT: br label [[IFCONT4:%.*]]
|
|
// CHECK3: else3:
|
|
// CHECK3-NEXT: br label [[IFCONT4]]
|
|
// CHECK3: ifcont4:
|
|
// CHECK3-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
|
|
// CHECK3-NEXT: store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4
|
|
// CHECK3-NEXT: br label [[PRECOND]]
|
|
// CHECK3: exit:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
|
|
// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
|
|
// CHECK3-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
|
|
// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
|
|
// CHECK3-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
|
|
// CHECK3-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
|
|
// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
|
|
// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
|
|
// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
|
|
// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
|
|
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK3: user_code.entry:
|
|
// CHECK3-NEXT: [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1
|
|
// CHECK3-NEXT:    [[C2:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i32 1)
|
|
// CHECK3-NEXT: store i8 [[TMP1]], i8* [[C2]], align 1
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = load float, float* [[CONV1]], align 4
|
|
// CHECK3-NEXT:    [[D3:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i32 4)
|
|
// CHECK3-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float*
|
|
// CHECK3-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP3]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR3]]
|
|
// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[D3]], i32 4)
|
|
// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[C2]], i32 1)
|
|
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
|
|
// CHECK3-NEXT: ret void
|
|
// CHECK3: worker.exit:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
|
|
// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
|
|
// CHECK3-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
|
|
// CHECK3-NEXT:    [[C1:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i32 1)
|
|
// CHECK3-NEXT:    [[D2:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i32 4)
|
|
// CHECK3-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
|
|
// CHECK3-NEXT: store i8 0, i8* [[C1]], align 1
|
|
// CHECK3-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
|
|
// CHECK3-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32
|
|
// CHECK3-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
|
|
// CHECK3-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
|
|
// CHECK3-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4
|
|
// CHECK3-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
|
|
// CHECK3-NEXT: store float [[MUL]], float* [[D_ON_STACK]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: store i8* [[C1]], i8** [[TMP6]], align 4
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 2048, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1
|
|
// CHECK3-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK3: .omp.reduction.then:
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1
|
|
// CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP13]] to i32
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = load i8, i8* [[C1]], align 1
|
|
// CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP14]] to i32
|
|
// CHECK3-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
|
|
// CHECK3-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
|
|
// CHECK3-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = load float, float* [[TMP1]], align 4
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4
|
|
// CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]]
|
|
// CHECK3-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
|
|
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
|
|
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK3: .omp.reduction.done:
|
|
// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[D2]], i32 4)
|
|
// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[C1]], i32 1)
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16
|
|
// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP16]])
|
|
// CHECK3-NEXT: [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
|
|
// CHECK3-NEXT: store i8 [[TMP18]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
|
|
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
|
|
// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK3-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
|
|
// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
|
|
// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to float*
|
|
// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[TMP24]], i32 1
|
|
// CHECK3-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
|
|
// CHECK3-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP24]] to i32*
|
|
// CHECK3-NEXT: [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
|
|
// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
|
|
// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK3-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
|
|
// CHECK3-NEXT: store i32 [[TMP32]], i32* [[TMP28]], align 4
|
|
// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
|
|
// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i32 1
|
|
// CHECK3-NEXT: [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP35]], i8** [[TMP23]], align 4
|
|
// CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK3-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK3-NEXT: [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
|
|
// CHECK3-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK3-NEXT: [[TMP41:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK3-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP41]], 0
|
|
// CHECK3-NEXT: [[TMP43:%.*]] = and i1 [[TMP40]], [[TMP42]]
|
|
// CHECK3-NEXT: [[TMP44:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK3-NEXT: [[TMP45:%.*]] = and i1 [[TMP43]], [[TMP44]]
|
|
// CHECK3-NEXT: [[TMP46:%.*]] = or i1 [[TMP36]], [[TMP39]]
|
|
// CHECK3-NEXT: [[TMP47:%.*]] = or i1 [[TMP46]], [[TMP45]]
|
|
// CHECK3-NEXT: br i1 [[TMP47]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK3: then:
|
|
// CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK3-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP48]], i8* [[TMP49]]) #[[ATTR3]]
|
|
// CHECK3-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK3: else:
|
|
// CHECK3-NEXT: br label [[IFCONT]]
|
|
// CHECK3: ifcont:
|
|
// CHECK3-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK3-NEXT: [[TMP51:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP52:%.*]] = and i1 [[TMP50]], [[TMP51]]
|
|
// CHECK3-NEXT: br i1 [[TMP52]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
|
|
// CHECK3: then5:
|
|
// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
|
|
// CHECK3-NEXT: [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP56:%.*]] = load i8*, i8** [[TMP55]], align 4
|
|
// CHECK3-NEXT: [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
|
|
// CHECK3-NEXT: store i8 [[TMP57]], i8* [[TMP56]], align 1
|
|
// CHECK3-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
|
|
// CHECK3-NEXT: [[TMP60:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP61:%.*]] = load i8*, i8** [[TMP60]], align 4
|
|
// CHECK3-NEXT: [[TMP62:%.*]] = bitcast i8* [[TMP59]] to float*
|
|
// CHECK3-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP61]] to float*
|
|
// CHECK3-NEXT: [[TMP64:%.*]] = load float, float* [[TMP62]], align 4
|
|
// CHECK3-NEXT: store float [[TMP64]], float* [[TMP63]], align 4
|
|
// CHECK3-NEXT: br label [[IFCONT7:%.*]]
|
|
// CHECK3: else6:
|
|
// CHECK3-NEXT: br label [[IFCONT7]]
|
|
// CHECK3: ifcont7:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK3: then:
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(3)* [[TMP10]] to i8 addrspace(3)*
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP9]], align 1
|
|
// CHECK3-NEXT: store volatile i8 [[TMP12]], i8 addrspace(3)* [[TMP11]], align 1
|
|
// CHECK3-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK3: else:
|
|
// CHECK3-NEXT: br label [[IFCONT]]
|
|
// CHECK3: ifcont:
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
|
|
// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
|
|
// CHECK3: then2:
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(3)* [[TMP14]] to i8 addrspace(3)*
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
|
|
// CHECK3-NEXT: [[TMP18:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP15]], align 1
|
|
// CHECK3-NEXT: store i8 [[TMP18]], i8* [[TMP17]], align 1
|
|
// CHECK3-NEXT: br label [[IFCONT4:%.*]]
|
|
// CHECK3: else3:
|
|
// CHECK3-NEXT: br label [[IFCONT4]]
|
|
// CHECK3: ifcont4:
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK3-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK3: then6:
|
|
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
|
|
// CHECK3-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i32*
|
|
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
|
|
// CHECK3-NEXT: store volatile i32 [[TMP23]], i32 addrspace(3)* [[TMP22]], align 4
|
|
// CHECK3-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK3: else7:
|
|
// CHECK3-NEXT: br label [[IFCONT8]]
|
|
// CHECK3: ifcont8:
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP24]]
|
|
// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
|
|
// CHECK3: then10:
|
|
// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP27:%.*]] = load i8*, i8** [[TMP26]], align 4
|
|
// CHECK3-NEXT: [[TMP28:%.*]] = bitcast i8* [[TMP27]] to i32*
|
|
// CHECK3-NEXT: [[TMP29:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP25]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 4
|
|
// CHECK3-NEXT: br label [[IFCONT12:%.*]]
|
|
// CHECK3: else11:
|
|
// CHECK3-NEXT: br label [[IFCONT12]]
|
|
// CHECK3: ifcont12:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
|
|
// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
|
|
// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
|
|
// CHECK3-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
|
|
// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
|
|
// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
|
|
// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
|
|
// CHECK3-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
|
|
// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
|
|
// CHECK3-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
|
|
// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
|
|
// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true)
|
|
// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
|
|
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
// CHECK3: user_code.entry:
|
|
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
|
|
// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
|
|
// CHECK3-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
|
|
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
|
|
// CHECK3-NEXT: ret void
|
|
// CHECK3: worker.exit:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__9
|
|
// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
|
|
// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4
|
|
// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
|
|
// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
|
|
// CHECK3-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK3: .omp.reduction.then:
|
|
// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
|
|
// CHECK3-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
|
|
// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
|
|
// CHECK3-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
|
|
// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
|
|
// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK3: cond.true:
|
|
// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK3-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK3: cond.false:
|
|
// CHECK3-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK3-NEXT: br label [[COND_END]]
|
|
// CHECK3: cond.end:
|
|
// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
|
|
// CHECK3-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
|
|
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
|
|
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK3: .omp.reduction.done:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__10
|
|
// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
|
|
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
|
|
// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
|
|
// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
|
|
// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4
|
|
// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
|
|
// CHECK3-NEXT: store i32 [[OR]], i32* [[A1]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
|
|
// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
|
|
// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK3: cond.true:
|
|
// CHECK3-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK3: cond.false:
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
|
|
// CHECK3-NEXT: br label [[COND_END]]
|
|
// CHECK3: cond.end:
|
|
// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
|
|
// CHECK3-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
|
|
// CHECK3-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
|
|
// CHECK3-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
|
|
// CHECK3: .omp.reduction.then:
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
|
|
// CHECK3-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
|
|
// CHECK3-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK3-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
|
|
// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
|
|
// CHECK3-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
|
|
// CHECK3-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
|
|
// CHECK3: cond.true9:
|
|
// CHECK3-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
|
|
// CHECK3-NEXT: br label [[COND_END11:%.*]]
|
|
// CHECK3: cond.false10:
|
|
// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
|
|
// CHECK3-NEXT: br label [[COND_END11]]
|
|
// CHECK3: cond.end11:
|
|
// CHECK3-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
|
|
// CHECK3-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
|
|
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
|
|
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
|
|
// CHECK3: .omp.reduction.done:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK3-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
|
|
// CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
|
|
// CHECK3-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
|
|
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK3-NEXT: [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP11]], align 4
|
|
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
|
|
// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
|
|
// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
|
|
// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
|
|
// CHECK3-NEXT: [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
|
|
// CHECK3-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
|
|
// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK3-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
|
|
// CHECK3-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
|
|
// CHECK3-NEXT: store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
|
|
// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
|
|
// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
|
|
// CHECK3-NEXT: [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP36]], i8** [[TMP24]], align 4
|
|
// CHECK3-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK3-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK3-NEXT: [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
|
|
// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK3-NEXT: [[TMP42:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK3-NEXT: [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
|
|
// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
|
|
// CHECK3-NEXT: [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK3-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
|
|
// CHECK3-NEXT: [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
|
|
// CHECK3-NEXT: [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
|
|
// CHECK3-NEXT: br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK3: then:
|
|
// CHECK3-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK3-NEXT: [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
|
|
// CHECK3-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK3: else:
|
|
// CHECK3-NEXT: br label [[IFCONT]]
|
|
// CHECK3: ifcont:
|
|
// CHECK3-NEXT: [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK3-NEXT: [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
|
|
// CHECK3-NEXT: br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
|
|
// CHECK3: then5:
|
|
// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
|
|
// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
|
|
// CHECK3-NEXT: [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
|
|
// CHECK3-NEXT: [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
|
|
// CHECK3-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP60]], i32* [[TMP59]], align 4
|
|
// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
|
|
// CHECK3-NEXT: [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
|
|
// CHECK3-NEXT: [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
|
|
// CHECK3-NEXT: [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
|
|
// CHECK3-NEXT: [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP67]], i16* [[TMP66]], align 2
|
|
// CHECK3-NEXT: br label [[IFCONT7:%.*]]
|
|
// CHECK3: else6:
|
|
// CHECK3-NEXT: br label [[IFCONT7]]
|
|
// CHECK3: ifcont7:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK3: then:
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK3-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK3-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK3: else:
|
|
// CHECK3-NEXT: br label [[IFCONT]]
|
|
// CHECK3: ifcont:
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
|
|
// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
|
|
// CHECK3: then2:
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
|
|
// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
|
|
// CHECK3-NEXT: [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 4
|
|
// CHECK3-NEXT: br label [[IFCONT4:%.*]]
|
|
// CHECK3: else3:
|
|
// CHECK3-NEXT: br label [[IFCONT4]]
|
|
// CHECK3: ifcont4:
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK3-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK3: then6:
|
|
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
|
|
// CHECK3-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
|
|
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK3-NEXT: [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
|
|
// CHECK3-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
|
|
// CHECK3-NEXT: store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
|
|
// CHECK3-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK3: else7:
|
|
// CHECK3-NEXT: br label [[IFCONT8]]
|
|
// CHECK3: ifcont8:
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
|
|
// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
|
|
// CHECK3: then10:
|
|
// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
|
|
// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
|
|
// CHECK3-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
|
|
// CHECK3-NEXT: [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP31]], i16* [[TMP30]], align 2
|
|
// CHECK3-NEXT: br label [[IFCONT12:%.*]]
|
|
// CHECK3: else11:
|
|
// CHECK3-NEXT: br label [[IFCONT12]]
|
|
// CHECK3: ifcont12:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK3-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
|
|
// CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
|
|
// CHECK3-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
|
|
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
|
|
// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
|
|
// CHECK3-NEXT: [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP11]], align 4
|
|
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
|
|
// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
|
|
// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
|
|
// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
|
|
// CHECK3-NEXT: [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
|
|
// CHECK3-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
|
|
// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
|
|
// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
|
|
// CHECK3-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
|
|
// CHECK3-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
|
|
// CHECK3-NEXT: store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
|
|
// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
|
|
// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
|
|
// CHECK3-NEXT: [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP36]], i8** [[TMP24]], align 4
|
|
// CHECK3-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
|
|
// CHECK3-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK3-NEXT: [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
|
|
// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
|
|
// CHECK3-NEXT: [[TMP42:%.*]] = and i16 [[TMP6]], 1
|
|
// CHECK3-NEXT: [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
|
|
// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
|
|
// CHECK3-NEXT: [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
|
|
// CHECK3-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
|
|
// CHECK3-NEXT: [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
|
|
// CHECK3-NEXT: [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
|
|
// CHECK3-NEXT: br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK3: then:
|
|
// CHECK3-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
|
|
// CHECK3-NEXT: [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
|
|
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
|
|
// CHECK3-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK3: else:
|
|
// CHECK3-NEXT: br label [[IFCONT]]
|
|
// CHECK3: ifcont:
|
|
// CHECK3-NEXT: [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
|
|
// CHECK3-NEXT: [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
|
|
// CHECK3-NEXT: br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
|
|
// CHECK3: then5:
|
|
// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
|
|
// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
|
|
// CHECK3-NEXT: [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
|
|
// CHECK3-NEXT: [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
|
|
// CHECK3-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP60]], i32* [[TMP59]], align 4
|
|
// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
|
|
// CHECK3-NEXT: [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
|
|
// CHECK3-NEXT: [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
|
|
// CHECK3-NEXT: [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
|
|
// CHECK3-NEXT: [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP67]], i16* [[TMP66]], align 2
|
|
// CHECK3-NEXT: br label [[IFCONT7:%.*]]
|
|
// CHECK3: else6:
|
|
// CHECK3-NEXT: br label [[IFCONT7]]
|
|
// CHECK3: ifcont7:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
|
|
// CHECK3: then:
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK3-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
|
|
// CHECK3-NEXT: br label [[IFCONT:%.*]]
|
|
// CHECK3: else:
|
|
// CHECK3-NEXT: br label [[IFCONT]]
|
|
// CHECK3: ifcont:
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
|
|
// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
|
|
// CHECK3: then2:
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
|
|
// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
|
|
// CHECK3-NEXT: [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 4
|
|
// CHECK3-NEXT: br label [[IFCONT4:%.*]]
|
|
// CHECK3: else3:
|
|
// CHECK3-NEXT: br label [[IFCONT4]]
|
|
// CHECK3: ifcont4:
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
|
|
// CHECK3-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
|
|
// CHECK3: then6:
|
|
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
|
|
// CHECK3-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
|
|
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
|
|
// CHECK3-NEXT: [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
|
|
// CHECK3-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
|
|
// CHECK3-NEXT: store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
|
|
// CHECK3-NEXT: br label [[IFCONT8:%.*]]
|
|
// CHECK3: else7:
|
|
// CHECK3-NEXT: br label [[IFCONT8]]
|
|
// CHECK3: ifcont8:
|
|
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
|
|
// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
|
|
// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
|
|
// CHECK3: then10:
|
|
// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
|
|
// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
|
|
// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
|
|
// CHECK3-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
|
|
// CHECK3-NEXT: [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP31]], i16* [[TMP30]], align 2
|
|
// CHECK3-NEXT: br label [[IFCONT12:%.*]]
|
|
// CHECK3: else11:
|
|
// CHECK3-NEXT: br label [[IFCONT12]]
|
|
// CHECK3: ifcont12:
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
|
|
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
|
|
// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
|
|
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
|
|
// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
|
|
// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
|
|
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
|
|
// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
|
|
// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
|
|
// CHECK3-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
|
|
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR2]] {
|
|
// CHECK3-NEXT: entry:
|
|
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
|
|
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
|
|
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
|
|
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
|
|
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
|
|
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
|
|
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
|
|
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
|
|
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
|
|
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
|
|
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
|
|
// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
|
|
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
|
|
// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
|
|
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
|
|
// CHECK3-NEXT: ret void
|
|
//
|