Previously, we tracked the size of the teams reduction buffer so that it could be allocated at runtime for each kernel launch. This patch splits that number into two parts: the size of the reduction data (i.e., all reduction variables) and the (maximal) length of the buffer. This allows us to allocate less when we need less, e.g., when we have fewer teams than the maximal length. It also allows us to move code from Clang's codegen into the runtime, since we now know how large the reduction data is.
169 lines
9.1 KiB
LLVM
169 lines
9.1 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
|
|
; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
|
|
|
|
; void foo(double x) {
|
|
; #pragma omp target map(from:x)
|
|
; {
|
|
; x = sin(M_PI);
|
|
; #pragma omp parallel
|
|
; { }
|
|
; }
|
|
; }
|
|
|
|
; Module preamble: target, aggregate types, and the globals the kernel uses.
target triple = "nvptx64"
|
|
|
|
; Aggregate types used by the globals below.
%struct.ident_t = type { i32, i32, i32, i32, ptr }
|
|
%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr }
|
|
; NOTE(review): the third i8 is presumably the execution mode and the two
; trailing i32 fields presumably the reduction data size and the reduction
; buffer length -- confirm against the OpenMP device runtime's definition.
%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
|
|
|
|
; Location string referenced by the ident_t constant @1.
@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
|
|
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @0 }, align 8
|
|
; Kernel environment handed to __kmpc_target_init by the kernel below. The
; third i8 of the configuration is 1 in this input; the global assertions
; further down expect openmp-opt to rewrite it to 3.
@__omp_offloading_fd02_404433c2_main_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
|
|
|
|
|
|
; Function Attrs: alwaysinline convergent norecurse nounwind
|
|
;.
|
|
; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
|
|
; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
|
|
; CHECK: @[[__OMP_OFFLOADING_FD02_404433C2_MAIN_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
|
|
; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
|
|
;.
|
|
; Kernel for the target region of foo() sketched in the header comment.
; Input IR: the result of sin(M_PI) is stored to %x unconditionally, then the
; (empty) parallel region is launched through __kmpc_parallel_51.
; Expected output, per the assertions that follow: the store is moved into a
; region entered only when __kmpc_get_hardware_thread_id_in_block returns 0,
; that region is followed by __kmpc_barrier_simple_spmd, and only afterwards
; is __kmpc_parallel_51 called.
define weak void @__omp_offloading_fd02_404433c2_main_l5(ptr %dyn, ptr nonnull align 8 dereferenceable(8) %x) local_unnamed_addr #0 {
|
|
; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd02_404433c2_main_l5
|
|
; CHECK-SAME: (ptr [[DYN:%.*]], ptr nonnull align 8 dereferenceable(8) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
|
|
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr nonnull @__omp_offloading_fd02_404433c2_main_l5_kernel_environment, ptr [[DYN]]) #[[ATTR3:[0-9]+]]
|
|
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
|
|
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
|
|
; CHECK: common.ret:
|
|
; CHECK-NEXT: ret void
|
|
; CHECK: user_code.entry:
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR3]]
|
|
; CHECK-NEXT: [[CALL_I:%.*]] = call double @__nv_sin(double 0x400921FB54442D18) #[[ATTR7:[0-9]+]]
|
|
; CHECK-NEXT: br label [[REGION_CHECK_TID:%.*]]
|
|
; CHECK: region.check.tid:
|
|
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0
|
|
; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
|
|
; CHECK: region.guarded:
|
|
; CHECK-NEXT: store double [[CALL_I]], ptr [[X]], align 8, !tbaa [[TBAA8:![0-9]+]]
|
|
; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]]
|
|
; CHECK: region.guarded.end:
|
|
; CHECK-NEXT: br label [[REGION_BARRIER]]
|
|
; CHECK: region.barrier:
|
|
; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP2]])
|
|
; CHECK-NEXT: br label [[REGION_EXIT:%.*]]
|
|
; CHECK: region.exit:
|
|
; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr @__omp_outlined___wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS]], i64 0) #[[ATTR3]]
|
|
; CHECK-NEXT: call void @__kmpc_target_deinit() #[[ATTR3]]
|
|
; CHECK-NEXT: br label [[COMMON_RET]]
|
|
;
|
|
entry:
|
|
; Zero-length array: the parallel region captures no variables.
%captured_vars_addrs = alloca [0 x ptr], align 8
|
|
; Control continues into user_code.entry only when __kmpc_target_init returns
; -1; for any other result the kernel returns right away via common.ret.
%0 = call i32 @__kmpc_target_init(ptr nonnull @__omp_offloading_fd02_404433c2_main_l5_kernel_environment, ptr %dyn) #3
|
|
%exec_user_code = icmp eq i32 %0, -1
|
|
br i1 %exec_user_code, label %user_code.entry, label %common.ret
|
|
|
|
|
common.ret: ; preds = %entry, %user_code.entry
|
|
ret void
|
|
|
|
|
user_code.entry: ; preds = %entry
|
|
%1 = call i32 @__kmpc_global_thread_num(ptr nonnull @1)
|
|
; 0x400921FB54442D18 is the double-precision encoding of M_PI. Attribute
; group #6 attaches the "ompx_spmd_amenable" assumption to this call site.
%call.i = call double @__nv_sin(double 0x400921FB54442D18) #6
|
|
; This store is the side effect that the assertions expect to end up inside
; the thread-0-only guarded region.
store double %call.i, ptr %x, align 8, !tbaa !8
|
|
; Launches the parallel region: outlined function, its wrapper, and the
; (empty) captured-variables array with a count of 0.
call void @__kmpc_parallel_51(ptr nonnull @1, i32 %1, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr @__omp_outlined___wrapper, ptr nonnull %captured_vars_addrs, i64 0) #3
|
|
call void @__kmpc_target_deinit() #3
|
|
br label %common.ret
|
|
}
|
|
|
|
; External runtime function called at kernel entry; the kernel passes it the
; kernel environment global and its %dyn argument and compares the i32 result
; against -1.
declare i32 @__kmpc_target_init(ptr, ptr) local_unnamed_addr
|
|
|
|
; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind readnone willreturn
|
|
; Function handed to __kmpc_parallel_51 by the kernel (presumably the outlined
; body of the empty `#pragma omp parallel` region). It ignores both arguments
; and returns immediately; the assertions expect its body to stay unchanged.
define internal void @__omp_outlined__(ptr noalias nocapture %.global_tid., ptr noalias nocapture %.bound_tid.) #1 {
|
|
; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
|
|
; CHECK-SAME: (ptr noalias nocapture [[DOTGLOBAL_TID_:%.*]], ptr noalias nocapture [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
ret void
|
|
}
|
|
|
|
; Function Attrs: norecurse nounwind
|
|
; Wrapper passed to __kmpc_parallel_51 next to @__omp_outlined__. In this
; reduced test it only asks __kmpc_get_shared_variables to fill a local pointer
; slot and then returns; it never calls @__omp_outlined__ and ignores both of
; its arguments. The assertions expect the body to stay unchanged.
define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #2 {
|
|
; CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
|
|
; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
|
|
; CHECK-NEXT: call void @__kmpc_get_shared_variables(ptr nonnull [[GLOBAL_ARGS]]) #[[ATTR3]]
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
; Stack slot that receives the shared-variables pointer; it is never read.
%global_args = alloca ptr, align 8
|
|
call void @__kmpc_get_shared_variables(ptr nonnull %global_args) #3
|
|
ret void
|
|
}
|
|
|
|
; Declarations of the remaining external functions called by the kernel and
; by the wrapper. None of them has a body in this module.
declare void @__kmpc_get_shared_variables(ptr) local_unnamed_addr
|
|
|
|
; Function Attrs: nounwind
|
|
declare i32 @__kmpc_global_thread_num(ptr) local_unnamed_addr #3
|
|
|
|
; Function Attrs: alwaysinline
|
|
declare void @__kmpc_parallel_51(ptr, i32, i32, i32, i32, ptr, ptr, ptr, i64) local_unnamed_addr #4
|
|
|
|
declare void @__kmpc_target_deinit() local_unnamed_addr
|
|
|
|
; The declaration itself is only `convergent` (#5); the SPMD-amenable
; assumption is attached at the call site in the kernel, not here.
; Function Attrs: convergent
|
|
declare double @__nv_sin(double) local_unnamed_addr #5
|
|
|
|
; Attribute groups referenced above.
; #0: the kernel definition (carries the "kernel" string attribute).
attributes #0 = { alwaysinline convergent norecurse nounwind "kernel" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
|
|
; #1: @__omp_outlined__. Written with the older `readnone` spelling; the
; attribute assertions at the end of the file expect it printed as memory(none).
attributes #1 = { alwaysinline mustprogress nofree norecurse nosync nounwind readnone willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
|
|
; #2: @__omp_outlined___wrapper.
attributes #2 = { norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
|
|
; #3: used on @__kmpc_global_thread_num and on most runtime call sites.
attributes #3 = { nounwind }
|
|
; #4: @__kmpc_parallel_51.
attributes #4 = { alwaysinline }
|
|
; #5: the @__nv_sin declaration.
attributes #5 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
|
|
; #6: the @__nv_sin call site in the kernel; supplies the
; "ompx_spmd_amenable" assumption.
attributes #6 = { convergent nounwind "llvm.assume"="ompx_spmd_amenable" }
|
|
|
|
; Named metadata: offload entry info, NVVM kernel annotation, module flags,
; and the producer string.
!omp_offload.info = !{!0}
|
|
!nvvm.annotations = !{!1}
|
|
!llvm.module.flags = !{!2, !3, !4, !5, !6}
|
|
!llvm.ident = !{!7}
|
|
|
|
; 64770 == 0xfd02 and 1078211522 == 0x404433c2, i.e. the two hex components of
; the kernel's symbol name; "main" and 5 match its `main_l5` suffix.
!0 = !{i32 0, i32 64770, i32 1078211522, !"main", i32 5, i32 0}
|
|
; Marks the function as a kernel in nvvm.annotations.
!1 = !{ptr @__omp_offloading_fd02_404433c2_main_l5, !"kernel", i32 1}
|
|
!2 = !{i32 1, !"wchar_size", i32 4}
|
|
; The "openmp" and "openmp-device" flags are both set to 50.
!3 = !{i32 7, !"openmp", i32 50}
|
|
!4 = !{i32 7, !"openmp-device", i32 50}
|
|
!5 = !{i32 8, !"PIC Level", i32 2}
|
|
!6 = !{i32 7, !"frame-pointer", i32 2}
|
|
!7 = !{!"clang version 14.0.0"}
|
|
; !8-!11: TBAA access tag (and its type chain) used by the `double` store in
; the kernel.
!8 = !{!9, !9, i64 0}
|
|
!9 = !{!"double", !10, i64 0}
|
|
!10 = !{!"omnipotent char", !11, i64 0}
|
|
!11 = !{!"Simple C/C++ TBAA"}
|
|
;.
|
|
; CHECK: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "frame-pointer"="all" "kernel" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
|
|
; CHECK: attributes #[[ATTR1]] = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(none) "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
|
|
; CHECK: attributes #[[ATTR2]] = { norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
|
|
; CHECK: attributes #[[ATTR3]] = { nounwind }
|
|
; CHECK: attributes #[[ATTR4:[0-9]+]] = { alwaysinline }
|
|
; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
|
|
; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nounwind }
|
|
; CHECK: attributes #[[ATTR7]] = { convergent nounwind "llvm.assume"="ompx_spmd_amenable" }
|
|
;.
|
|
; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 1078211522, !"main", i32 5, i32 0}
|
|
; CHECK: [[META1:![0-9]+]] = !{ptr @__omp_offloading_fd02_404433c2_main_l5, !"kernel", i32 1}
|
|
; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
|
|
; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50}
|
|
; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
|
|
; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
|
|
; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
|
|
; CHECK: [[META7:![0-9]+]] = !{!"clang version 14.0.0"}
|
|
; CHECK: [[TBAA8]] = !{!9, !9, i64 0}
|
|
; CHECK: [[META9:![0-9]+]] = !{!"double", !10, i64 0}
|
|
; CHECK: [[META10:![0-9]+]] = !{!"omnipotent char", !11, i64 0}
|
|
; CHECK: [[META11:![0-9]+]] = !{!"Simple C/C++ TBAA"}
|
|
;.
|