Previously, we tracked the size of the teams reduction buffer in order to allocate it at runtime per kernel launch. This patch splits that number into two parts: the size of the reduction data (i.e., all reduction variables) and the (maximal) length of the buffer. This will allow us to allocate less if we need less, e.g., if we have fewer teams than the maximal length. It also allows us to move code from Clang's codegen into the runtime, as we now know how large the reduction data is.
108 lines
5.4 KiB
LLVM
108 lines
5.4 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals
; RUN: opt < %s -S -passes=openmp-opt -openmp-opt-inline-device | FileCheck %s

; Record type of @1 below: four i32 fields followed by a pointer.
%struct.ident_t = type { i32, i32, i32, i32, ptr }
; Type of @kernel_environment: a configuration record followed by two pointers.
%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr }
; Configuration record: three i8 fields followed by six i32 fields.
; NOTE(review): the meaning of the individual fields is not stated in this
; file; presumably it mirrors the OpenMP device runtime's struct - confirm.
%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
; Source-location string referenced by @1.
@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @0 }, align 8
; Externally defined byte; the kernel below stores the result of
; __kmpc_is_spmd_exec_mode() into it so the folded value is observable.
@G = external global i8

; Kernel environment passed to __kmpc_target_init. The input configuration
; starts with i8 1, i8 0, i8 1; the global assertions further down expect the
; pass to rewrite those three fields to i8 0, i8 0, i8 3.
@kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }

; Function Attrs: convergent norecurse nounwind
;.
; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8
; CHECK: @[[KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
;.
; Kernel under test. It calls __kmpc_target_init, and when that returns -1 it
; stores the result of __kmpc_is_spmd_exec_mode() to @G, calls @bar and then
; __kmpc_target_deinit; otherwise it returns immediately via worker.exit.
; The assertions expect the pass to add an early exit for every thread whose
; hardware thread id in the block is non-zero, to replace the runtime query
; with a constant store, and to drop the call to @bar.
define weak void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 {
; CHECK: Function Attrs: norecurse nounwind
; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @kernel_environment, ptr [[DYN:%.*]])
; CHECK-NEXT: [[THREAD_ID_IN_BLOCK:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
; CHECK-NEXT: [[THREAD_IS_MAIN:%.*]] = icmp ne i32 [[THREAD_ID_IN_BLOCK]], 0
; CHECK-NEXT: br i1 [[THREAD_IS_MAIN]], label [[EXIT_THREADS:%.*]], label [[MAIN_THREAD_USER_CODE:%.*]]
; CHECK: exit.threads:
; CHECK-NEXT: ret void
; CHECK: main.thread.user_code:
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; CHECK: user_code.entry:
; CHECK-NEXT: store i8 1, ptr @G, align 1
; CHECK-NEXT: call void @__kmpc_target_deinit()
; CHECK-NEXT: ret void
; CHECK: worker.exit:
; CHECK-NEXT: ret void
;
entry:
  %0 = call i32 @__kmpc_target_init(ptr @kernel_environment, ptr %dyn)
  ; Only a return value of -1 leads into the user code.
  %exec_user_code = icmp eq i32 %0, -1
  br i1 %exec_user_code, label %user_code.entry, label %worker.exit

user_code.entry: ; preds = %entry
; Ensure we see a 0 here as the kernel doesn't have parallel regions and we want
; generic execution.
; TODO: This is not perfect. We should rather go for SPMD mode and tell the runtime
; to only spawn a single thread. Further, we then should not guard any code.
; NOTE(review): the assertions above expect `store i8 1` to @G (and a rewritten
; kernel environment), not a 0, so the "Ensure we see a 0" remark looks stale -
; confirm and update the wording.
  %isSPMD = call i8 @__kmpc_is_spmd_exec_mode()
  store i8 %isSPMD, ptr @G
  call void @bar() #2
  call void @__kmpc_target_deinit()
  ret void

worker.exit: ; preds = %entry
  ret void
}

; Runtime entry points called by the kernel above; only their signatures are
; declared in this module.
declare i8 @__kmpc_is_spmd_exec_mode()

declare i32 @__kmpc_target_init(ptr, ptr)

declare void @__kmpc_target_deinit()

; Empty helper called from the kernel. The assertions expect it to gain the
; alwaysinline attribute while its body stays a bare return.
; Function Attrs: convergent nounwind
define hidden void @bar() #1 {
; CHECK: Function Attrs: alwaysinline convergent nounwind
; CHECK-LABEL: @bar(
; CHECK-NEXT: entry:
; CHECK-NEXT: ret void
;
entry:
  ret void
}

; #0 is attached to the kernel, #1 to @bar, and #2 to the call site of @bar
; inside the kernel.
attributes #0 = { convergent norecurse nounwind "kernel" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx32,+sm_70" }
attributes #1 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx32,+sm_70" }
attributes #2 = { convergent }

; Named metadata: offload entry info, the kernel annotation, module flags and
; the producer identification string.
!omp_offload.info = !{!0}
!nvvm.annotations = !{!1}
!llvm.module.flags = !{!2, !3, !4, !5, !6}
!llvm.ident = !{!7}

; !0 describes the offload entry "foo"; !1 marks the function above as a kernel.
!0 = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0}
!1 = !{ptr @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1}
; Module flags !2-!6; the trailing assertions expect them to be printed back
; unchanged.
!2 = !{i32 1, !"wchar_size", i32 4}
!3 = !{i32 7, !"openmp", i32 50}
!4 = !{i32 7, !"openmp-device", i32 50}
!5 = !{i32 7, !"PIC Level", i32 2}
!6 = !{i32 7, !"frame-pointer", i32 2}
!7 = !{!"clang version 14.0.0"}
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { norecurse nounwind "frame-pointer"="all" "kernel" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx32,+sm_70" }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { alwaysinline convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx32,+sm_70" }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind }
;.
; Expected metadata: every node mirrors the corresponding input definition
; (!0-!7) verbatim, including the merge behavior (i32 7) of the "PIC Level"
; module flag.
; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0}
; CHECK: [[META1:![0-9]+]] = !{ptr @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1}
; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; CHECK: [[META5:![0-9]+]] = !{i32 7, !"PIC Level", i32 2}
; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; CHECK: [[META7:![0-9]+]] = !{!"clang version 14.0.0"}
;.