Previously, we tracked the size of the teams reduction buffer in order to allocate it at runtime per kernel launch. This patch splits that number into two parts: the size of the reduction data (i.e., all reduction variables) and the (maximal) length of the buffer. This will allow us to allocate less if we need less, e.g., if we have fewer teams than the maximal length. It also allows us to move code from Clang's codegen into the runtime, as we now know how large the reduction data is.
67 lines
3.1 KiB
LLVM
67 lines
3.1 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes
|
|
; RUN: opt -passes=openmp-opt-cgscc -S < %s | FileCheck %s
|
|
|
|
; Target configuration for this module: the nvptx64 triple and its data layout.
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
|
target triple = "nvptx64"
|
|
|
|
; Aggregate types used by the global constants below.
; %struct.ident_t is four i32 fields followed by a pointer.
%struct.ident_t = type { i32, i32, i32, i32, ptr }
|
|
; A kernel environment is a configuration record followed by two pointers.
%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr }
|
|
; The configuration record is three i8 fields followed by six i32 fields.
; NOTE(review): the meaning of each field is defined outside this file; the
; last two i32 fields presumably carry the reduction data size and the
; reduction buffer length -- confirm against the device runtime headers.
%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
|
|
|
|
; String referenced by both ident_t constants below.
@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
|
|
; @1 and @2 both point at @0 and differ only in their third i32 field (0 vs 1).
; @1 is not referenced by any code in this file; @2 is the argument of the
; __kmpc_global_thread_num calls in the kernel.
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @0 }, align 8
|
|
@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, ptr @0 }, align 8
|
|
; Kernel environment passed as the first argument to __kmpc_target_init.
; Every i32 configuration field is zero, the third i8 field is 2, and both
; trailing pointers are null.
@__omp_offloading_50_a3e09bf8_foo_l2_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
|
|
|
|
; External helper; declared here but not called anywhere in this file.
declare void @use(i32)
|
|
|
|
; Kernel under test. Its user-code block calls @__kmpc_global_thread_num twice
; with the same argument (@2) and never uses either result. The autogenerated
; assertions that follow the signature expect only one of those calls to be
; present in the pass output; everything else is expected to be unchanged.
define weak void @__omp_offloading_50_a3e09bf8_foo_l2(ptr %dyn) #0 {
|
|
; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_50_a3e09bf8_foo_l2
|
|
; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
|
|
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_50_a3e09bf8_foo_l2_kernel_environment, ptr [[DYN]])
|
|
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
|
|
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
; CHECK: user_code.entry:
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
|
|
; CHECK-NEXT: call void @__kmpc_target_deinit()
|
|
; CHECK-NEXT: ret void
|
|
; CHECK: worker.exit:
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
; Zero-length pointer array; nothing else in this function refers to it.
%captured_vars_addrs = alloca [0 x ptr], align 8
|
|
; Initialise with the kernel environment global and the incoming %dyn pointer.
%0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_50_a3e09bf8_foo_l2_kernel_environment, ptr %dyn)
|
|
; A result of -1 routes to user_code.entry; any other value goes to worker.exit.
%exec_user_code = icmp eq i32 %0, -1
|
|
br i1 %exec_user_code, label %user_code.entry, label %worker.exit
|
|
|
|
user_code.entry: ; preds = %entry
|
|
; Two identical calls with unused results: the redundancy this test exercises.
%1 = call i32 @__kmpc_global_thread_num(ptr @2)
|
|
%2 = call i32 @__kmpc_global_thread_num(ptr @2)
|
|
call void @__kmpc_target_deinit()
|
|
ret void
|
|
|
|
; This path returns directly, without calling @__kmpc_target_deinit.
worker.exit: ; preds = %entry
|
|
ret void
|
|
}
|
|
|
|
; External functions called by the kernel above. Only
; @__kmpc_global_thread_num carries an attribute group (#1).
declare i32 @__kmpc_target_init(ptr, ptr)
|
|
|
|
declare i32 @__kmpc_global_thread_num(ptr) #1
|
|
|
|
declare void @__kmpc_target_deinit()
|
|
|
|
; #0 is attached to the kernel definition; #1 to @__kmpc_global_thread_num.
attributes #0 = { convergent noinline norecurse nounwind "kernel" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
|
|
attributes #1 = { nounwind }
|
|
|
|
; Named metadata: offload entry info, the NVVM kernel annotation, and module flags.
!omp_offload.info = !{!0}
|
|
!nvvm.annotations = !{!1}
|
|
!llvm.module.flags = !{!2, !3, !4}
|
|
|
|
; Offload entry for "foo". The integers 80 and -1545561096 equal 0x50 and
; 0xa3e09bf8 (the latter read as a signed 32-bit value), the same hex strings
; that appear in the kernel's symbol name, and the 2 matches its "l2" suffix.
; NOTE(review): the exact meaning of each operand is not visible here -- confirm.
!0 = !{i32 0, i32 80, i32 -1545561096, !"foo", i32 2, i32 0}
|
|
; Tags the function above with the "kernel" annotation.
!1 = !{ptr @__omp_offloading_50_a3e09bf8_foo_l2, !"kernel", i32 1}
|
|
; Module flags: wchar_size is 4; "openmp" and "openmp-device" are both 50.
!2 = !{i32 1, !"wchar_size", i32 4}
|
|
!3 = !{i32 7, !"openmp", i32 50}
|
|
!4 = !{i32 7, !"openmp-device", i32 50}
|