The changes made in D123460 generalized the code generation for OpenMP's
offloading entries. We can use the same scheme to register globals for
CUDA code. This patch adds the code generation to create these
offloading entries when compiling using the new offloading driver mode.
The offloading entries are simple structs that contain the information
necessary to register the global. The struct used is as follows:
```
struct __tgt_offload_entry {
void *addr; // Pointer to the offload entry info.
// (function or global)
char *name; // Name of the function or global.
size_t size; // Size of the entry info (0 if it is a function).
int32_t flags;
int32_t reserved;
};
```
Currently CUDA handles RDC code generation by deferring the registration
of globals in the current TU to a callback function containing the
module's ID. Later all the module IDs will be used to register all of the
globals at once. Rather than mimic this, offloading entries allow us to
mimic the way OpenMP registers globals. That is, we create a simple
global struct for each device global to be registered. These are placed
at a special section `cuda_offloading_entries`. Because this section is
a valid C-identifier, the linker will provide a `__start` and `__stop`
pointer that we can use to iterate and register all globals at runtime.
The registration requires a flag variable to indicate which registration
function to use. I have assigned the flags somewhat arbitrarily, but
these use the following values.
Kernel: 0
Variable: 0
Managed: 1
Surface: 2
Texture: 3
Depends on D120272
Reviewed By: tra
Differential Revision: https://reviews.llvm.org/D123471
34 lines
1.8 KiB
Plaintext
34 lines
1.8 KiB
Plaintext
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals
|
|
// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu \
|
|
// RUN: --offload-new-driver -emit-llvm -o - -x cuda %s | FileCheck \
|
|
// RUN: --check-prefix=HOST %s
|
|
|
|
#include "Inputs/cuda.h"
|
|
|
|
//.
|
|
// HOST: @x = internal global i32 undef, align 4
|
|
// HOST: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
|
|
// HOST: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
|
|
// HOST: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
|
|
// HOST: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
|
|
// HOST: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
|
|
// HOST: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
|
|
//.
|
|
// HOST-LABEL: @_Z18__device_stub__foov(
|
|
// HOST-NEXT: entry:
|
|
// HOST-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__foov)
|
|
// HOST-NEXT: br label [[SETUP_END:%.*]]
|
|
// HOST: setup.end:
|
|
// HOST-NEXT: ret void
|
|
//
|
|
// Empty kernel. The assertions in this file expect a host-side stub
// (@_Z18__device_stub__foov) and an offloading entry named "_Z3foov" with
// size 0 and flags 0 to be emitted for it.
__global__ void foo() {}
|
|
// HOST-LABEL: @_Z18__device_stub__barv(
|
|
// HOST-NEXT: entry:
|
|
// HOST-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv)
|
|
// HOST-NEXT: br label [[SETUP_END:%.*]]
|
|
// HOST: setup.end:
|
|
// HOST-NEXT: ret void
|
|
//
|
|
// Second empty kernel. The assertions in this file expect its own stub
// (@_Z18__device_stub__barv) and a separate offloading entry named "_Z3barv"
// with size 0 and flags 0.
__global__ void bar() {}
|
|
// Device global. The assertions at the top of this file expect a host-side
// shadow (@x, internal, undef initializer) and an offloading entry named "x"
// with size 4 and flags 0.
__device__ int x = 1;
|