Summary: This removes the use of OpenMP offloading to build the device runtime. The main benefit here is that we no longer need to rely on offloading semantics to build a device-only runtime. Things like variants are now no longer needed and can just be simple if-defs. In the future, I will remove most of the special handling here and fold it into calls to the `<gpuintrin.h>` functions instead. Additionally, I will rework the compilation to make this a separate runtime. The current plan is to have this, but make including OpenMP and offloading either automatically add it, or print a warning if it's missing. This will allow us to use a normal CMake workflow and delete all the weird 'let's pull the clang binary out of the build' business. ``` -DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=offload -DLLVM_RUNTIME_TARGETS=amdgcn-amd-amdhsa ``` After that, linking the OpenMP device runtime will be `-Xoffload-linker -lomp`, i.e., no more fat binary business. Only look at the most recent commit since this includes the two dependencies (fix to AMDGPUEmitPrintfBinding and the PointerToMember bug).
156 lines
5.5 KiB
C++
156 lines
5.5 KiB
C++
//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file contains the kernel entry points for the device.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "Shared/Environment.h"
|
|
|
|
#include "Allocator.h"
|
|
#include "Debug.h"
|
|
#include "DeviceTypes.h"
|
|
#include "Interface.h"
|
|
#include "Mapping.h"
|
|
#include "State.h"
|
|
#include "Synchronization.h"
|
|
#include "Workshare.h"
|
|
|
|
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
|
|
|
|
using namespace ompx;
|
|
|
|
static void
|
|
inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
|
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
|
|
// Order is important here.
|
|
synchronize::init(IsSPMD);
|
|
mapping::init(IsSPMD);
|
|
state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
|
|
allocator::init(IsSPMD, KernelEnvironment);
|
|
workshare::init(IsSPMD);
|
|
}
|
|
|
|
/// Simple generic state machine for worker threads.
|
|
static void genericStateMachine(IdentTy *Ident) {
|
|
uint32_t TId = mapping::getThreadIdInBlock();
|
|
|
|
do {
|
|
ParallelRegionFnTy WorkFn = nullptr;
|
|
|
|
// Wait for the signal that we have a new work function.
|
|
synchronize::threads(atomic::seq_cst);
|
|
|
|
// Retrieve the work function from the runtime.
|
|
bool IsActive = __kmpc_kernel_parallel(&WorkFn);
|
|
|
|
// If there is nothing more to do, break out of the state machine by
|
|
// returning to the caller.
|
|
if (!WorkFn)
|
|
return;
|
|
|
|
if (IsActive) {
|
|
ASSERT(!mapping::isSPMDMode(), nullptr);
|
|
((void (*)(uint32_t, uint32_t))WorkFn)(0, TId);
|
|
__kmpc_kernel_end_parallel();
|
|
}
|
|
|
|
synchronize::threads(atomic::seq_cst);
|
|
|
|
} while (true);
|
|
}
|
|
|
|
extern "C" {
|
|
|
|
/// Initialization
|
|
///
|
|
/// \param Ident Source location identification, can be NULL.
|
|
///
|
|
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
|
|
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
|
|
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
|
|
bool IsSPMD = Configuration.ExecMode &
|
|
llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
|
|
bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
|
|
if (IsSPMD) {
|
|
inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
|
|
KernelLaunchEnvironment);
|
|
synchronize::threadsAligned(atomic::relaxed);
|
|
} else {
|
|
inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
|
|
KernelLaunchEnvironment);
|
|
// No need to wait since only the main threads will execute user
|
|
// code and workers will run into a barrier right away.
|
|
}
|
|
|
|
if (IsSPMD) {
|
|
state::assumeInitialState(IsSPMD);
|
|
|
|
// Synchronize to ensure the assertions above are in an aligned region.
|
|
// The barrier is eliminated later.
|
|
synchronize::threadsAligned(atomic::relaxed);
|
|
return -1;
|
|
}
|
|
|
|
if (mapping::isInitialThreadInLevel0(IsSPMD))
|
|
return -1;
|
|
|
|
// Enter the generic state machine if enabled and if this thread can possibly
|
|
// be an active worker thread.
|
|
//
|
|
// The latter check is important for NVIDIA Pascal (but not Volta) and AMD
|
|
// GPU. In those cases, a single thread can apparently satisfy a barrier on
|
|
// behalf of all threads in the same warp. Thus, it would not be safe for
|
|
// other threads in the main thread's warp to reach the first
|
|
// synchronize::threads call in genericStateMachine before the main thread
|
|
// reaches its corresponding synchronize::threads call: that would permit all
|
|
// active worker threads to proceed before the main thread has actually set
|
|
// state::ParallelRegionFn, and then they would immediately quit without
|
|
// doing any work. mapping::getMaxTeamThreads() does not include any of the
|
|
// main thread's warp, so none of its threads can ever be active worker
|
|
// threads.
|
|
if (UseGenericStateMachine &&
|
|
mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD))
|
|
genericStateMachine(KernelEnvironment.Ident);
|
|
|
|
return mapping::getThreadIdInBlock();
|
|
}
|
|
|
|
/// De-Initialization
|
|
///
|
|
/// In non-SPMD, this function releases the workers trapped in a state machine
|
|
/// and also any memory dynamically allocated by the runtime.
|
|
///
|
|
/// \param Ident Source location identification, can be NULL.
|
|
///
|
|
void __kmpc_target_deinit() {
|
|
bool IsSPMD = mapping::isSPMDMode();
|
|
if (IsSPMD)
|
|
return;
|
|
|
|
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
|
|
// Signal the workers to exit the state machine and exit the kernel.
|
|
state::ParallelRegionFn = nullptr;
|
|
} else if (!state::getKernelEnvironment()
|
|
.Configuration.UseGenericStateMachine) {
|
|
// Retrieve the work function just to ensure we always call
|
|
// __kmpc_kernel_parallel even if a custom state machine is used.
|
|
// TODO: this is not super pretty. The problem is we create the call to
|
|
// __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it
|
|
// is not there yet. Thus, we assume we never reach it from
|
|
// __kmpc_target_deinit. That allows us to remove the store in there to
|
|
// ParallelRegionFn, which leads to bad results later on.
|
|
ParallelRegionFnTy WorkFn = nullptr;
|
|
__kmpc_kernel_parallel(&WorkFn);
|
|
ASSERT(WorkFn == nullptr, nullptr);
|
|
}
|
|
}
|
|
|
|
/// Query the execution mode.
///
/// \returns The result of mapping::isSPMDMode(), converted to int8_t.
int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
|
|
}
|