Summary: This removes the use of OpenMP offloading to build the device runtime. The main benefit is that we no longer need to rely on offloading semantics to build a device-only runtime. Things like variants are no longer needed and can be replaced with simple `#ifdef`s. In the future I will remove most of the special handling here and fold it into calls to the `<gpuintrin.h>` functions instead.

Additionally, I will rework the compilation to make this a separate runtime. The current plan is to keep this setup, but make enabling OpenMP and offloading either add the runtime automatically or print a warning if it is missing. This will allow us to use a normal CMake workflow and delete all the weird "let's pull the clang binary out of the build" business.

```
-DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=offload -DLLVM_RUNTIME_TARGETS=amdgcn-amd-amdhsa
```

After that, linking the OpenMP device runtime will be done with `-Xoffload-linker -lomp`, i.e. no more fat binary business.

Only look at the most recent commit, since this includes its two dependencies (the fix to AMDGPUEmitPrintfBinding and the PointerToMember bug).
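For context on the "variants become `#ifdef`s" point, here is a rough sketch of the kind of simplification this enables (the function names and values are illustrative, not code from this patch; `__AMDGPU__` is the predefined target macro, and the "before" form needs `-fopenmp` to compile):

```
#include <stdint.h>

// Before: a target-specific override selected through an OpenMP declare
// variant, which requires compiling the runtime with offloading semantics.
uint32_t getWarpSizeBefore() { return 32; } // base version
#pragma omp begin declare variant match(device = {arch(amdgcn)})
uint32_t getWarpSizeBefore() { return 64; } // AMDGPU override
#pragma omp end declare variant

// After: a plain preprocessor check, possible because the runtime is now
// compiled directly for the GPU target instead of via OpenMP offloading.
uint32_t getWarpSizeAfter() {
#ifdef __AMDGPU__
  return 64;
#else
  return 32;
#endif
}
```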
//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation in the GPU. Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//      __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//    The reason we don't exec end_parallel for the threads not included
//    in the parallel loop is that for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthread A. Thus they must preserve their current threadId, which
//    is larger than the thread ids of the threads in the team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
|
|
#include "DeviceTypes.h"
|
|
#include "DeviceUtils.h"
|
|
#include "Interface.h"
|
|
#include "LibC.h"
|
|
#include "Mapping.h"
|
|
#include "State.h"
|
|
#include "Synchronization.h"
|
|
|
|
using namespace ompx;
|
|
|
|
namespace {
|
|
|
|
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getMaxTeamThreads();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // SPMD mode allows any number of threads; in generic mode we round down to a
  // multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (mapping::isSPMDMode())
    return NumThreads;

  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}

// Invoke an outlined parallel function unwrapping arguments (up to 32).
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
                                              int32_t bound_tid, void *fn,
                                              void **args, int64_t nargs) {
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {
|
|
|
|
[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
|
|
int32_t num_threads,
|
|
void *fn, void **args,
|
|
const int64_t nargs) {
|
|
uint32_t TId = mapping::getThreadIdInBlock();
|
|
uint32_t NumThreads = determineNumberOfThreads(num_threads);
|
|
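  // A parallel team size of zero is used below to encode "the whole team",
  // i.e. every thread in the block participates in the region.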
  uint32_t PTeamSize =
      NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
  // Avoid the race between the read of the `icv::Level` above and the write
  // below by synchronizing all threads here.
  synchronize::threadsAligned(atomic::seq_cst);
  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, TId == 0, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                               /*ForceTeamState=*/true);

    // Synchronize all threads after the main thread (TId == 0) set up the
    // team state properly.
    synchronize::threadsAligned(atomic::acq_rel);

    state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                      /*ForceTeamState=*/true);
    icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
    icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);

    // Ensure we synchronize before we run user code to avoid invalidating the
    // assumptions above.
    synchronize::threadsAligned(atomic::relaxed);

    if (!PTeamSize || TId < PTeamSize)
      invokeMicrotask(TId, 0, fn, args, nargs);

    // Synchronize all threads at the end of a parallel region.
    synchronize::threadsAligned(atomic::seq_cst);
  }

  // Synchronize all threads to make sure every thread exits the scope above;
  // otherwise the following assertions and the assumption in
  // __kmpc_target_deinit may not hold.
  synchronize::threadsAligned(atomic::acq_rel);

  state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
  icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
  icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);

  // Ensure we synchronize to create an aligned region around the assumptions.
  synchronize::threadsAligned(atomic::relaxed);

  return;
}

[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                   int32_t num_threads, int proc_bind, void *fn,
                   void *wrapper_fn, void **args, int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();

  // Assert the parallelism level is zero if disabled by the user.
  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
         "nested parallelism while disabled");

  // Handle the serialized case first, same for SPMD/non-SPMD:
  // 1) if-clause(0)
  // 2) parallel in task or other thread state inducing construct
  // 3) nested parallel regions
  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                   (config::mayUseNestedParallelism() && icv::Level))) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false, nullptr);

  if (mapping::isSPMDMode()) {
    // This was moved to its own routine so it could be called directly
    // in certain situations to avoid resource consumption of unused
    // logic in parallel_51.
    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);

    return;
  }

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially active-level
  // set, but they do not have individual ThreadStates yet. If they ever
  // modify the ICVs beyond this point a ThreadState will be allocated.

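  // If only a single thread would execute the region, run it serially on the
  // main thread here instead of going through the worker handshake below.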
  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
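    // Copy the outlined function's arguments into the shared storage so the
    // worker threads can read them; the cases below fall through so exactly
    // `nargs` pointers are copied.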
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      [[fallthrough]];
    case 0:
      break;
    }
  }

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /*ForceTeamState=*/true);

    // Master signals work to activate workers.
    synchronize::threads(atomic::seq_cst);
    // Master waits for workers to signal.
    synchronize::threads(atomic::seq_cst);
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}

[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
  return ThreadIsActive;
}

[[clang::noinline]] void __kmpc_kernel_end_parallel() {
  // In case we have modified an ICV for this thread before a ThreadState was
  // created, we drop it now so it does not contaminate the next parallel
  // region.
  ASSERT(!mapping::isSPMDMode(), nullptr);
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode(), nullptr);
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }

int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
}