Summary: This removes the use of OpenMP offloading to build the device runtime. The main benefit is that we no longer need to rely on offloading semantics to build a device-only runtime. Things like variants are no longer needed and can simply become `#ifdef`s. In the future, I will remove most of the special handling here and fold it into calls to the `<gpuintrin.h>` functions instead. Additionally, I will rework the compilation to make this a separate runtime. The current plan is to have this as a separate runtime, but make including OpenMP and offloading either add it automatically or print a warning if it's missing. This will allow us to use a normal CMake workflow and delete all the weird 'let's pull the clang binary out of the build' business. ``` -DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=offload -DLLVM_RUNTIME_TARGETS=amdgcn-amd-amdhsa ``` After that, linking the OpenMP device runtime will be `-Xoffload-linker -lomp`, i.e. no more fat binary business. Only look at the most recent commit, since this includes the two dependencies (the fix to AMDGPUEmitPrintfBinding and the PointerToMember bug).
78 lines
2.4 KiB
C++
78 lines
2.4 KiB
C++
//===------ Allocator.cpp - OpenMP device memory allocator ------- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "Shared/Environment.h"
|
|
|
|
#include "Allocator.h"
|
|
#include "Configuration.h"
|
|
#include "DeviceTypes.h"
|
|
#include "DeviceUtils.h"
|
|
#include "Mapping.h"
|
|
#include "Synchronization.h"
|
|
|
|
using namespace ompx;
|
|
|
|
[[gnu::used, gnu::retain, gnu::weak,
|
|
gnu::visibility(
|
|
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
|
|
[[gnu::used, gnu::retain, gnu::weak,
|
|
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
|
|
__omp_rtl_device_memory_pool_tracker;
|
|
|
|
/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
|
|
/// directly.
|
|
struct BumpAllocatorTy final {
|
|
|
|
void *alloc(uint64_t Size) {
|
|
Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
|
|
|
|
if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
|
|
atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
|
|
atomic::seq_cst);
|
|
atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
|
|
atomic::seq_cst);
|
|
atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
|
|
atomic::seq_cst);
|
|
atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
|
|
atomic::seq_cst);
|
|
}
|
|
|
|
uint64_t *Data =
|
|
reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
|
|
uint64_t End =
|
|
reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
|
|
|
|
uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
|
|
if (OldData + Size > End)
|
|
__builtin_trap();
|
|
|
|
return reinterpret_cast<void *>(OldData);
|
|
}
|
|
|
|
void free(void *) {}
|
|
};
|
|
|
|
/// Global allocator instance that allocator::alloc and allocator::free in this
/// file forward to.
BumpAllocatorTy BumpAllocator;
|
|
|
|
/// allocator namespace implementation
|
|
///
|
|
///{
|
|
|
|
/// Allocator initialization hook. The body is intentionally empty for now, so
/// neither IsSPMD nor KernelEnvironment is used.
void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
  // TODO: Once more than one allocator exists, consult KernelEnvironment to
  //       pick which one to use.
}
|
|
|
|
/// Allocate Size bytes by forwarding the request to the global BumpAllocator.
void *allocator::alloc(uint64_t Size) {
  void *Allocation = BumpAllocator.alloc(Size);
  return Allocation;
}
|
|
|
|
/// Forward Ptr to the global BumpAllocator's free().
void allocator::free(void *Ptr) {
  BumpAllocator.free(Ptr);
}
|
|
|
|
///}
|