This pull request is the second part of an ongoing effort to extends PGO instrumentation to GPU device code and depends on #76587. This PR makes the following changes: - Introduces `__llvm_write_custom_profile` to PGO compiler-rt library. This is an external function that can be used to write profiles with custom data to target-specific files. - Adds `__llvm_write_custom_profile` as weak symbol to libomptarget so that it can write the collected data to a profraw file. - Adds `PGODump` debug flag and only displays dump when the aforementioned flag is set
2201 lines
78 KiB
C++
2201 lines
78 KiB
C++
//===- PluginInterface.cpp - Target independent plugin device interface ---===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "PluginInterface.h"
|
|
|
|
#include "Shared/APITypes.h"
|
|
#include "Shared/Debug.h"
|
|
#include "Shared/Environment.h"
|
|
|
|
#include "ErrorReporting.h"
|
|
#include "GlobalHandler.h"
|
|
#include "JIT.h"
|
|
#include "Shared/Utils.h"
|
|
#include "Utils/ELF.h"
|
|
#include "omptarget.h"
|
|
|
|
#ifdef OMPT_SUPPORT
|
|
#include "OpenMP/OMPT/Callback.h"
|
|
#include "omp-tools.h"
|
|
#endif
|
|
|
|
#include "llvm/Bitcode/BitcodeReader.h"
|
|
#include "llvm/Frontend/OpenMP/OMPConstants.h"
|
|
#include "llvm/Support/Error.h"
|
|
#include "llvm/Support/JSON.h"
|
|
#include "llvm/Support/MathExtras.h"
|
|
#include "llvm/Support/MemoryBuffer.h"
|
|
#include "llvm/Support/Signals.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
#include <cstdint>
|
|
#include <limits>
|
|
|
|
using namespace llvm;
|
|
using namespace omp;
|
|
using namespace target;
|
|
using namespace plugin;
|
|
|
|
// TODO: Fix any thread safety issues for multi-threaded kernel recording.
|
|
namespace llvm::omp::target::plugin {
|
|
struct RecordReplayTy {
|
|
|
|
// Describes the state of the record replay mechanism.
|
|
enum RRStatusTy { RRDeactivated = 0, RRRecording, RRReplaying };
|
|
|
|
private:
|
|
// Memory pointers for recording, replaying memory.
|
|
void *MemoryStart = nullptr;
|
|
void *MemoryPtr = nullptr;
|
|
size_t MemorySize = 0;
|
|
size_t TotalSize = 0;
|
|
GenericDeviceTy *Device = nullptr;
|
|
std::mutex AllocationLock;
|
|
|
|
RRStatusTy Status = RRDeactivated;
|
|
bool ReplaySaveOutput = false;
|
|
bool UsedVAMap = false;
|
|
uintptr_t MemoryOffset = 0;
|
|
|
|
// A list of all globals mapped to the device.
|
|
struct GlobalEntry {
|
|
const char *Name;
|
|
uint64_t Size;
|
|
void *Addr;
|
|
};
|
|
llvm::SmallVector<GlobalEntry> GlobalEntries{};
|
|
|
|
void *suggestAddress(uint64_t MaxMemoryAllocation) {
|
|
// Get a valid pointer address for this system
|
|
void *Addr =
|
|
Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
|
|
Device->free(Addr);
|
|
// Align Address to MaxMemoryAllocation
|
|
Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation);
|
|
return Addr;
|
|
}
|
|
|
|
Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
|
|
size_t ASize = MaxMemoryAllocation;
|
|
|
|
if (!VAddr && isRecording())
|
|
VAddr = suggestAddress(MaxMemoryAllocation);
|
|
|
|
DP("Request %ld bytes allocated at %p\n", MaxMemoryAllocation, VAddr);
|
|
|
|
if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize))
|
|
return Err;
|
|
|
|
if (isReplaying() && VAddr != MemoryStart) {
|
|
return Plugin::error("Record-Replay cannot assign the"
|
|
"requested recorded address (%p, %p)",
|
|
VAddr, MemoryStart);
|
|
}
|
|
|
|
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
|
|
"Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart);
|
|
|
|
MemoryPtr = MemoryStart;
|
|
MemorySize = 0;
|
|
TotalSize = ASize;
|
|
UsedVAMap = true;
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error preAllocateHeuristic(uint64_t MaxMemoryAllocation,
|
|
uint64_t RequiredMemoryAllocation, void *VAddr) {
|
|
const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
|
|
constexpr size_t STEP = 1024 * 1024 * 1024ULL;
|
|
MemoryStart = nullptr;
|
|
for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) {
|
|
MemoryStart =
|
|
Device->allocate(TotalSize, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
|
|
if (MemoryStart)
|
|
break;
|
|
}
|
|
if (!MemoryStart)
|
|
return Plugin::error("Allocating record/replay memory");
|
|
|
|
if (VAddr && VAddr != MemoryStart)
|
|
MemoryOffset = uintptr_t(VAddr) - uintptr_t(MemoryStart);
|
|
|
|
MemoryPtr = MemoryStart;
|
|
MemorySize = 0;
|
|
|
|
// Check if we need adjustment.
|
|
if (MemoryOffset > 0 &&
|
|
TotalSize >= RequiredMemoryAllocation + MemoryOffset) {
|
|
// If we are off but "before" the required address and with enough space,
|
|
// we just "allocate" the offset to match the required address.
|
|
MemoryPtr = (char *)MemoryPtr + MemoryOffset;
|
|
MemorySize += MemoryOffset;
|
|
MemoryOffset = 0;
|
|
assert(MemoryPtr == VAddr && "Expected offset adjustment to work");
|
|
} else if (MemoryOffset) {
|
|
// If we are off and in a situation we cannot just "waste" memory to force
|
|
// a match, we hope adjusting the arguments is sufficient.
|
|
REPORT(
|
|
"WARNING Failed to allocate replay memory at required location %p, "
|
|
"got %p, trying to offset argument pointers by %" PRIi64 "\n",
|
|
VAddr, MemoryStart, MemoryOffset);
|
|
}
|
|
|
|
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
|
|
"Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
|
|
MemoryStart);
|
|
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
|
|
if (Device->supportVAManagement()) {
|
|
auto Err = preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
|
|
if (Err) {
|
|
REPORT("WARNING VA mapping failed, fallback to heuristic: "
|
|
"(Error: %s)\n",
|
|
toString(std::move(Err)).data());
|
|
}
|
|
}
|
|
|
|
uint64_t DevMemSize;
|
|
if (Device->getDeviceMemorySize(DevMemSize))
|
|
return Plugin::error("Cannot determine Device Memory Size");
|
|
|
|
return preAllocateHeuristic(DevMemSize, DeviceMemorySize, ReqVAddr);
|
|
}
|
|
|
|
void dumpDeviceMemory(StringRef Filename) {
|
|
ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
|
|
WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
|
|
if (!DeviceMemoryMB)
|
|
report_fatal_error("Error creating MemoryBuffer for device memory");
|
|
|
|
auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
|
|
MemoryStart, MemorySize, nullptr);
|
|
if (Err)
|
|
report_fatal_error("Error retrieving data for target pointer");
|
|
|
|
StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize);
|
|
std::error_code EC;
|
|
raw_fd_ostream OS(Filename, EC);
|
|
if (EC)
|
|
report_fatal_error("Error dumping memory to file " + Filename + " :" +
|
|
EC.message());
|
|
OS << DeviceMemory;
|
|
OS.close();
|
|
}
|
|
|
|
public:
|
|
bool isRecording() const { return Status == RRStatusTy::RRRecording; }
|
|
bool isReplaying() const { return Status == RRStatusTy::RRReplaying; }
|
|
bool isRecordingOrReplaying() const {
|
|
return (Status != RRStatusTy::RRDeactivated);
|
|
}
|
|
void setStatus(RRStatusTy Status) { this->Status = Status; }
|
|
bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
|
|
void addEntry(const char *Name, uint64_t Size, void *Addr) {
|
|
GlobalEntries.emplace_back(GlobalEntry{Name, Size, Addr});
|
|
}
|
|
|
|
void saveImage(const char *Name, const DeviceImageTy &Image) {
|
|
SmallString<128> ImageName = {Name, ".image"};
|
|
std::error_code EC;
|
|
raw_fd_ostream OS(ImageName, EC);
|
|
if (EC)
|
|
report_fatal_error("Error saving image : " + StringRef(EC.message()));
|
|
if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) {
|
|
size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd,
|
|
TgtImageBitcode->ImageStart);
|
|
MemoryBufferRef MBR = MemoryBufferRef(
|
|
StringRef((const char *)TgtImageBitcode->ImageStart, Size), "");
|
|
OS << MBR.getBuffer();
|
|
} else {
|
|
OS << Image.getMemoryBuffer().getBuffer();
|
|
}
|
|
OS.close();
|
|
}
|
|
|
|
void dumpGlobals(StringRef Filename, DeviceImageTy &Image) {
|
|
int32_t Size = 0;
|
|
|
|
for (auto &OffloadEntry : GlobalEntries) {
|
|
if (!OffloadEntry.Size)
|
|
continue;
|
|
// Get the total size of the string and entry including the null byte.
|
|
Size += std::strlen(OffloadEntry.Name) + 1 + sizeof(uint32_t) +
|
|
OffloadEntry.Size;
|
|
}
|
|
|
|
ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB =
|
|
WritableMemoryBuffer::getNewUninitMemBuffer(Size);
|
|
if (!GlobalsMB)
|
|
report_fatal_error("Error creating MemoryBuffer for globals memory");
|
|
|
|
void *BufferPtr = GlobalsMB.get()->getBufferStart();
|
|
for (auto &OffloadEntry : GlobalEntries) {
|
|
if (!OffloadEntry.Size)
|
|
continue;
|
|
|
|
int32_t NameLength = std::strlen(OffloadEntry.Name) + 1;
|
|
memcpy(BufferPtr, OffloadEntry.Name, NameLength);
|
|
BufferPtr = utils::advancePtr(BufferPtr, NameLength);
|
|
|
|
*((uint32_t *)(BufferPtr)) = OffloadEntry.Size;
|
|
BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t));
|
|
|
|
auto Err = Plugin::success();
|
|
{
|
|
if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.Addr,
|
|
OffloadEntry.Size, nullptr))
|
|
report_fatal_error("Error retrieving data for global");
|
|
}
|
|
if (Err)
|
|
report_fatal_error("Error retrieving data for global");
|
|
BufferPtr = utils::advancePtr(BufferPtr, OffloadEntry.Size);
|
|
}
|
|
assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
|
|
"Buffer over/under-filled.");
|
|
assert(Size == utils::getPtrDiff(BufferPtr,
|
|
GlobalsMB->get()->getBufferStart()) &&
|
|
"Buffer size mismatch");
|
|
|
|
StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size);
|
|
std::error_code EC;
|
|
raw_fd_ostream OS(Filename, EC);
|
|
OS << GlobalsMemory;
|
|
OS.close();
|
|
}
|
|
|
|
void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams,
|
|
int32_t NumArgs, uint64_t NumTeamsClause,
|
|
uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
|
|
json::Object JsonKernelInfo;
|
|
JsonKernelInfo["Name"] = Name;
|
|
JsonKernelInfo["NumArgs"] = NumArgs;
|
|
JsonKernelInfo["NumTeamsClause"] = NumTeamsClause;
|
|
JsonKernelInfo["ThreadLimitClause"] = ThreadLimitClause;
|
|
JsonKernelInfo["LoopTripCount"] = LoopTripCount;
|
|
JsonKernelInfo["DeviceMemorySize"] = MemorySize;
|
|
JsonKernelInfo["DeviceId"] = Device->getDeviceId();
|
|
JsonKernelInfo["BumpAllocVAStart"] = (intptr_t)MemoryStart;
|
|
|
|
json::Array JsonArgPtrs;
|
|
for (int I = 0; I < NumArgs; ++I)
|
|
JsonArgPtrs.push_back((intptr_t)LaunchParams.Ptrs[I]);
|
|
JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
|
|
|
|
json::Array JsonArgOffsets;
|
|
for (int I = 0; I < NumArgs; ++I)
|
|
JsonArgOffsets.push_back(0);
|
|
JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));
|
|
|
|
SmallString<128> JsonFilename = {Name, ".json"};
|
|
std::error_code EC;
|
|
raw_fd_ostream JsonOS(JsonFilename.str(), EC);
|
|
if (EC)
|
|
report_fatal_error("Error saving kernel json file : " +
|
|
StringRef(EC.message()));
|
|
JsonOS << json::Value(std::move(JsonKernelInfo));
|
|
JsonOS.close();
|
|
}
|
|
|
|
void saveKernelInput(const char *Name, DeviceImageTy &Image) {
|
|
SmallString<128> GlobalsFilename = {Name, ".globals"};
|
|
dumpGlobals(GlobalsFilename, Image);
|
|
|
|
SmallString<128> MemoryFilename = {Name, ".memory"};
|
|
dumpDeviceMemory(MemoryFilename);
|
|
}
|
|
|
|
void saveKernelOutputInfo(const char *Name) {
|
|
SmallString<128> OutputFilename = {
|
|
Name, (isRecording() ? ".original.output" : ".replay.output")};
|
|
dumpDeviceMemory(OutputFilename);
|
|
}
|
|
|
|
void *alloc(uint64_t Size) {
|
|
assert(MemoryStart && "Expected memory has been pre-allocated");
|
|
void *Alloc = nullptr;
|
|
constexpr int Alignment = 16;
|
|
// Assumes alignment is a power of 2.
|
|
int64_t AlignedSize = (Size + (Alignment - 1)) & (~(Alignment - 1));
|
|
std::lock_guard<std::mutex> LG(AllocationLock);
|
|
Alloc = MemoryPtr;
|
|
MemoryPtr = (char *)MemoryPtr + AlignedSize;
|
|
MemorySize += AlignedSize;
|
|
DP("Memory Allocator return " DPxMOD "\n", DPxPTR(Alloc));
|
|
return Alloc;
|
|
}
|
|
|
|
Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
|
|
RRStatusTy Status, bool SaveOutput, uint64_t &ReqPtrArgOffset) {
|
|
this->Device = Device;
|
|
this->Status = Status;
|
|
this->ReplaySaveOutput = SaveOutput;
|
|
|
|
if (auto Err = preallocateDeviceMemory(MemSize, VAddr))
|
|
return Err;
|
|
|
|
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
|
|
"Record Replay Initialized (%p)"
|
|
" as starting address, %lu Memory Size"
|
|
" and set on status %s\n",
|
|
MemoryStart, TotalSize,
|
|
Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");
|
|
|
|
// Tell the user to offset pointer arguments as the memory allocation does
|
|
// not match.
|
|
ReqPtrArgOffset = MemoryOffset;
|
|
return Plugin::success();
|
|
}
|
|
|
|
void deinit() {
|
|
if (UsedVAMap) {
|
|
if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
|
|
report_fatal_error("Error on releasing virtual memory space");
|
|
} else {
|
|
Device->free(MemoryStart);
|
|
}
|
|
}
|
|
};
|
|
} // namespace llvm::omp::target::plugin
|
|
|
|
// Extract the mapping of host function pointers to device function pointers
|
|
// from the entry table. Functions marked as 'indirect' in OpenMP will have
|
|
// offloading entries generated for them which map the host's function pointer
|
|
// to a global containing the corresponding function pointer on the device.
|
|
static Expected<std::pair<void *, uint64_t>>
|
|
setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device,
|
|
DeviceImageTy &Image) {
|
|
GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
|
|
|
|
llvm::ArrayRef<llvm::offloading::EntryTy> Entries(
|
|
Image.getTgtImage()->EntriesBegin, Image.getTgtImage()->EntriesEnd);
|
|
llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
|
|
for (const auto &Entry : Entries) {
|
|
if (Entry.Kind != object::OffloadKind::OFK_OpenMP || Entry.Size == 0 ||
|
|
!(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT))
|
|
continue;
|
|
|
|
assert(Entry.Size == sizeof(void *) && "Global not a function pointer?");
|
|
auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();
|
|
|
|
GlobalTy DeviceGlobal(Entry.SymbolName, Entry.Size);
|
|
if (auto Err =
|
|
Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
|
|
return std::move(Err);
|
|
|
|
HstPtr = Entry.Address;
|
|
if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(),
|
|
Entry.Size, nullptr))
|
|
return std::move(Err);
|
|
}
|
|
|
|
// If we do not have any indirect globals we exit early.
|
|
if (IndirectCallTable.empty())
|
|
return std::pair{nullptr, 0};
|
|
|
|
// Sort the array to allow for more efficient lookup of device pointers.
|
|
llvm::sort(IndirectCallTable,
|
|
[](const auto &x, const auto &y) { return x.first < y.first; });
|
|
|
|
uint64_t TableSize =
|
|
IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
|
|
void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE);
|
|
if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(),
|
|
TableSize, nullptr))
|
|
return std::move(Err);
|
|
return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
|
|
}
|
|
|
|
AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device,
|
|
__tgt_async_info *AsyncInfoPtr)
|
|
: Device(Device),
|
|
AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {}
|
|
|
|
void AsyncInfoWrapperTy::finalize(Error &Err) {
|
|
assert(AsyncInfoPtr && "AsyncInfoWrapperTy already finalized");
|
|
|
|
// If we used a local async info object we want synchronous behavior. In that
|
|
// case, and assuming the current status code is correct, we will synchronize
|
|
// explicitly when the object is deleted. Update the error with the result of
|
|
// the synchronize operation.
|
|
if (AsyncInfoPtr == &LocalAsyncInfo && LocalAsyncInfo.Queue && !Err)
|
|
Err = Device.synchronize(&LocalAsyncInfo);
|
|
|
|
// Invalidate the wrapper object.
|
|
AsyncInfoPtr = nullptr;
|
|
}
|
|
|
|
Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
|
|
DeviceImageTy &Image) {
|
|
|
|
ImagePtr = &Image;
|
|
|
|
// Retrieve kernel environment object for the kernel.
|
|
GlobalTy KernelEnv(std::string(Name) + "_kernel_environment",
|
|
sizeof(KernelEnvironment), &KernelEnvironment);
|
|
GenericGlobalHandlerTy &GHandler = GenericDevice.Plugin.getGlobalHandler();
|
|
if (auto Err =
|
|
GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv)) {
|
|
[[maybe_unused]] std::string ErrStr = toString(std::move(Err));
|
|
DP("Failed to read kernel environment for '%s': %s\n"
|
|
"Using default SPMD (2) execution mode\n",
|
|
Name, ErrStr.data());
|
|
assert(KernelEnvironment.Configuration.ReductionDataSize == 0 &&
|
|
"Default initialization failed.");
|
|
IsBareKernel = true;
|
|
}
|
|
|
|
// Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
|
|
MaxNumThreads = KernelEnvironment.Configuration.MaxThreads > 0
|
|
? std::min(KernelEnvironment.Configuration.MaxThreads,
|
|
int32_t(GenericDevice.getThreadLimit()))
|
|
: GenericDevice.getThreadLimit();
|
|
|
|
// Pref = Config.Pref > 0 ? max(Config.Pref, Device.Pref) : Device.Pref;
|
|
PreferredNumThreads =
|
|
KernelEnvironment.Configuration.MinThreads > 0
|
|
? std::max(KernelEnvironment.Configuration.MinThreads,
|
|
int32_t(GenericDevice.getDefaultNumThreads()))
|
|
: GenericDevice.getDefaultNumThreads();
|
|
|
|
return initImpl(GenericDevice, Image);
|
|
}
|
|
|
|
Expected<KernelLaunchEnvironmentTy *>
|
|
GenericKernelTy::getKernelLaunchEnvironment(
|
|
GenericDeviceTy &GenericDevice, uint32_t Version,
|
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
|
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
|
|
// environment. Older versions of the compiler do not generate a kernel
|
|
// launch environment.
|
|
if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
|
|
Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
|
|
return nullptr;
|
|
|
|
if (!KernelEnvironment.Configuration.ReductionDataSize ||
|
|
!KernelEnvironment.Configuration.ReductionBufferLength)
|
|
return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);
|
|
|
|
// TODO: Check if the kernel needs a launch environment.
|
|
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
|
|
/*HostPtr=*/nullptr,
|
|
TargetAllocTy::TARGET_ALLOC_DEVICE);
|
|
if (!AllocOrErr)
|
|
return AllocOrErr.takeError();
|
|
|
|
// Remember to free the memory later.
|
|
AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
|
|
|
|
/// Use the KLE in the __tgt_async_info to ensure a stable address for the
|
|
/// async data transfer.
|
|
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
|
|
LocalKLE = KernelLaunchEnvironment;
|
|
{
|
|
auto AllocOrErr = GenericDevice.dataAlloc(
|
|
KernelEnvironment.Configuration.ReductionDataSize *
|
|
KernelEnvironment.Configuration.ReductionBufferLength,
|
|
/*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
|
|
if (!AllocOrErr)
|
|
return AllocOrErr.takeError();
|
|
LocalKLE.ReductionBuffer = *AllocOrErr;
|
|
// Remember to free the memory later.
|
|
AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
|
|
}
|
|
|
|
INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(),
|
|
"Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
|
|
", Size=%" PRId64 ", Name=KernelLaunchEnv\n",
|
|
DPxPTR(&LocalKLE), DPxPTR(*AllocOrErr),
|
|
sizeof(KernelLaunchEnvironmentTy));
|
|
|
|
auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE,
|
|
sizeof(KernelLaunchEnvironmentTy),
|
|
AsyncInfoWrapper);
|
|
if (Err)
|
|
return Err;
|
|
return static_cast<KernelLaunchEnvironmentTy *>(*AllocOrErr);
|
|
}
|
|
|
|
Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice,
|
|
KernelArgsTy &KernelArgs,
|
|
uint32_t NumThreads[3],
|
|
uint32_t NumBlocks[3]) const {
|
|
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
|
|
"Launching kernel %s with [%u,%u,%u] blocks and [%u,%u,%u] threads in "
|
|
"%s mode\n",
|
|
getName(), NumBlocks[0], NumBlocks[1], NumBlocks[2], NumThreads[0],
|
|
NumThreads[1], NumThreads[2], getExecutionModeName());
|
|
return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads,
|
|
NumBlocks);
|
|
}
|
|
|
|
Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
|
|
KernelArgsTy &KernelArgs,
|
|
uint32_t NumThreads[3],
|
|
uint32_t NumBlocks[3]) const {
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
|
ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
|
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
|
llvm::SmallVector<void *, 16> Args;
|
|
llvm::SmallVector<void *, 16> Ptrs;
|
|
|
|
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
|
|
GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
|
|
if (!KernelLaunchEnvOrErr)
|
|
return KernelLaunchEnvOrErr.takeError();
|
|
|
|
KernelLaunchParamsTy LaunchParams;
|
|
|
|
// Kernel languages don't use indirection.
|
|
if (KernelArgs.Flags.IsCUDA) {
|
|
LaunchParams =
|
|
*reinterpret_cast<KernelLaunchParamsTy *>(KernelArgs.ArgPtrs);
|
|
} else {
|
|
LaunchParams =
|
|
prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs,
|
|
Args, Ptrs, *KernelLaunchEnvOrErr);
|
|
}
|
|
|
|
uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
|
|
KernelArgs.ThreadLimit[1],
|
|
KernelArgs.ThreadLimit[2]};
|
|
uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
|
|
KernelArgs.NumTeams[2]};
|
|
if (!IsBareKernel) {
|
|
NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
|
|
NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
|
|
NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
|
|
}
|
|
|
|
// Record the kernel description after we modified the argument count and num
|
|
// blocks/threads.
|
|
RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
|
|
if (RecordReplay.isRecording()) {
|
|
RecordReplay.saveImage(getName(), getImage());
|
|
RecordReplay.saveKernelInput(getName(), getImage());
|
|
RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs,
|
|
NumBlocks[0], NumThreads[0],
|
|
KernelArgs.Tripcount);
|
|
}
|
|
|
|
if (auto Err =
|
|
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
|
|
return Err;
|
|
|
|
return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
|
|
LaunchParams, AsyncInfoWrapper);
|
|
}
|
|
|
|
KernelLaunchParamsTy GenericKernelTy::prepareArgs(
|
|
GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
|
|
uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
|
|
llvm::SmallVectorImpl<void *> &Ptrs,
|
|
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const {
|
|
uint32_t KLEOffset = !!KernelLaunchEnvironment;
|
|
NumArgs += KLEOffset;
|
|
|
|
if (NumArgs == 0)
|
|
return KernelLaunchParamsTy{};
|
|
|
|
Args.resize(NumArgs);
|
|
Ptrs.resize(NumArgs);
|
|
|
|
if (KernelLaunchEnvironment) {
|
|
Args[0] = KernelLaunchEnvironment;
|
|
Ptrs[0] = &Args[0];
|
|
}
|
|
|
|
for (uint32_t I = KLEOffset; I < NumArgs; ++I) {
|
|
Args[I] =
|
|
(void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
|
|
Ptrs[I] = &Args[I];
|
|
}
|
|
return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
|
|
}
|
|
|
|
uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
|
|
uint32_t ThreadLimitClause[3]) const {
|
|
assert(!IsBareKernel && "bare kernel should not call this function");
|
|
|
|
assert(ThreadLimitClause[1] == 1 && ThreadLimitClause[2] == 1 &&
|
|
"Multi dimensional launch not supported yet.");
|
|
|
|
if (ThreadLimitClause[0] > 0 && isGenericMode())
|
|
ThreadLimitClause[0] += GenericDevice.getWarpSize();
|
|
|
|
return std::min(MaxNumThreads, (ThreadLimitClause[0] > 0)
|
|
? ThreadLimitClause[0]
|
|
: PreferredNumThreads);
|
|
}
|
|
|
|
uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
|
|
uint32_t NumTeamsClause[3],
|
|
uint64_t LoopTripCount,
|
|
uint32_t &NumThreads,
|
|
bool IsNumThreadsFromUser) const {
|
|
assert(!IsBareKernel && "bare kernel should not call this function");
|
|
|
|
assert(NumTeamsClause[1] == 1 && NumTeamsClause[2] == 1 &&
|
|
"Multi dimensional launch not supported yet.");
|
|
|
|
if (NumTeamsClause[0] > 0) {
|
|
// TODO: We need to honor any value and consequently allow more than the
|
|
// block limit. For this we might need to start multiple kernels or let the
|
|
// blocks start again until the requested number has been started.
|
|
return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
|
|
}
|
|
|
|
uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
|
|
uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
|
|
if (LoopTripCount > 0) {
|
|
if (isSPMDMode()) {
|
|
// We have a combined construct, i.e. `target teams distribute
|
|
// parallel for [simd]`. We launch so many teams so that each thread
|
|
// will execute one iteration of the loop; rounded up to the nearest
|
|
// integer. However, if that results in too few teams, we artificially
|
|
// reduce the thread count per team to increase the outer parallelism.
|
|
auto MinThreads = GenericDevice.getMinThreadsForLowTripCountLoop();
|
|
MinThreads = std::min(MinThreads, NumThreads);
|
|
|
|
// Honor the thread_limit clause; only lower the number of threads.
|
|
[[maybe_unused]] auto OldNumThreads = NumThreads;
|
|
if (LoopTripCount >= DefaultNumBlocks * NumThreads ||
|
|
IsNumThreadsFromUser) {
|
|
// Enough parallelism for teams and threads.
|
|
TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
|
|
assert(IsNumThreadsFromUser ||
|
|
TripCountNumBlocks >= DefaultNumBlocks &&
|
|
"Expected sufficient outer parallelism.");
|
|
} else if (LoopTripCount >= DefaultNumBlocks * MinThreads) {
|
|
// Enough parallelism for teams, limit threads.
|
|
|
|
// This case is hard; for now, we force "full warps":
|
|
// First, compute a thread count assuming DefaultNumBlocks.
|
|
auto NumThreadsDefaultBlocks =
|
|
(LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks;
|
|
// Now get a power of two that is larger or equal.
|
|
auto NumThreadsDefaultBlocksP2 =
|
|
llvm::PowerOf2Ceil(NumThreadsDefaultBlocks);
|
|
// Do not increase a thread limit given be the user.
|
|
NumThreads = std::min(NumThreads, uint32_t(NumThreadsDefaultBlocksP2));
|
|
assert(NumThreads >= MinThreads &&
|
|
"Expected sufficient inner parallelism.");
|
|
TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
|
|
} else {
|
|
// Not enough parallelism for teams and threads, limit both.
|
|
NumThreads = std::min(NumThreads, MinThreads);
|
|
TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
|
|
}
|
|
|
|
assert(NumThreads * TripCountNumBlocks >= LoopTripCount &&
|
|
"Expected sufficient parallelism");
|
|
assert(OldNumThreads >= NumThreads &&
|
|
"Number of threads cannot be increased!");
|
|
} else {
|
|
assert((isGenericMode() || isGenericSPMDMode()) &&
|
|
"Unexpected execution mode!");
|
|
// If we reach this point, then we have a non-combined construct, i.e.
|
|
// `teams distribute` with a nested `parallel for` and each team is
|
|
// assigned one iteration of the `distribute` loop. E.g.:
|
|
//
|
|
// #pragma omp target teams distribute
|
|
// for(...loop_tripcount...) {
|
|
// #pragma omp parallel for
|
|
// for(...) {}
|
|
// }
|
|
//
|
|
// Threads within a team will execute the iterations of the `parallel`
|
|
// loop.
|
|
TripCountNumBlocks = LoopTripCount;
|
|
}
|
|
}
|
|
|
|
uint32_t PreferredNumBlocks = TripCountNumBlocks;
|
|
// If the loops are long running we rather reuse blocks than spawn too many.
|
|
if (GenericDevice.getReuseBlocksForHighTripCount())
|
|
PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
|
|
return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
|
|
}
|
|
|
|
GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
|
|
int32_t NumDevices,
|
|
const llvm::omp::GV &OMPGridValues)
|
|
: Plugin(Plugin), MemoryManager(nullptr), OMP_TeamLimit("OMP_TEAM_LIMIT"),
|
|
OMP_NumTeams("OMP_NUM_TEAMS"),
|
|
OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
|
|
OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
|
|
OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
|
|
// Do not initialize the following two envars since they depend on the
|
|
// device initialization. These cannot be consulted until the device is
|
|
// initialized correctly. We initialize them in GenericDeviceTy::init().
|
|
OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
|
|
// By default, the initial number of streams and events is 1.
|
|
OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
|
|
OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
|
|
DeviceId(DeviceId), GridValues(OMPGridValues),
|
|
PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
|
|
PinnedAllocs(*this), RPCServer(nullptr) {
|
|
#ifdef OMPT_SUPPORT
|
|
OmptInitialized.store(false);
|
|
// Bind the callbacks to this device's member functions
|
|
#define bindOmptCallback(Name, Type, Code) \
|
|
if (ompt::Initialized && ompt::lookupCallbackByCode) { \
|
|
ompt::lookupCallbackByCode((ompt_callbacks_t)(Code), \
|
|
((ompt_callback_t *)&(Name##_fn))); \
|
|
DP("OMPT: class bound %s=%p\n", #Name, ((void *)(uint64_t)Name##_fn)); \
|
|
}
|
|
|
|
FOREACH_OMPT_DEVICE_EVENT(bindOmptCallback);
|
|
#undef bindOmptCallback
|
|
|
|
#endif
|
|
}
|
|
|
|
Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
|
|
if (auto Err = initImpl(Plugin))
|
|
return Err;
|
|
|
|
#ifdef OMPT_SUPPORT
|
|
if (ompt::Initialized) {
|
|
bool ExpectedStatus = false;
|
|
if (OmptInitialized.compare_exchange_strong(ExpectedStatus, true))
|
|
performOmptCallback(device_initialize, Plugin.getUserId(DeviceId),
|
|
/*type=*/getComputeUnitKind().c_str(),
|
|
/*device=*/reinterpret_cast<ompt_device_t *>(this),
|
|
/*lookup=*/ompt::lookupCallbackByName,
|
|
/*documentation=*/nullptr);
|
|
}
|
|
#endif
|
|
|
|
// Read and reinitialize the envars that depend on the device initialization.
|
|
// Notice these two envars may change the stack size and heap size of the
|
|
// device, so they need the device properly initialized.
|
|
auto StackSizeEnvarOrErr = UInt64Envar::create(
|
|
"LIBOMPTARGET_STACK_SIZE",
|
|
[this](uint64_t &V) -> Error { return getDeviceStackSize(V); },
|
|
[this](uint64_t V) -> Error { return setDeviceStackSize(V); });
|
|
if (!StackSizeEnvarOrErr)
|
|
return StackSizeEnvarOrErr.takeError();
|
|
OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr);
|
|
|
|
auto HeapSizeEnvarOrErr = UInt64Envar::create(
|
|
"LIBOMPTARGET_HEAP_SIZE",
|
|
[this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
|
|
[this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
|
|
if (!HeapSizeEnvarOrErr)
|
|
return HeapSizeEnvarOrErr.takeError();
|
|
OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);
|
|
|
|
// Update the maximum number of teams and threads after the device
|
|
// initialization sets the corresponding hardware limit.
|
|
if (OMP_NumTeams > 0)
|
|
GridValues.GV_Max_Teams =
|
|
std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams));
|
|
|
|
if (OMP_TeamsThreadLimit > 0)
|
|
GridValues.GV_Max_WG_Size =
|
|
std::min(GridValues.GV_Max_WG_Size, uint32_t(OMP_TeamsThreadLimit));
|
|
|
|
// Enable the memory manager if required.
|
|
auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
|
|
if (EnableMM)
|
|
MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
|
|
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
|
|
for (DeviceImageTy *Image : LoadedImages)
|
|
if (auto Err = callGlobalDestructors(Plugin, *Image))
|
|
return Err;
|
|
|
|
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
|
|
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
|
|
for (auto *Image : LoadedImages) {
|
|
DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
|
|
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
|
|
sizeof(DeviceMemoryPoolTrackingTy),
|
|
&ImageDeviceMemoryPoolTracking);
|
|
if (auto Err =
|
|
GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
|
|
consumeError(std::move(Err));
|
|
continue;
|
|
}
|
|
DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
|
|
}
|
|
|
|
// TODO: Write this by default into a file.
|
|
printf("\n\n|-----------------------\n"
|
|
"| Device memory tracker:\n"
|
|
"|-----------------------\n"
|
|
"| #Allocations: %lu\n"
|
|
"| Byes allocated: %lu\n"
|
|
"| Minimal allocation: %lu\n"
|
|
"| Maximal allocation: %lu\n"
|
|
"|-----------------------\n\n\n",
|
|
DeviceMemoryPoolTracking.NumAllocations,
|
|
DeviceMemoryPoolTracking.AllocationTotal,
|
|
DeviceMemoryPoolTracking.AllocationMin,
|
|
DeviceMemoryPoolTracking.AllocationMax);
|
|
}
|
|
|
|
for (auto *Image : LoadedImages) {
|
|
GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
|
|
if (!Handler.hasProfilingGlobals(*this, *Image))
|
|
continue;
|
|
|
|
GPUProfGlobals profdata;
|
|
auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
|
|
if (!ProfOrErr)
|
|
return ProfOrErr.takeError();
|
|
|
|
// Dump out profdata
|
|
if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
|
|
uint32_t(DeviceDebugKind::PGODump))
|
|
ProfOrErr->dump();
|
|
|
|
// Write data to profiling file
|
|
if (auto Err = ProfOrErr->write())
|
|
return Err;
|
|
}
|
|
|
|
// Delete the memory manager before deinitializing the device. Otherwise,
|
|
// we may delete device allocations after the device is deinitialized.
|
|
if (MemoryManager)
|
|
delete MemoryManager;
|
|
MemoryManager = nullptr;
|
|
|
|
RecordReplayTy &RecordReplay = Plugin.getRecordReplay();
|
|
if (RecordReplay.isRecordingOrReplaying())
|
|
RecordReplay.deinit();
|
|
|
|
if (RPCServer)
|
|
if (auto Err = RPCServer->deinitDevice(*this))
|
|
return Err;
|
|
|
|
#ifdef OMPT_SUPPORT
|
|
if (ompt::Initialized) {
|
|
bool ExpectedStatus = true;
|
|
if (OmptInitialized.compare_exchange_strong(ExpectedStatus, false))
|
|
performOmptCallback(device_finalize, Plugin.getUserId(DeviceId));
|
|
}
|
|
#endif
|
|
|
|
return deinitImpl();
|
|
}
|
|
Expected<DeviceImageTy *>
|
|
GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
|
|
const __tgt_device_image *InputTgtImage) {
|
|
assert(InputTgtImage && "Expected non-null target image");
|
|
DP("Load data from image " DPxMOD "\n", DPxPTR(InputTgtImage->ImageStart));
|
|
|
|
auto PostJITImageOrErr = Plugin.getJIT().process(*InputTgtImage, *this);
|
|
if (!PostJITImageOrErr) {
|
|
auto Err = PostJITImageOrErr.takeError();
|
|
REPORT("Failure to jit IR image %p on device %d: %s\n", InputTgtImage,
|
|
DeviceId, toString(std::move(Err)).data());
|
|
return nullptr;
|
|
}
|
|
|
|
// Load the binary and allocate the image object. Use the next available id
|
|
// for the image id, which is the number of previously loaded images.
|
|
auto ImageOrErr =
|
|
loadBinaryImpl(PostJITImageOrErr.get(), LoadedImages.size());
|
|
if (!ImageOrErr)
|
|
return ImageOrErr.takeError();
|
|
|
|
DeviceImageTy *Image = *ImageOrErr;
|
|
assert(Image != nullptr && "Invalid image");
|
|
if (InputTgtImage != PostJITImageOrErr.get())
|
|
Image->setTgtImageBitcode(InputTgtImage);
|
|
|
|
// Add the image to list.
|
|
LoadedImages.push_back(Image);
|
|
|
|
// Setup the device environment if needed.
|
|
if (auto Err = setupDeviceEnvironment(Plugin, *Image))
|
|
return std::move(Err);
|
|
|
|
// Setup the global device memory pool if needed.
|
|
if (!Plugin.getRecordReplay().isReplaying() &&
|
|
shouldSetupDeviceMemoryPool()) {
|
|
uint64_t HeapSize;
|
|
auto SizeOrErr = getDeviceHeapSize(HeapSize);
|
|
if (SizeOrErr) {
|
|
REPORT("No global device memory pool due to error: %s\n",
|
|
toString(std::move(SizeOrErr)).data());
|
|
} else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
|
|
return std::move(Err);
|
|
}
|
|
|
|
if (auto Err = setupRPCServer(Plugin, *Image))
|
|
return std::move(Err);
|
|
|
|
#ifdef OMPT_SUPPORT
|
|
if (ompt::Initialized) {
|
|
size_t Bytes =
|
|
utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart);
|
|
performOmptCallback(
|
|
device_load, Plugin.getUserId(DeviceId),
|
|
/*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr,
|
|
/*ImgSize=*/Bytes, /*HostAddr=*/InputTgtImage->ImageStart,
|
|
/*DeviceAddr=*/nullptr, /* FIXME: ModuleId */ 0);
|
|
}
|
|
#endif
|
|
|
|
// Call any global constructors present on the device.
|
|
if (auto Err = callGlobalConstructors(Plugin, *Image))
|
|
return std::move(Err);
|
|
|
|
// Return the pointer to the table of entries.
|
|
return Image;
|
|
}
|
|
|
|
Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
|
|
DeviceImageTy &Image) {
|
|
// There are some plugins that do not need this step.
|
|
if (!shouldSetupDeviceEnvironment())
|
|
return Plugin::success();
|
|
|
|
// Obtain a table mapping host function pointers to device function pointers.
|
|
auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image);
|
|
if (!CallTablePairOrErr)
|
|
return CallTablePairOrErr.takeError();
|
|
|
|
DeviceEnvironmentTy DeviceEnvironment;
|
|
DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind;
|
|
DeviceEnvironment.NumDevices = Plugin.getNumDevices();
|
|
// TODO: The device ID used here is not the real device ID used by OpenMP.
|
|
DeviceEnvironment.DeviceNum = DeviceId;
|
|
DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
|
|
DeviceEnvironment.ClockFrequency = getClockFrequency();
|
|
DeviceEnvironment.IndirectCallTable =
|
|
reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
|
|
DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
|
|
DeviceEnvironment.HardwareParallelism = getHardwareParallelism();
|
|
|
|
// Create the metainfo of the device environment global.
|
|
GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
|
|
sizeof(DeviceEnvironmentTy), &DeviceEnvironment);
|
|
|
|
// Write device environment values to the device.
|
|
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
|
|
if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) {
|
|
DP("Missing symbol %s, continue execution anyway.\n",
|
|
DevEnvGlobal.getName().data());
|
|
consumeError(std::move(Err));
|
|
}
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
|
|
DeviceImageTy &Image,
|
|
uint64_t PoolSize) {
|
|
// Free the old pool, if any.
|
|
if (DeviceMemoryPool.Ptr) {
|
|
if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
|
|
TargetAllocTy::TARGET_ALLOC_DEVICE))
|
|
return Err;
|
|
}
|
|
|
|
DeviceMemoryPool.Size = PoolSize;
|
|
auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
|
|
TargetAllocTy::TARGET_ALLOC_DEVICE);
|
|
if (AllocOrErr) {
|
|
DeviceMemoryPool.Ptr = *AllocOrErr;
|
|
} else {
|
|
auto Err = AllocOrErr.takeError();
|
|
REPORT("Failure to allocate device memory for global memory pool: %s\n",
|
|
toString(std::move(Err)).data());
|
|
DeviceMemoryPool.Ptr = nullptr;
|
|
DeviceMemoryPool.Size = 0;
|
|
}
|
|
|
|
// Create the metainfo of the device environment global.
|
|
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
|
|
if (!GHandler.isSymbolInImage(*this, Image,
|
|
"__omp_rtl_device_memory_pool_tracker")) {
|
|
DP("Skip the memory pool as there is no tracker symbol in the image.");
|
|
return Error::success();
|
|
}
|
|
|
|
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
|
|
sizeof(DeviceMemoryPoolTrackingTy),
|
|
&DeviceMemoryPoolTracking);
|
|
if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
|
|
return Err;
|
|
|
|
// Create the metainfo of the device environment global.
|
|
GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
|
|
sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
|
|
|
|
// Write device environment values to the device.
|
|
return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
|
|
}
|
|
|
|
Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
|
|
DeviceImageTy &Image) {
|
|
// The plugin either does not need an RPC server or it is unavailable.
|
|
if (!shouldSetupRPCServer())
|
|
return Plugin::success();
|
|
|
|
// Check if this device needs to run an RPC server.
|
|
RPCServerTy &Server = Plugin.getRPCServer();
|
|
auto UsingOrErr =
|
|
Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image);
|
|
if (!UsingOrErr)
|
|
return UsingOrErr.takeError();
|
|
|
|
if (!UsingOrErr.get())
|
|
return Plugin::success();
|
|
|
|
if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
|
|
return Err;
|
|
|
|
if (auto Err = Server.startThread())
|
|
return Err;
|
|
|
|
RPCServer = &Server;
|
|
DP("Running an RPC server on device %d\n", getDeviceId());
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
|
|
size_t Size, bool ExternallyLocked) {
|
|
// Insert the new entry into the map.
|
|
auto Res = Allocs.insert({HstPtr, DevAccessiblePtr, Size, ExternallyLocked});
|
|
if (!Res.second)
|
|
return Plugin::error("Cannot insert locked buffer entry");
|
|
|
|
// Check whether the next entry overlaps with the inserted entry.
|
|
auto It = std::next(Res.first);
|
|
if (It == Allocs.end())
|
|
return Plugin::success();
|
|
|
|
const EntryTy *NextEntry = &(*It);
|
|
if (intersects(NextEntry->HstPtr, NextEntry->Size, HstPtr, Size))
|
|
return Plugin::error("Partial overlapping not allowed in locked buffers");
|
|
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error PinnedAllocationMapTy::eraseEntry(const EntryTy &Entry) {
|
|
// Erase the existing entry. Notice this requires an additional map lookup,
|
|
// but this should not be a performance issue. Using iterators would make
|
|
// the code more difficult to read.
|
|
size_t Erased = Allocs.erase({Entry.HstPtr});
|
|
if (!Erased)
|
|
return Plugin::error("Cannot erase locked buffer entry");
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error PinnedAllocationMapTy::registerEntryUse(const EntryTy &Entry,
|
|
void *HstPtr, size_t Size) {
|
|
if (!contains(Entry.HstPtr, Entry.Size, HstPtr, Size))
|
|
return Plugin::error("Partial overlapping not allowed in locked buffers");
|
|
|
|
++Entry.References;
|
|
return Plugin::success();
|
|
}
|
|
|
|
Expected<bool> PinnedAllocationMapTy::unregisterEntryUse(const EntryTy &Entry) {
|
|
if (Entry.References == 0)
|
|
return Plugin::error("Invalid number of references");
|
|
|
|
// Return whether this was the last user.
|
|
return (--Entry.References == 0);
|
|
}
|
|
|
|
Error PinnedAllocationMapTy::registerHostBuffer(void *HstPtr,
|
|
void *DevAccessiblePtr,
|
|
size_t Size) {
|
|
assert(HstPtr && "Invalid pointer");
|
|
assert(DevAccessiblePtr && "Invalid pointer");
|
|
assert(Size && "Invalid size");
|
|
|
|
std::lock_guard<std::shared_mutex> Lock(Mutex);
|
|
|
|
// No pinned allocation should intersect.
|
|
const EntryTy *Entry = findIntersecting(HstPtr);
|
|
if (Entry)
|
|
return Plugin::error("Cannot insert entry due to an existing one");
|
|
|
|
// Now insert the new entry.
|
|
return insertEntry(HstPtr, DevAccessiblePtr, Size);
|
|
}
|
|
|
|
Error PinnedAllocationMapTy::unregisterHostBuffer(void *HstPtr) {
|
|
assert(HstPtr && "Invalid pointer");
|
|
|
|
std::lock_guard<std::shared_mutex> Lock(Mutex);
|
|
|
|
const EntryTy *Entry = findIntersecting(HstPtr);
|
|
if (!Entry)
|
|
return Plugin::error("Cannot find locked buffer");
|
|
|
|
// The address in the entry should be the same we are unregistering.
|
|
if (Entry->HstPtr != HstPtr)
|
|
return Plugin::error("Unexpected host pointer in locked buffer entry");
|
|
|
|
// Unregister from the entry.
|
|
auto LastUseOrErr = unregisterEntryUse(*Entry);
|
|
if (!LastUseOrErr)
|
|
return LastUseOrErr.takeError();
|
|
|
|
// There should be no other references to the pinned allocation.
|
|
if (!(*LastUseOrErr))
|
|
return Plugin::error("The locked buffer is still being used");
|
|
|
|
// Erase the entry from the map.
|
|
return eraseEntry(*Entry);
|
|
}
|
|
|
|
Expected<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr,
|
|
size_t Size) {
|
|
assert(HstPtr && "Invalid pointer");
|
|
assert(Size && "Invalid size");
|
|
|
|
std::lock_guard<std::shared_mutex> Lock(Mutex);
|
|
|
|
const EntryTy *Entry = findIntersecting(HstPtr);
|
|
|
|
if (Entry) {
|
|
// An already registered intersecting buffer was found. Register a new use.
|
|
if (auto Err = registerEntryUse(*Entry, HstPtr, Size))
|
|
return std::move(Err);
|
|
|
|
// Return the device accessible pointer with the correct offset.
|
|
return utils::advancePtr(Entry->DevAccessiblePtr,
|
|
utils::getPtrDiff(HstPtr, Entry->HstPtr));
|
|
}
|
|
|
|
// No intersecting registered allocation found in the map. First, lock the
|
|
// host buffer and retrieve the device accessible pointer.
|
|
auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
|
|
if (!DevAccessiblePtrOrErr)
|
|
return DevAccessiblePtrOrErr.takeError();
|
|
|
|
// Now insert the new entry into the map.
|
|
if (auto Err = insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size))
|
|
return std::move(Err);
|
|
|
|
// Return the device accessible pointer.
|
|
return *DevAccessiblePtrOrErr;
|
|
}
|
|
|
|
Error PinnedAllocationMapTy::unlockHostBuffer(void *HstPtr) {
|
|
assert(HstPtr && "Invalid pointer");
|
|
|
|
std::lock_guard<std::shared_mutex> Lock(Mutex);
|
|
|
|
const EntryTy *Entry = findIntersecting(HstPtr);
|
|
if (!Entry)
|
|
return Plugin::error("Cannot find locked buffer");
|
|
|
|
// Unregister from the locked buffer. No need to do anything if there are
|
|
// others using the allocation.
|
|
auto LastUseOrErr = unregisterEntryUse(*Entry);
|
|
if (!LastUseOrErr)
|
|
return LastUseOrErr.takeError();
|
|
|
|
// No need to do anything if there are others using the allocation.
|
|
if (!(*LastUseOrErr))
|
|
return Plugin::success();
|
|
|
|
// This was the last user of the allocation. Unlock the original locked buffer
|
|
// if it was locked by the plugin. Do not unlock it if it was locked by an
|
|
// external entity. Unlock the buffer using the host pointer of the entry.
|
|
if (!Entry->ExternallyLocked)
|
|
if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
|
|
return Err;
|
|
|
|
// Erase the entry from the map.
|
|
return eraseEntry(*Entry);
|
|
}
|
|
|
|
Error PinnedAllocationMapTy::lockMappedHostBuffer(void *HstPtr, size_t Size) {
|
|
assert(HstPtr && "Invalid pointer");
|
|
assert(Size && "Invalid size");
|
|
|
|
std::lock_guard<std::shared_mutex> Lock(Mutex);
|
|
|
|
// If previously registered, just register a new user on the entry.
|
|
const EntryTy *Entry = findIntersecting(HstPtr);
|
|
if (Entry)
|
|
return registerEntryUse(*Entry, HstPtr, Size);
|
|
|
|
size_t BaseSize;
|
|
void *BaseHstPtr, *BaseDevAccessiblePtr;
|
|
|
|
// Check if it was externally pinned by a vendor-specific API.
|
|
auto IsPinnedOrErr = Device.isPinnedPtrImpl(HstPtr, BaseHstPtr,
|
|
BaseDevAccessiblePtr, BaseSize);
|
|
if (!IsPinnedOrErr)
|
|
return IsPinnedOrErr.takeError();
|
|
|
|
// If pinned, just insert the entry representing the whole pinned buffer.
|
|
if (*IsPinnedOrErr)
|
|
return insertEntry(BaseHstPtr, BaseDevAccessiblePtr, BaseSize,
|
|
/*Externallylocked=*/true);
|
|
|
|
// Not externally pinned. Do nothing if locking of mapped buffers is disabled.
|
|
if (!LockMappedBuffers)
|
|
return Plugin::success();
|
|
|
|
// Otherwise, lock the buffer and insert the new entry.
|
|
auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
|
|
if (!DevAccessiblePtrOrErr) {
|
|
// Errors may be tolerated.
|
|
if (!IgnoreLockMappedFailures)
|
|
return DevAccessiblePtrOrErr.takeError();
|
|
|
|
consumeError(DevAccessiblePtrOrErr.takeError());
|
|
return Plugin::success();
|
|
}
|
|
|
|
return insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size);
|
|
}
|
|
|
|
Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
|
|
assert(HstPtr && "Invalid pointer");
|
|
|
|
std::lock_guard<std::shared_mutex> Lock(Mutex);
|
|
|
|
// Check whether there is any intersecting entry.
|
|
const EntryTy *Entry = findIntersecting(HstPtr);
|
|
|
|
// No entry but automatic locking of mapped buffers is disabled, so
|
|
// nothing to do.
|
|
if (!Entry && !LockMappedBuffers)
|
|
return Plugin::success();
|
|
|
|
// No entry, automatic locking is enabled, but the locking may have failed, so
|
|
// do nothing.
|
|
if (!Entry && IgnoreLockMappedFailures)
|
|
return Plugin::success();
|
|
|
|
// No entry, but the automatic locking is enabled, so this is an error.
|
|
if (!Entry)
|
|
return Plugin::error("Locked buffer not found");
|
|
|
|
// There is entry, so unregister a user and check whether it was the last one.
|
|
auto LastUseOrErr = unregisterEntryUse(*Entry);
|
|
if (!LastUseOrErr)
|
|
return LastUseOrErr.takeError();
|
|
|
|
// If it is not the last one, there is nothing to do.
|
|
if (!(*LastUseOrErr))
|
|
return Plugin::success();
|
|
|
|
// Otherwise, if it was the last and the buffer was locked by the plugin,
|
|
// unlock it.
|
|
if (!Entry->ExternallyLocked)
|
|
if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
|
|
return Err;
|
|
|
|
// Finally erase the entry from the map.
|
|
return eraseEntry(*Entry);
|
|
}
|
|
|
|
Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) {
|
|
if (!AsyncInfo || !AsyncInfo->Queue)
|
|
return Plugin::error("Invalid async info queue");
|
|
|
|
if (auto Err = synchronizeImpl(*AsyncInfo))
|
|
return Err;
|
|
|
|
for (auto *Ptr : AsyncInfo->AssociatedAllocations)
|
|
if (auto Err = dataDelete(Ptr, TargetAllocTy::TARGET_ALLOC_DEVICE))
|
|
return Err;
|
|
AsyncInfo->AssociatedAllocations.clear();
|
|
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
|
|
if (!AsyncInfo || !AsyncInfo->Queue)
|
|
return Plugin::error("Invalid async info queue");
|
|
|
|
return queryAsyncImpl(*AsyncInfo);
|
|
}
|
|
|
|
Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
|
|
return Plugin::error("Device does not support VA Management");
|
|
}
|
|
|
|
Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
|
|
return Plugin::error("Device does not support VA Management");
|
|
}
|
|
|
|
Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
|
|
return Plugin::error(
|
|
"Missing getDeviceMemorySize implementation (required by RR-heuristic");
|
|
}
|
|
|
|
Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
|
|
TargetAllocTy Kind) {
|
|
void *Alloc = nullptr;
|
|
|
|
if (Plugin.getRecordReplay().isRecordingOrReplaying())
|
|
return Plugin.getRecordReplay().alloc(Size);
|
|
|
|
switch (Kind) {
|
|
case TARGET_ALLOC_DEFAULT:
|
|
case TARGET_ALLOC_DEVICE_NON_BLOCKING:
|
|
case TARGET_ALLOC_DEVICE:
|
|
if (MemoryManager) {
|
|
Alloc = MemoryManager->allocate(Size, HostPtr);
|
|
if (!Alloc)
|
|
return Plugin::error("Failed to allocate from memory manager");
|
|
break;
|
|
}
|
|
[[fallthrough]];
|
|
case TARGET_ALLOC_HOST:
|
|
case TARGET_ALLOC_SHARED:
|
|
Alloc = allocate(Size, HostPtr, Kind);
|
|
if (!Alloc)
|
|
return Plugin::error("Failed to allocate from device allocator");
|
|
}
|
|
|
|
// Report error if the memory manager or the device allocator did not return
|
|
// any memory buffer.
|
|
if (!Alloc)
|
|
return Plugin::error("Invalid target data allocation kind or requested "
|
|
"allocator not implemented yet");
|
|
|
|
// Register allocated buffer as pinned memory if the type is host memory.
|
|
if (Kind == TARGET_ALLOC_HOST)
|
|
if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
|
|
return std::move(Err);
|
|
|
|
// Keep track of the allocation stack if we track allocation traces.
|
|
if (OMPX_TrackAllocationTraces) {
|
|
std::string StackTrace;
|
|
llvm::raw_string_ostream OS(StackTrace);
|
|
llvm::sys::PrintStackTrace(OS);
|
|
|
|
AllocationTraceInfoTy *ATI = new AllocationTraceInfoTy();
|
|
ATI->AllocationTrace = std::move(StackTrace);
|
|
ATI->DevicePtr = Alloc;
|
|
ATI->HostPtr = HostPtr;
|
|
ATI->Size = Size;
|
|
ATI->Kind = Kind;
|
|
|
|
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
|
|
auto *&MapATI = (*AllocationTraceMap)[Alloc];
|
|
ATI->LastAllocationInfo = MapATI;
|
|
MapATI = ATI;
|
|
}
|
|
|
|
return Alloc;
|
|
}
|
|
|
|
Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
|
|
// Free is a noop when recording or replaying.
|
|
if (Plugin.getRecordReplay().isRecordingOrReplaying())
|
|
return Plugin::success();
|
|
|
|
// Keep track of the deallocation stack if we track allocation traces.
|
|
if (OMPX_TrackAllocationTraces) {
|
|
AllocationTraceInfoTy *ATI = nullptr;
|
|
{
|
|
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
|
|
ATI = (*AllocationTraceMap)[TgtPtr];
|
|
}
|
|
|
|
std::string StackTrace;
|
|
llvm::raw_string_ostream OS(StackTrace);
|
|
llvm::sys::PrintStackTrace(OS);
|
|
|
|
if (!ATI)
|
|
ErrorReporter::reportDeallocationOfNonAllocatedPtr(TgtPtr, Kind, ATI,
|
|
StackTrace);
|
|
|
|
// ATI is not null, thus we can lock it to inspect and modify it further.
|
|
std::lock_guard<std::mutex> LG(ATI->Lock);
|
|
if (!ATI->DeallocationTrace.empty())
|
|
ErrorReporter::reportDeallocationOfDeallocatedPtr(TgtPtr, Kind, ATI,
|
|
StackTrace);
|
|
|
|
if (ATI->Kind != Kind)
|
|
ErrorReporter::reportDeallocationOfWrongPtrKind(TgtPtr, Kind, ATI,
|
|
StackTrace);
|
|
|
|
ATI->DeallocationTrace = StackTrace;
|
|
|
|
#undef DEALLOCATION_ERROR
|
|
}
|
|
|
|
int Res;
|
|
switch (Kind) {
|
|
case TARGET_ALLOC_DEFAULT:
|
|
case TARGET_ALLOC_DEVICE_NON_BLOCKING:
|
|
case TARGET_ALLOC_DEVICE:
|
|
if (MemoryManager) {
|
|
Res = MemoryManager->free(TgtPtr);
|
|
if (Res)
|
|
return Plugin::error(
|
|
"Failure to deallocate device pointer %p via memory manager",
|
|
TgtPtr);
|
|
break;
|
|
}
|
|
[[fallthrough]];
|
|
case TARGET_ALLOC_HOST:
|
|
case TARGET_ALLOC_SHARED:
|
|
Res = free(TgtPtr, Kind);
|
|
if (Res)
|
|
return Plugin::error(
|
|
"Failure to deallocate device pointer %p via device deallocator",
|
|
TgtPtr);
|
|
}
|
|
|
|
// Unregister deallocated pinned memory buffer if the type is host memory.
|
|
if (Kind == TARGET_ALLOC_HOST)
|
|
if (auto Err = PinnedAllocs.unregisterHostBuffer(TgtPtr))
|
|
return Err;
|
|
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
|
|
int64_t Size, __tgt_async_info *AsyncInfo) {
|
|
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
|
|
|
|
auto Err = dataSubmitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
|
|
AsyncInfoWrapper.finalize(Err);
|
|
return Err;
|
|
}
|
|
|
|
Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
|
|
int64_t Size, __tgt_async_info *AsyncInfo) {
|
|
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
|
|
|
|
auto Err = dataRetrieveImpl(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
|
|
AsyncInfoWrapper.finalize(Err);
|
|
return Err;
|
|
}
|
|
|
|
Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
|
|
void *DstPtr, int64_t Size,
|
|
__tgt_async_info *AsyncInfo) {
|
|
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
|
|
|
|
auto Err = dataExchangeImpl(SrcPtr, DstDev, DstPtr, Size, AsyncInfoWrapper);
|
|
AsyncInfoWrapper.finalize(Err);
|
|
return Err;
|
|
}
|
|
|
|
Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
|
|
ptrdiff_t *ArgOffsets,
|
|
KernelArgsTy &KernelArgs,
|
|
__tgt_async_info *AsyncInfo) {
|
|
AsyncInfoWrapperTy AsyncInfoWrapper(
|
|
*this,
|
|
Plugin.getRecordReplay().isRecordingOrReplaying() ? nullptr : AsyncInfo);
|
|
|
|
GenericKernelTy &GenericKernel =
|
|
*reinterpret_cast<GenericKernelTy *>(EntryPtr);
|
|
|
|
{
|
|
std::string StackTrace;
|
|
if (OMPX_TrackNumKernelLaunches) {
|
|
llvm::raw_string_ostream OS(StackTrace);
|
|
llvm::sys::PrintStackTrace(OS);
|
|
}
|
|
|
|
auto KernelTraceInfoRecord = KernelLaunchTraces.getExclusiveAccessor();
|
|
(*KernelTraceInfoRecord)
|
|
.emplace(&GenericKernel, std::move(StackTrace), AsyncInfo);
|
|
}
|
|
|
|
auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
|
|
AsyncInfoWrapper);
|
|
|
|
// 'finalize' here to guarantee next record-replay actions are in-sync
|
|
AsyncInfoWrapper.finalize(Err);
|
|
|
|
RecordReplayTy &RecordReplay = Plugin.getRecordReplay();
|
|
if (RecordReplay.isRecordingOrReplaying() &&
|
|
RecordReplay.isSaveOutputEnabled())
|
|
RecordReplay.saveKernelOutputInfo(GenericKernel.getName());
|
|
|
|
return Err;
|
|
}
|
|
|
|
Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
|
|
assert(AsyncInfoPtr && "Invalid async info");
|
|
|
|
*AsyncInfoPtr = new __tgt_async_info();
|
|
|
|
AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);
|
|
|
|
auto Err = initAsyncInfoImpl(AsyncInfoWrapper);
|
|
AsyncInfoWrapper.finalize(Err);
|
|
return Err;
|
|
}
|
|
|
|
Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
|
|
assert(DeviceInfo && "Invalid device info");
|
|
|
|
return initDeviceInfoImpl(DeviceInfo);
|
|
}
|
|
|
|
Error GenericDeviceTy::printInfo() {
|
|
InfoQueueTy InfoQueue;
|
|
|
|
// Get the vendor-specific info entries describing the device properties.
|
|
if (auto Err = obtainInfoImpl(InfoQueue))
|
|
return Err;
|
|
|
|
// Print all info entries.
|
|
InfoQueue.print();
|
|
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
|
|
return createEventImpl(EventPtrStorage);
|
|
}
|
|
|
|
Error GenericDeviceTy::destroyEvent(void *EventPtr) {
|
|
return destroyEventImpl(EventPtr);
|
|
}
|
|
|
|
Error GenericDeviceTy::recordEvent(void *EventPtr,
|
|
__tgt_async_info *AsyncInfo) {
|
|
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
|
|
|
|
auto Err = recordEventImpl(EventPtr, AsyncInfoWrapper);
|
|
AsyncInfoWrapper.finalize(Err);
|
|
return Err;
|
|
}
|
|
|
|
Error GenericDeviceTy::waitEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
|
|
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
|
|
|
|
auto Err = waitEventImpl(EventPtr, AsyncInfoWrapper);
|
|
AsyncInfoWrapper.finalize(Err);
|
|
return Err;
|
|
}
|
|
|
|
Error GenericDeviceTy::syncEvent(void *EventPtr) {
|
|
return syncEventImpl(EventPtr);
|
|
}
|
|
|
|
bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
|
|
|
|
Error GenericPluginTy::init() {
|
|
if (Initialized)
|
|
return Plugin::success();
|
|
|
|
auto NumDevicesOrErr = initImpl();
|
|
if (!NumDevicesOrErr)
|
|
return NumDevicesOrErr.takeError();
|
|
Initialized = true;
|
|
|
|
NumDevices = *NumDevicesOrErr;
|
|
if (NumDevices == 0)
|
|
return Plugin::success();
|
|
|
|
assert(Devices.size() == 0 && "Plugin already initialized");
|
|
Devices.resize(NumDevices, nullptr);
|
|
|
|
GlobalHandler = createGlobalHandler();
|
|
assert(GlobalHandler && "Invalid global handler");
|
|
|
|
RPCServer = new RPCServerTy(*this);
|
|
assert(RPCServer && "Invalid RPC server");
|
|
|
|
RecordReplay = new RecordReplayTy();
|
|
assert(RecordReplay && "Invalid RR interface");
|
|
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error GenericPluginTy::deinit() {
|
|
assert(Initialized && "Plugin was not initialized!");
|
|
|
|
// Deinitialize all active devices.
|
|
for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
|
|
if (Devices[DeviceId]) {
|
|
if (auto Err = deinitDevice(DeviceId))
|
|
return Err;
|
|
}
|
|
assert(!Devices[DeviceId] && "Device was not deinitialized");
|
|
}
|
|
|
|
// There is no global handler if no device is available.
|
|
if (GlobalHandler)
|
|
delete GlobalHandler;
|
|
|
|
if (RPCServer) {
|
|
if (Error Err = RPCServer->shutDown())
|
|
return Err;
|
|
delete RPCServer;
|
|
}
|
|
|
|
if (RecordReplay)
|
|
delete RecordReplay;
|
|
|
|
// Perform last deinitializations on the plugin.
|
|
if (Error Err = deinitImpl())
|
|
return Err;
|
|
Initialized = false;
|
|
|
|
return Plugin::success();
|
|
}
|
|
|
|
Error GenericPluginTy::initDevice(int32_t DeviceId) {
|
|
assert(!Devices[DeviceId] && "Device already initialized");
|
|
|
|
// Create the device and save the reference.
|
|
GenericDeviceTy *Device = createDevice(*this, DeviceId, NumDevices);
|
|
assert(Device && "Invalid device");
|
|
|
|
// Save the device reference into the list.
|
|
Devices[DeviceId] = Device;
|
|
|
|
// Initialize the device and its resources.
|
|
return Device->init(*this);
|
|
}
|
|
|
|
Error GenericPluginTy::deinitDevice(int32_t DeviceId) {
|
|
// The device may be already deinitialized.
|
|
if (Devices[DeviceId] == nullptr)
|
|
return Plugin::success();
|
|
|
|
// Deinitialize the device and release its resources.
|
|
if (auto Err = Devices[DeviceId]->deinit(*this))
|
|
return Err;
|
|
|
|
// Delete the device and invalidate its reference.
|
|
delete Devices[DeviceId];
|
|
Devices[DeviceId] = nullptr;
|
|
|
|
return Plugin::success();
|
|
}
|
|
|
|
Expected<bool> GenericPluginTy::checkELFImage(StringRef Image) const {
|
|
// First check if this image is a regular ELF file.
|
|
if (!utils::elf::isELF(Image))
|
|
return false;
|
|
|
|
// Check if this image is an ELF with a matching machine value.
|
|
auto MachineOrErr = utils::elf::checkMachine(Image, getMagicElfBits());
|
|
if (!MachineOrErr)
|
|
return MachineOrErr.takeError();
|
|
|
|
return MachineOrErr;
|
|
}
|
|
|
|
Expected<bool> GenericPluginTy::checkBitcodeImage(StringRef Image) const {
|
|
if (identify_magic(Image) != file_magic::bitcode)
|
|
return false;
|
|
|
|
LLVMContext Context;
|
|
auto ModuleOrErr = getLazyBitcodeModule(MemoryBufferRef(Image, ""), Context,
|
|
/*ShouldLazyLoadMetadata=*/true);
|
|
if (!ModuleOrErr)
|
|
return ModuleOrErr.takeError();
|
|
Module &M = **ModuleOrErr;
|
|
|
|
return Triple(M.getTargetTriple()).getArch() == getTripleArch();
|
|
}
|
|
|
|
int32_t GenericPluginTy::is_initialized() const { return Initialized; }
|
|
|
|
int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) {
|
|
StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
|
|
utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));
|
|
|
|
auto HandleError = [&](Error Err) -> bool {
|
|
[[maybe_unused]] std::string ErrStr = toString(std::move(Err));
|
|
DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str());
|
|
return false;
|
|
};
|
|
switch (identify_magic(Buffer)) {
|
|
case file_magic::elf:
|
|
case file_magic::elf_relocatable:
|
|
case file_magic::elf_executable:
|
|
case file_magic::elf_shared_object:
|
|
case file_magic::elf_core: {
|
|
auto MatchOrErr = checkELFImage(Buffer);
|
|
if (Error Err = MatchOrErr.takeError())
|
|
return HandleError(std::move(Err));
|
|
return *MatchOrErr;
|
|
}
|
|
case file_magic::bitcode: {
|
|
auto MatchOrErr = checkBitcodeImage(Buffer);
|
|
if (Error Err = MatchOrErr.takeError())
|
|
return HandleError(std::move(Err));
|
|
return *MatchOrErr;
|
|
}
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId,
|
|
__tgt_device_image *Image) {
|
|
StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
|
|
utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));
|
|
|
|
auto HandleError = [&](Error Err) -> bool {
|
|
[[maybe_unused]] std::string ErrStr = toString(std::move(Err));
|
|
DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str());
|
|
return false;
|
|
};
|
|
switch (identify_magic(Buffer)) {
|
|
case file_magic::elf:
|
|
case file_magic::elf_relocatable:
|
|
case file_magic::elf_executable:
|
|
case file_magic::elf_shared_object:
|
|
case file_magic::elf_core: {
|
|
auto MatchOrErr = checkELFImage(Buffer);
|
|
if (Error Err = MatchOrErr.takeError())
|
|
return HandleError(std::move(Err));
|
|
if (!*MatchOrErr)
|
|
return false;
|
|
|
|
// Perform plugin-dependent checks for the specific architecture if needed.
|
|
auto CompatibleOrErr = isELFCompatible(DeviceId, Buffer);
|
|
if (Error Err = CompatibleOrErr.takeError())
|
|
return HandleError(std::move(Err));
|
|
return *CompatibleOrErr;
|
|
}
|
|
case file_magic::bitcode: {
|
|
auto MatchOrErr = checkBitcodeImage(Buffer);
|
|
if (Error Err = MatchOrErr.takeError())
|
|
return HandleError(std::move(Err));
|
|
return *MatchOrErr;
|
|
}
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
int32_t GenericPluginTy::is_device_initialized(int32_t DeviceId) const {
|
|
return isValidDeviceId(DeviceId) && Devices[DeviceId] != nullptr;
|
|
}
|
|
|
|
int32_t GenericPluginTy::init_device(int32_t DeviceId) {
|
|
auto Err = initDevice(DeviceId);
|
|
if (Err) {
|
|
REPORT("Failure to initialize device %d: %s\n", DeviceId,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::number_of_devices() { return getNumDevices(); }
|
|
|
|
int32_t GenericPluginTy::is_data_exchangable(int32_t SrcDeviceId,
|
|
int32_t DstDeviceId) {
|
|
return isDataExchangable(SrcDeviceId, DstDeviceId);
|
|
}
|
|
|
|
int32_t GenericPluginTy::initialize_record_replay(int32_t DeviceId,
|
|
int64_t MemorySize,
|
|
void *VAddr, bool isRecord,
|
|
bool SaveOutput,
|
|
uint64_t &ReqPtrArgOffset) {
|
|
GenericDeviceTy &Device = getDevice(DeviceId);
|
|
RecordReplayTy::RRStatusTy Status =
|
|
isRecord ? RecordReplayTy::RRStatusTy::RRRecording
|
|
: RecordReplayTy::RRStatusTy::RRReplaying;
|
|
|
|
if (auto Err = RecordReplay->init(&Device, MemorySize, VAddr, Status,
|
|
SaveOutput, ReqPtrArgOffset)) {
|
|
REPORT("WARNING RR did not initialize RR-properly with %lu bytes"
|
|
"(Error: %s)\n",
|
|
MemorySize, toString(std::move(Err)).data());
|
|
RecordReplay->setStatus(RecordReplayTy::RRStatusTy::RRDeactivated);
|
|
|
|
if (!isRecord) {
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
}
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::load_binary(int32_t DeviceId,
|
|
__tgt_device_image *TgtImage,
|
|
__tgt_device_binary *Binary) {
|
|
GenericDeviceTy &Device = getDevice(DeviceId);
|
|
|
|
auto ImageOrErr = Device.loadBinary(*this, TgtImage);
|
|
if (!ImageOrErr) {
|
|
auto Err = ImageOrErr.takeError();
|
|
REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage,
|
|
DeviceId, toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
DeviceImageTy *Image = *ImageOrErr;
|
|
assert(Image != nullptr && "Invalid Image");
|
|
|
|
*Binary = __tgt_device_binary{reinterpret_cast<uint64_t>(Image)};
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
void *GenericPluginTy::data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
|
|
int32_t Kind) {
|
|
auto AllocOrErr =
|
|
getDevice(DeviceId).dataAlloc(Size, HostPtr, (TargetAllocTy)Kind);
|
|
if (!AllocOrErr) {
|
|
auto Err = AllocOrErr.takeError();
|
|
REPORT("Failure to allocate device memory: %s\n",
|
|
toString(std::move(Err)).data());
|
|
return nullptr;
|
|
}
|
|
assert(*AllocOrErr && "Null pointer upon successful allocation");
|
|
|
|
return *AllocOrErr;
|
|
}
|
|
|
|
int32_t GenericPluginTy::data_delete(int32_t DeviceId, void *TgtPtr,
|
|
int32_t Kind) {
|
|
auto Err =
|
|
getDevice(DeviceId).dataDelete(TgtPtr, static_cast<TargetAllocTy>(Kind));
|
|
if (Err) {
|
|
REPORT("Failure to deallocate device pointer %p: %s\n", TgtPtr,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
|
|
void **LockedPtr) {
|
|
auto LockedPtrOrErr = getDevice(DeviceId).dataLock(Ptr, Size);
|
|
if (!LockedPtrOrErr) {
|
|
auto Err = LockedPtrOrErr.takeError();
|
|
REPORT("Failure to lock memory %p: %s\n", Ptr,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
if (!(*LockedPtrOrErr)) {
|
|
REPORT("Failure to lock memory %p: obtained a null locked pointer\n", Ptr);
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
*LockedPtr = *LockedPtrOrErr;
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::data_unlock(int32_t DeviceId, void *Ptr) {
|
|
auto Err = getDevice(DeviceId).dataUnlock(Ptr);
|
|
if (Err) {
|
|
REPORT("Failure to unlock memory %p: %s\n", Ptr,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr,
|
|
int64_t Size) {
|
|
auto Err = getDevice(DeviceId).notifyDataMapped(HstPtr, Size);
|
|
if (Err) {
|
|
REPORT("Failure to notify data mapped %p: %s\n", HstPtr,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr) {
|
|
auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr);
|
|
if (Err) {
|
|
REPORT("Failure to notify data unmapped %p: %s\n", HstPtr,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::data_submit(int32_t DeviceId, void *TgtPtr,
|
|
void *HstPtr, int64_t Size) {
|
|
return data_submit_async(DeviceId, TgtPtr, HstPtr, Size,
|
|
/*AsyncInfoPtr=*/nullptr);
|
|
}
|
|
|
|
int32_t GenericPluginTy::data_submit_async(int32_t DeviceId, void *TgtPtr,
|
|
void *HstPtr, int64_t Size,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
auto Err = getDevice(DeviceId).dataSubmit(TgtPtr, HstPtr, Size, AsyncInfoPtr);
|
|
if (Err) {
|
|
REPORT("Failure to copy data from host to device. Pointers: host "
|
|
"= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
|
|
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::data_retrieve(int32_t DeviceId, void *HstPtr,
|
|
void *TgtPtr, int64_t Size) {
|
|
return data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size,
|
|
/*AsyncInfoPtr=*/nullptr);
|
|
}
|
|
|
|
int32_t GenericPluginTy::data_retrieve_async(int32_t DeviceId, void *HstPtr,
|
|
void *TgtPtr, int64_t Size,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
auto Err =
|
|
getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr, Size, AsyncInfoPtr);
|
|
if (Err) {
|
|
REPORT("Failure to copy data from device to host. Pointers: host "
|
|
"= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
|
|
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::data_exchange(int32_t SrcDeviceId, void *SrcPtr,
|
|
int32_t DstDeviceId, void *DstPtr,
|
|
int64_t Size) {
|
|
return data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr, Size,
|
|
/*AsyncInfoPtr=*/nullptr);
|
|
}
|
|
|
|
int32_t GenericPluginTy::data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
|
|
int DstDeviceId, void *DstPtr,
|
|
int64_t Size,
|
|
__tgt_async_info *AsyncInfo) {
|
|
GenericDeviceTy &SrcDevice = getDevice(SrcDeviceId);
|
|
GenericDeviceTy &DstDevice = getDevice(DstDeviceId);
|
|
auto Err = SrcDevice.dataExchange(SrcPtr, DstDevice, DstPtr, Size, AsyncInfo);
|
|
if (Err) {
|
|
REPORT("Failure to copy data from device (%d) to device (%d). Pointers: "
|
|
"host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
|
|
SrcDeviceId, DstDeviceId, DPxPTR(SrcPtr), DPxPTR(DstPtr), Size,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
|
|
void **TgtArgs, ptrdiff_t *TgtOffsets,
|
|
KernelArgsTy *KernelArgs,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
auto Err = getDevice(DeviceId).launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets,
|
|
*KernelArgs, AsyncInfoPtr);
|
|
if (Err) {
|
|
REPORT("Failure to run target region " DPxMOD " in device %d: %s\n",
|
|
DPxPTR(TgtEntryPtr), DeviceId, toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::synchronize(int32_t DeviceId,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
auto Err = getDevice(DeviceId).synchronize(AsyncInfoPtr);
|
|
if (Err) {
|
|
REPORT("Failure to synchronize stream %p: %s\n", AsyncInfoPtr->Queue,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::query_async(int32_t DeviceId,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
auto Err = getDevice(DeviceId).queryAsync(AsyncInfoPtr);
|
|
if (Err) {
|
|
REPORT("Failure to query stream %p: %s\n", AsyncInfoPtr->Queue,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
void GenericPluginTy::print_device_info(int32_t DeviceId) {
|
|
if (auto Err = getDevice(DeviceId).printInfo())
|
|
REPORT("Failure to print device %d info: %s\n", DeviceId,
|
|
toString(std::move(Err)).data());
|
|
}
|
|
|
|
int32_t GenericPluginTy::create_event(int32_t DeviceId, void **EventPtr) {
|
|
auto Err = getDevice(DeviceId).createEvent(EventPtr);
|
|
if (Err) {
|
|
REPORT("Failure to create event: %s\n", toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::record_event(int32_t DeviceId, void *EventPtr,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
auto Err = getDevice(DeviceId).recordEvent(EventPtr, AsyncInfoPtr);
|
|
if (Err) {
|
|
REPORT("Failure to record event %p: %s\n", EventPtr,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::wait_event(int32_t DeviceId, void *EventPtr,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
auto Err = getDevice(DeviceId).waitEvent(EventPtr, AsyncInfoPtr);
|
|
if (Err) {
|
|
REPORT("Failure to wait event %p: %s\n", EventPtr,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::sync_event(int32_t DeviceId, void *EventPtr) {
|
|
auto Err = getDevice(DeviceId).syncEvent(EventPtr);
|
|
if (Err) {
|
|
REPORT("Failure to synchronize event %p: %s\n", EventPtr,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::destroy_event(int32_t DeviceId, void *EventPtr) {
|
|
auto Err = getDevice(DeviceId).destroyEvent(EventPtr);
|
|
if (Err) {
|
|
REPORT("Failure to destroy event %p: %s\n", EventPtr,
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
void GenericPluginTy::set_info_flag(uint32_t NewInfoLevel) {
|
|
std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
|
|
InfoLevel.store(NewInfoLevel);
|
|
}
|
|
|
|
int32_t GenericPluginTy::init_async_info(int32_t DeviceId,
|
|
__tgt_async_info **AsyncInfoPtr) {
|
|
assert(AsyncInfoPtr && "Invalid async info");
|
|
|
|
auto Err = getDevice(DeviceId).initAsyncInfo(AsyncInfoPtr);
|
|
if (Err) {
|
|
REPORT("Failure to initialize async info at " DPxMOD " on device %d: %s\n",
|
|
DPxPTR(*AsyncInfoPtr), DeviceId, toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::init_device_info(int32_t DeviceId,
|
|
__tgt_device_info *DeviceInfo,
|
|
const char **ErrStr) {
|
|
*ErrStr = "";
|
|
|
|
auto Err = getDevice(DeviceId).initDeviceInfo(DeviceInfo);
|
|
if (Err) {
|
|
REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n",
|
|
DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::set_device_identifier(int32_t UserId,
|
|
int32_t DeviceId) {
|
|
UserDeviceIds[DeviceId] = UserId;
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::use_auto_zero_copy(int32_t DeviceId) {
|
|
return getDevice(DeviceId).useAutoZeroCopy();
|
|
}
|
|
|
|
int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size,
|
|
const char *Name, void **DevicePtr) {
|
|
assert(Binary.handle && "Invalid device binary handle");
|
|
DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);
|
|
|
|
GenericDeviceTy &Device = Image.getDevice();
|
|
|
|
GlobalTy DeviceGlobal(Name, Size);
|
|
GenericGlobalHandlerTy &GHandler = getGlobalHandler();
|
|
if (auto Err =
|
|
GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) {
|
|
REPORT("Failure to look up global address: %s\n",
|
|
toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
*DevicePtr = DeviceGlobal.getPtr();
|
|
assert(DevicePtr && "Invalid device global's address");
|
|
|
|
// Save the loaded globals if we are recording.
|
|
RecordReplayTy &RecordReplay = Device.Plugin.getRecordReplay();
|
|
if (RecordReplay.isRecording())
|
|
RecordReplay.addEntry(Name, Size, *DevicePtr);
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
|
|
const char *Name, void **KernelPtr) {
|
|
assert(Binary.handle && "Invalid device binary handle");
|
|
DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);
|
|
|
|
GenericDeviceTy &Device = Image.getDevice();
|
|
|
|
auto KernelOrErr = Device.constructKernel(Name);
|
|
if (Error Err = KernelOrErr.takeError()) {
|
|
REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
GenericKernelTy &Kernel = *KernelOrErr;
|
|
if (auto Err = Kernel.init(Device, Image)) {
|
|
REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
// Note that this is not the kernel's device address.
|
|
*KernelPtr = &Kernel;
|
|
return OFFLOAD_SUCCESS;
|
|
}
|