Previously we decided to check in files that we generate with tablegen. The justification at the time was that it helped reviewers unfamiliar with `offload-tblgen` see the actual changes to the headers in PRs. After trying it for a while, it's ended up causing some headaches and is also not how tablegen is used elsewhere in LLVM. This changes our use of tablegen to be more conventional. Where possible, files are still clang-formatted, but this is no longer a hard requirement. Because `OffloadErrcodes.inc` is shared with libomptarget it now gets generated in a more appropriate place.
1607 lines
62 KiB
C++
1607 lines
62 KiB
C++
//===- PluginInterface.h - Target independent plugin device interface -----===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
|
|
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
|
|
|
|
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <deque>
|
|
#include <list>
|
|
#include <map>
|
|
#include <shared_mutex>
|
|
#include <vector>
|
|
|
|
#include "ExclusiveAccess.h"
|
|
#include "Shared/APITypes.h"
|
|
#include "Shared/Debug.h"
|
|
#include "Shared/Environment.h"
|
|
#include "Shared/EnvironmentVar.h"
|
|
#include "Shared/Requirements.h"
|
|
#include "Shared/Utils.h"
|
|
|
|
#include "GlobalHandler.h"
|
|
#include "JIT.h"
|
|
#include "MemoryManager.h"
|
|
#include "OffloadError.h"
|
|
#include "RPC.h"
|
|
#include "omptarget.h"
|
|
|
|
#ifdef OMPT_SUPPORT
|
|
#include "omp-tools.h"
|
|
#endif
|
|
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/Frontend/OpenMP/OMPConstants.h"
|
|
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
|
|
#include "llvm/Support/Allocator.h"
|
|
#include "llvm/Support/Error.h"
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
#include "llvm/Support/MemoryBufferRef.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
#include "llvm/TargetParser/Triple.h"
|
|
|
|
namespace llvm {
|
|
namespace omp {
|
|
namespace target {
|
|
|
|
namespace plugin {
|
|
|
|
struct GenericPluginTy;
|
|
struct GenericKernelTy;
|
|
struct GenericDeviceTy;
|
|
struct RecordReplayTy;
|
|
|
|
/// Class that wraps the __tgt_async_info to simply its usage. In case the
|
|
/// object is constructed without a valid __tgt_async_info, the object will use
|
|
/// an internal one and will synchronize the current thread with the pending
|
|
/// operations when calling AsyncInfoWrapperTy::finalize(). This latter function
|
|
/// must be called before destroying the wrapper object.
|
|
struct AsyncInfoWrapperTy {
|
|
AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr);
|
|
|
|
~AsyncInfoWrapperTy() {
|
|
assert(!AsyncInfoPtr && "AsyncInfoWrapperTy not finalized");
|
|
}
|
|
|
|
/// Get the raw __tgt_async_info pointer.
|
|
operator __tgt_async_info *() const { return AsyncInfoPtr; }
|
|
|
|
/// Indicate whether there is queue.
|
|
bool hasQueue() const { return (AsyncInfoPtr->Queue != nullptr); }
|
|
|
|
/// Get the queue.
|
|
template <typename Ty> Ty getQueueAs() {
|
|
static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
|
|
"Queue is not of the same size as target type");
|
|
return static_cast<Ty>(AsyncInfoPtr->Queue);
|
|
}
|
|
|
|
/// Set the queue.
|
|
template <typename Ty> void setQueueAs(Ty Queue) {
|
|
static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
|
|
"Queue is not of the same size as target type");
|
|
assert(!AsyncInfoPtr->Queue && "Overwriting queue");
|
|
AsyncInfoPtr->Queue = Queue;
|
|
}
|
|
|
|
/// Synchronize with the __tgt_async_info's pending operations if it's the
|
|
/// internal async info. The error associated to the asynchronous operations
|
|
/// issued in this queue must be provided in \p Err. This function will update
|
|
/// the error parameter with the result of the synchronization if it was
|
|
/// actually executed. This function must be called before destroying the
|
|
/// object and only once.
|
|
void finalize(Error &Err);
|
|
|
|
/// Register \p Ptr as an associated allocation that is freed after
|
|
/// finalization.
|
|
void freeAllocationAfterSynchronization(void *Ptr) {
|
|
AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
|
|
}
|
|
|
|
private:
|
|
GenericDeviceTy &Device;
|
|
__tgt_async_info LocalAsyncInfo;
|
|
__tgt_async_info *AsyncInfoPtr;
|
|
};
|
|
|
|
/// The information level represents the level of a key-value property in the
|
|
/// info tree print (i.e. indentation). The first level should be the default.
|
|
enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 };
|
|
|
|
/// Class for storing device information and later be printed. An object of this
|
|
/// type acts as a queue of key-value properties. Each property has a key, a
|
|
/// a value, and an optional unit for the value. For printing purposes, the
|
|
/// information can be classified into several levels. These levels are useful
|
|
/// for defining sections and subsections. Thus, each key-value property also
|
|
/// has an additional field indicating to which level belongs to. Notice that
|
|
/// we use the level to determine the indentation of the key-value property at
|
|
/// printing time. See the enum InfoLevelKind for the list of accepted levels.
|
|
class InfoQueueTy {
|
|
public:
|
|
struct InfoQueueEntryTy {
|
|
std::string Key;
|
|
std::string Value;
|
|
std::string Units;
|
|
uint64_t Level;
|
|
};
|
|
|
|
private:
|
|
std::deque<InfoQueueEntryTy> Queue;
|
|
|
|
public:
|
|
/// Add a new info entry to the queue. The entry requires at least a key
|
|
/// string in \p Key. The value in \p Value is optional and can be any type
|
|
/// that is representable as a string. The units in \p Units is optional and
|
|
/// must be a string. The info level is a template parameter that defaults to
|
|
/// the first level (top level).
|
|
template <InfoLevelKind L = InfoLevel1, typename T = std::string>
|
|
void add(const std::string &Key, T Value = T(),
|
|
const std::string &Units = std::string()) {
|
|
assert(!Key.empty() && "Invalid info key");
|
|
|
|
// Convert the value to a string depending on its type.
|
|
if constexpr (std::is_same_v<T, bool>)
|
|
Queue.push_back({Key, Value ? "Yes" : "No", Units, L});
|
|
else if constexpr (std::is_arithmetic_v<T>)
|
|
Queue.push_back({Key, std::to_string(Value), Units, L});
|
|
else
|
|
Queue.push_back({Key, Value, Units, L});
|
|
}
|
|
|
|
const std::deque<InfoQueueEntryTy> &getQueue() const { return Queue; }
|
|
|
|
/// Print all info entries added to the queue.
|
|
void print() const {
|
|
// We print four spances for each level.
|
|
constexpr uint64_t IndentSize = 4;
|
|
|
|
// Find the maximum key length (level + key) to compute the individual
|
|
// indentation of each entry.
|
|
uint64_t MaxKeySize = 0;
|
|
for (const auto &Entry : Queue) {
|
|
uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize;
|
|
if (KeySize > MaxKeySize)
|
|
MaxKeySize = KeySize;
|
|
}
|
|
|
|
// Print all info entries.
|
|
for (const auto &Entry : Queue) {
|
|
// Compute the indentations for the current entry.
|
|
uint64_t KeyIndentSize = Entry.Level * IndentSize;
|
|
uint64_t ValIndentSize =
|
|
MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize;
|
|
|
|
llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key
|
|
<< std::string(ValIndentSize, ' ') << Entry.Value
|
|
<< (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n";
|
|
}
|
|
}
|
|
};
|
|
|
|
/// Class wrapping a __tgt_device_image and its offload entry table on a
|
|
/// specific device. This class is responsible for storing and managing
|
|
/// the offload entries for an image on a device.
|
|
class DeviceImageTy {
|
|
/// Image identifier within the corresponding device. Notice that this id is
|
|
/// not unique between different device; they may overlap.
|
|
int32_t ImageId;
|
|
|
|
/// The pointer to the raw __tgt_device_image.
|
|
const __tgt_device_image *TgtImage;
|
|
const __tgt_device_image *TgtImageBitcode;
|
|
|
|
/// Reference to the device this image is loaded on.
|
|
GenericDeviceTy &Device;
|
|
|
|
/// If this image has any global destructors that much be called.
|
|
/// FIXME: This is only required because we currently have no invariants
|
|
/// towards the lifetime of the underlying image. We should either copy
|
|
/// the image into memory locally or erase the pointers after init.
|
|
bool PendingGlobalDtors;
|
|
|
|
public:
|
|
DeviceImageTy(int32_t Id, GenericDeviceTy &Device,
|
|
const __tgt_device_image *Image)
|
|
: ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device),
|
|
PendingGlobalDtors(false) {
|
|
assert(TgtImage && "Invalid target image");
|
|
}
|
|
|
|
/// Get the image identifier within the device.
|
|
int32_t getId() const { return ImageId; }
|
|
|
|
/// Get the device that this image is loaded onto.
|
|
GenericDeviceTy &getDevice() const { return Device; }
|
|
|
|
/// Get the pointer to the raw __tgt_device_image.
|
|
const __tgt_device_image *getTgtImage() const { return TgtImage; }
|
|
|
|
void setTgtImageBitcode(const __tgt_device_image *TgtImageBitcode) {
|
|
this->TgtImageBitcode = TgtImageBitcode;
|
|
}
|
|
|
|
const __tgt_device_image *getTgtImageBitcode() const {
|
|
return TgtImageBitcode;
|
|
}
|
|
|
|
/// Get the image starting address.
|
|
void *getStart() const { return TgtImage->ImageStart; }
|
|
|
|
/// Get the image size.
|
|
size_t getSize() const {
|
|
return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
|
|
}
|
|
|
|
/// Get a memory buffer reference to the whole image.
|
|
MemoryBufferRef getMemoryBuffer() const {
|
|
return MemoryBufferRef(StringRef((const char *)getStart(), getSize()),
|
|
"Image");
|
|
}
|
|
/// Accessors to the boolean value
|
|
bool setPendingGlobalDtors() { return PendingGlobalDtors = true; }
|
|
bool hasPendingGlobalDtors() const { return PendingGlobalDtors; }
|
|
};
|
|
|
|
/// Class implementing common functionalities of offload kernels. Each plugin
|
|
/// should define the specific kernel class, derive from this generic one, and
|
|
/// implement the necessary virtual function members.
|
|
struct GenericKernelTy {
|
|
/// Construct a kernel with a name and a execution mode.
|
|
GenericKernelTy(const char *Name)
|
|
: Name(Name), PreferredNumThreads(0), MaxNumThreads(0) {}
|
|
|
|
virtual ~GenericKernelTy() {}
|
|
|
|
/// Initialize the kernel object from a specific device.
|
|
Error init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image);
|
|
virtual Error initImpl(GenericDeviceTy &GenericDevice,
|
|
DeviceImageTy &Image) = 0;
|
|
|
|
/// Launch the kernel on the specific device. The device must be the same
|
|
/// one used to initialize the kernel.
|
|
Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
|
ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
|
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
|
|
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
|
|
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
|
KernelArgsTy &KernelArgs,
|
|
KernelLaunchParamsTy LaunchParams,
|
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
|
|
|
|
/// Get the kernel name.
|
|
const char *getName() const { return Name; }
|
|
|
|
/// Get the kernel image.
|
|
DeviceImageTy &getImage() const {
|
|
assert(ImagePtr && "Kernel is not initialized!");
|
|
return *ImagePtr;
|
|
}
|
|
|
|
/// Return the kernel environment object for kernel \p Name.
|
|
const KernelEnvironmentTy &getKernelEnvironmentForKernel() {
|
|
return KernelEnvironment;
|
|
}
|
|
|
|
/// Return a device pointer to a new kernel launch environment.
|
|
Expected<KernelLaunchEnvironmentTy *>
|
|
getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
|
|
AsyncInfoWrapperTy &AsyncInfo) const;
|
|
|
|
/// Indicate whether an execution mode is valid.
|
|
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
|
|
switch (ExecutionMode) {
|
|
case OMP_TGT_EXEC_MODE_BARE:
|
|
case OMP_TGT_EXEC_MODE_SPMD:
|
|
case OMP_TGT_EXEC_MODE_GENERIC:
|
|
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
protected:
|
|
/// Get the execution mode name of the kernel.
|
|
const char *getExecutionModeName() const {
|
|
switch (KernelEnvironment.Configuration.ExecMode) {
|
|
case OMP_TGT_EXEC_MODE_BARE:
|
|
return "BARE";
|
|
case OMP_TGT_EXEC_MODE_SPMD:
|
|
return "SPMD";
|
|
case OMP_TGT_EXEC_MODE_GENERIC:
|
|
return "Generic";
|
|
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
|
|
return "Generic-SPMD";
|
|
}
|
|
llvm_unreachable("Unknown execution mode!");
|
|
}
|
|
|
|
/// Prints generic kernel launch information.
|
|
Error printLaunchInfo(GenericDeviceTy &GenericDevice,
|
|
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
|
|
uint32_t NumBlocks[3]) const;
|
|
|
|
/// Prints plugin-specific kernel launch information after generic kernel
|
|
/// launch information
|
|
virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
|
|
KernelArgsTy &KernelArgs,
|
|
uint32_t NumThreads[3],
|
|
uint32_t NumBlocks[3]) const;
|
|
|
|
private:
|
|
/// Prepare the arguments before launching the kernel.
|
|
KernelLaunchParamsTy
|
|
prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
|
ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
|
|
llvm::SmallVectorImpl<void *> &Args,
|
|
llvm::SmallVectorImpl<void *> &Ptrs,
|
|
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
|
|
|
|
/// Get the number of threads and blocks for the kernel based on the
|
|
/// user-defined threads and block clauses.
|
|
uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
|
|
uint32_t ThreadLimitClause[3]) const;
|
|
|
|
/// The number of threads \p NumThreads can be adjusted by this method.
|
|
/// \p IsNumThreadsFromUser is true is \p NumThreads is defined by user via
|
|
/// thread_limit clause.
|
|
uint32_t getNumBlocks(GenericDeviceTy &GenericDevice,
|
|
uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
|
|
uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
|
|
|
|
/// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
|
|
bool isGenericSPMDMode() const {
|
|
return KernelEnvironment.Configuration.ExecMode ==
|
|
OMP_TGT_EXEC_MODE_GENERIC_SPMD;
|
|
}
|
|
bool isGenericMode() const {
|
|
return KernelEnvironment.Configuration.ExecMode ==
|
|
OMP_TGT_EXEC_MODE_GENERIC;
|
|
}
|
|
bool isSPMDMode() const {
|
|
return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_SPMD;
|
|
}
|
|
bool isBareMode() const {
|
|
return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE;
|
|
}
|
|
|
|
/// The kernel name.
|
|
const char *Name;
|
|
|
|
/// The image that contains this kernel.
|
|
DeviceImageTy *ImagePtr = nullptr;
|
|
|
|
protected:
|
|
/// The preferred number of threads to run the kernel.
|
|
uint32_t PreferredNumThreads;
|
|
|
|
/// The maximum number of threads which the kernel could leverage.
|
|
uint32_t MaxNumThreads;
|
|
|
|
/// The kernel environment, including execution flags.
|
|
KernelEnvironmentTy KernelEnvironment;
|
|
|
|
/// The prototype kernel launch environment.
|
|
KernelLaunchEnvironmentTy KernelLaunchEnvironment;
|
|
};
|
|
|
|
/// Information about an allocation, when it has been allocated, and when/if it
|
|
/// has been deallocated, for error reporting purposes.
|
|
struct AllocationTraceInfoTy {
|
|
|
|
/// The stack trace of the allocation itself.
|
|
std::string AllocationTrace;
|
|
|
|
/// The stack trace of the deallocation, or empty.
|
|
std::string DeallocationTrace;
|
|
|
|
/// The allocated device pointer.
|
|
void *DevicePtr = nullptr;
|
|
|
|
/// The corresponding host pointer (can be null).
|
|
void *HostPtr = nullptr;
|
|
|
|
/// The size of the allocation.
|
|
uint64_t Size = 0;
|
|
|
|
/// The kind of the allocation.
|
|
TargetAllocTy Kind = TargetAllocTy::TARGET_ALLOC_DEFAULT;
|
|
|
|
/// Information about the last allocation at this address, if any.
|
|
AllocationTraceInfoTy *LastAllocationInfo = nullptr;
|
|
|
|
/// Lock to keep accesses race free.
|
|
std::mutex Lock;
|
|
};
|
|
|
|
/// Information about an allocation, when it has been allocated, and when/if it
|
|
/// has been deallocated, for error reporting purposes.
|
|
struct KernelTraceInfoTy {
|
|
|
|
/// The launched kernel.
|
|
GenericKernelTy *Kernel;
|
|
|
|
/// The stack trace of the launch itself.
|
|
std::string LaunchTrace;
|
|
|
|
/// The async info the kernel was launched in.
|
|
__tgt_async_info *AsyncInfo;
|
|
};
|
|
|
|
struct KernelTraceInfoRecordTy {
|
|
KernelTraceInfoRecordTy() { KTIs.fill({}); }
|
|
|
|
/// Return the (maximal) record size.
|
|
auto size() const { return KTIs.size(); }
|
|
|
|
/// Create a new kernel trace info and add it into the record.
|
|
void emplace(GenericKernelTy *Kernel, const std::string &&StackTrace,
|
|
__tgt_async_info *AsyncInfo) {
|
|
KTIs[Idx] = {Kernel, std::move(StackTrace), AsyncInfo};
|
|
Idx = (Idx + 1) % size();
|
|
}
|
|
|
|
/// Return the \p I'th last kernel trace info.
|
|
auto getKernelTraceInfo(int32_t I) const {
|
|
// Note that kernel trace infos "grow forward", so lookup is backwards.
|
|
return KTIs[(Idx - I - 1 + size()) % size()];
|
|
}
|
|
|
|
private:
|
|
std::array<KernelTraceInfoTy, 8> KTIs;
|
|
unsigned Idx = 0;
|
|
};
|
|
|
|
/// Class representing a map of host pinned allocations. We track these pinned
|
|
/// allocations, so memory transfers involving these buffers can be optimized.
|
|
class PinnedAllocationMapTy {
|
|
|
|
/// Struct representing a map entry.
|
|
struct EntryTy {
|
|
/// The host pointer of the pinned allocation.
|
|
void *HstPtr;
|
|
|
|
/// The pointer that devices' driver should use to transfer data from/to the
|
|
/// pinned allocation. In most plugins, this pointer will be the same as the
|
|
/// host pointer above.
|
|
void *DevAccessiblePtr;
|
|
|
|
/// The size of the pinned allocation.
|
|
size_t Size;
|
|
|
|
/// Indicate whether the allocation was locked from outside the plugin, for
|
|
/// instance, from the application. The externally locked allocations are
|
|
/// not unlocked by the plugin when unregistering the last user.
|
|
bool ExternallyLocked;
|
|
|
|
/// The number of references to the pinned allocation. The allocation should
|
|
/// remain pinned and registered to the map until the number of references
|
|
/// becomes zero.
|
|
mutable size_t References;
|
|
|
|
/// Create an entry with the host and device accessible pointers, the buffer
|
|
/// size, and a boolean indicating whether the buffer was locked externally.
|
|
EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size,
|
|
bool ExternallyLocked)
|
|
: HstPtr(HstPtr), DevAccessiblePtr(DevAccessiblePtr), Size(Size),
|
|
ExternallyLocked(ExternallyLocked), References(1) {}
|
|
|
|
/// Utility constructor used for std::set searches.
|
|
EntryTy(void *HstPtr)
|
|
: HstPtr(HstPtr), DevAccessiblePtr(nullptr), Size(0),
|
|
ExternallyLocked(false), References(0) {}
|
|
};
|
|
|
|
/// Comparator of mep entries. Use the host pointer to enforce an order
|
|
/// between entries.
|
|
struct EntryCmpTy {
|
|
bool operator()(const EntryTy &Left, const EntryTy &Right) const {
|
|
return Left.HstPtr < Right.HstPtr;
|
|
}
|
|
};
|
|
|
|
typedef std::set<EntryTy, EntryCmpTy> PinnedAllocSetTy;
|
|
|
|
/// The map of host pinned allocations.
|
|
PinnedAllocSetTy Allocs;
|
|
|
|
/// The mutex to protect accesses to the map.
|
|
mutable std::shared_mutex Mutex;
|
|
|
|
/// Reference to the corresponding device.
|
|
GenericDeviceTy &Device;
|
|
|
|
/// Indicate whether mapped host buffers should be locked automatically.
|
|
bool LockMappedBuffers;
|
|
|
|
/// Indicate whether failures when locking mapped buffers should be ignored.
|
|
bool IgnoreLockMappedFailures;
|
|
|
|
/// Find an allocation that intersects with \p HstPtr pointer. Assume the
|
|
/// map's mutex is acquired.
|
|
const EntryTy *findIntersecting(const void *HstPtr) const {
|
|
if (Allocs.empty())
|
|
return nullptr;
|
|
|
|
// Search the first allocation with starting address that is not less than
|
|
// the buffer address.
|
|
auto It = Allocs.lower_bound({const_cast<void *>(HstPtr)});
|
|
|
|
// Direct match of starting addresses.
|
|
if (It != Allocs.end() && It->HstPtr == HstPtr)
|
|
return &(*It);
|
|
|
|
// Not direct match but may be a previous pinned allocation in the map which
|
|
// contains the buffer. Return false if there is no such a previous
|
|
// allocation.
|
|
if (It == Allocs.begin())
|
|
return nullptr;
|
|
|
|
// Move to the previous pinned allocation.
|
|
--It;
|
|
|
|
// The buffer is not contained in the pinned allocation.
|
|
if (utils::advancePtr(It->HstPtr, It->Size) > HstPtr)
|
|
return &(*It);
|
|
|
|
// None found.
|
|
return nullptr;
|
|
}
|
|
|
|
/// Insert an entry to the map representing a locked buffer. The number of
|
|
/// references is set to one.
|
|
Error insertEntry(void *HstPtr, void *DevAccessiblePtr, size_t Size,
|
|
bool ExternallyLocked = false);
|
|
|
|
/// Erase an existing entry from the map.
|
|
Error eraseEntry(const EntryTy &Entry);
|
|
|
|
/// Register a new user into an entry that represents a locked buffer. Check
|
|
/// also that the registered buffer with \p HstPtr address and \p Size is
|
|
/// actually contained into the entry.
|
|
Error registerEntryUse(const EntryTy &Entry, void *HstPtr, size_t Size);
|
|
|
|
/// Unregister a user from the entry and return whether it is the last user.
|
|
/// If it is the last user, the entry will have to be removed from the map
|
|
/// and unlock the entry's host buffer (if necessary).
|
|
Expected<bool> unregisterEntryUse(const EntryTy &Entry);
|
|
|
|
/// Indicate whether the first range A fully contains the second range B.
|
|
static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
|
|
void *EndA = utils::advancePtr(PtrA, SizeA);
|
|
void *EndB = utils::advancePtr(PtrB, SizeB);
|
|
return (PtrB >= PtrA && EndB <= EndA);
|
|
}
|
|
|
|
/// Indicate whether the first range A intersects with the second range B.
|
|
static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
|
|
void *EndA = utils::advancePtr(PtrA, SizeA);
|
|
void *EndB = utils::advancePtr(PtrB, SizeB);
|
|
return (PtrA < EndB && PtrB < EndA);
|
|
}
|
|
|
|
public:
|
|
/// Create the map of pinned allocations corresponding to a specific device.
|
|
PinnedAllocationMapTy(GenericDeviceTy &Device) : Device(Device) {
|
|
|
|
// Envar that indicates whether mapped host buffers should be locked
|
|
// automatically. The possible values are boolean (on/off) and a special:
|
|
// off: Mapped host buffers are not locked.
|
|
// on: Mapped host buffers are locked in a best-effort approach.
|
|
// Failure to lock the buffers are silent.
|
|
// mandatory: Mapped host buffers are always locked and failures to lock
|
|
// a buffer results in a fatal error.
|
|
StringEnvar OMPX_LockMappedBuffers("LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS",
|
|
"off");
|
|
|
|
bool Enabled;
|
|
if (StringParser::parse(OMPX_LockMappedBuffers.get().data(), Enabled)) {
|
|
// Parsed as a boolean value. Enable the feature if necessary.
|
|
LockMappedBuffers = Enabled;
|
|
IgnoreLockMappedFailures = true;
|
|
} else if (OMPX_LockMappedBuffers.get() == "mandatory") {
|
|
// Enable the feature and failures are fatal.
|
|
LockMappedBuffers = true;
|
|
IgnoreLockMappedFailures = false;
|
|
} else {
|
|
// Disable by default.
|
|
DP("Invalid value LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS=%s\n",
|
|
OMPX_LockMappedBuffers.get().data());
|
|
LockMappedBuffers = false;
|
|
}
|
|
}
|
|
|
|
/// Register a buffer that was recently allocated as a locked host buffer.
|
|
/// None of the already registered pinned allocations should intersect with
|
|
/// this new one. The registration requires the host pointer in \p HstPtr,
|
|
/// the device accessible pointer in \p DevAccessiblePtr, and the size of the
|
|
/// allocation in \p Size. The allocation must be unregistered using the
|
|
/// unregisterHostBuffer function.
|
|
Error registerHostBuffer(void *HstPtr, void *DevAccessiblePtr, size_t Size);
|
|
|
|
/// Unregister a host pinned allocation passing the host pointer which was
|
|
/// previously registered using the registerHostBuffer function. When calling
|
|
/// this function, the pinned allocation cannot have any other user and will
|
|
/// not be unlocked by this function.
|
|
Error unregisterHostBuffer(void *HstPtr);
|
|
|
|
/// Lock the host buffer at \p HstPtr or register a new user if it intersects
|
|
/// with an already existing one. A partial overlapping with extension is not
|
|
/// allowed. The function returns the device accessible pointer of the pinned
|
|
/// buffer. The buffer must be unlocked using the unlockHostBuffer function.
|
|
Expected<void *> lockHostBuffer(void *HstPtr, size_t Size);
|
|
|
|
/// Unlock the host buffer at \p HstPtr or unregister a user if other users
|
|
/// are still using the pinned allocation. If this was the last user, the
|
|
/// pinned allocation is removed from the map and the memory is unlocked.
|
|
Error unlockHostBuffer(void *HstPtr);
|
|
|
|
/// Lock or register a host buffer that was recently mapped by libomptarget.
|
|
/// This behavior is applied if LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS is
|
|
/// enabled. Even if not enabled, externally locked buffers are registered
|
|
/// in order to optimize their transfers.
|
|
Error lockMappedHostBuffer(void *HstPtr, size_t Size);
|
|
|
|
/// Unlock or unregister a host buffer that was unmapped by libomptarget.
|
|
Error unlockUnmappedHostBuffer(void *HstPtr);
|
|
|
|
/// Return the device accessible pointer associated to the host pinned
|
|
/// allocation which the \p HstPtr belongs, if any. Return null in case the
|
|
/// \p HstPtr does not belong to any host pinned allocation. The device
|
|
/// accessible pointer is the one that devices should use for data transfers
|
|
/// that involve a host pinned buffer.
|
|
void *getDeviceAccessiblePtrFromPinnedBuffer(const void *HstPtr) const {
|
|
std::shared_lock<std::shared_mutex> Lock(Mutex);
|
|
|
|
// Find the intersecting allocation if any.
|
|
const EntryTy *Entry = findIntersecting(HstPtr);
|
|
if (!Entry)
|
|
return nullptr;
|
|
|
|
return utils::advancePtr(Entry->DevAccessiblePtr,
|
|
utils::getPtrDiff(HstPtr, Entry->HstPtr));
|
|
}
|
|
|
|
/// Check whether a buffer belongs to a registered host pinned allocation.
|
|
bool isHostPinnedBuffer(const void *HstPtr) const {
|
|
std::shared_lock<std::shared_mutex> Lock(Mutex);
|
|
|
|
// Return whether there is an intersecting allocation.
|
|
return (findIntersecting(const_cast<void *>(HstPtr)) != nullptr);
|
|
}
|
|
};
|
|
|
|
/// Class implementing common functionalities of offload devices. Each plugin
|
|
/// should define the specific device class, derive from this generic one, and
|
|
/// implement the necessary virtual function members.
|
|
struct GenericDeviceTy : public DeviceAllocatorTy {
|
|
/// Construct a device with its device id within the plugin, the number of
|
|
/// devices in the plugin and the grid values for that kind of device.
|
|
GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
|
|
const llvm::omp::GV &GridValues);
|
|
|
|
/// Get the device identifier within the corresponding plugin. Notice that
|
|
/// this id is not unique between different plugins; they may overlap.
|
|
int32_t getDeviceId() const { return DeviceId; }
|
|
|
|
/// Set the context of the device if needed, before calling device-specific
|
|
/// functions. Plugins may implement this function as a no-op if not needed.
|
|
virtual Error setContext() = 0;
|
|
|
|
/// Initialize the device. After this call, the device should be already
|
|
/// working and ready to accept queries or modifications.
|
|
Error init(GenericPluginTy &Plugin);
|
|
virtual Error initImpl(GenericPluginTy &Plugin) = 0;
|
|
|
|
/// Deinitialize the device and free all its resources. After this call, the
|
|
/// device is no longer considered ready, so no queries or modifications are
|
|
/// allowed.
|
|
Error deinit(GenericPluginTy &Plugin);
|
|
virtual Error deinitImpl() = 0;
|
|
|
|
/// Load the binary image into the device and return the target table.
|
|
Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin,
|
|
const __tgt_device_image *TgtImage);
|
|
virtual Expected<DeviceImageTy *>
|
|
loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
|
|
|
|
/// Setup the device environment if needed. Notice this setup may not be run
|
|
/// on some plugins. By default, it will be executed, but plugins can change
|
|
/// this behavior by overriding the shouldSetupDeviceEnvironment function.
|
|
Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
|
|
|
|
/// Setup the global device memory pool, if the plugin requires one.
|
|
Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
|
|
uint64_t PoolSize);
|
|
|
|
// Setup the RPC server for this device if needed. This may not run on some
|
|
// plugins like the CPU targets. By default, it will not be executed so it is
|
|
// up to the target to override this using the shouldSetupRPCServer function.
|
|
Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
|
|
|
|
/// Synchronize the current thread with the pending operations on the
|
|
/// __tgt_async_info structure.
|
|
Error synchronize(__tgt_async_info *AsyncInfo);
|
|
virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
|
|
|
|
/// Invokes any global constructors on the device if present and is required
|
|
/// by the target.
|
|
virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
|
|
DeviceImageTy &Image) {
|
|
return Error::success();
|
|
}
|
|
|
|
/// Invokes any global destructors on the device if present and is required
|
|
/// by the target.
|
|
virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
|
|
DeviceImageTy &Image) {
|
|
return Error::success();
|
|
}
|
|
|
|
/// Query for the completion of the pending operations on the __tgt_async_info
|
|
/// structure in a non-blocking manner.
|
|
Error queryAsync(__tgt_async_info *AsyncInfo);
|
|
virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0;
|
|
|
|
/// Check whether the architecture supports VA management
|
|
virtual bool supportVAManagement() const { return false; }
|
|
|
|
/// Get the total device memory size
|
|
virtual Error getDeviceMemorySize(uint64_t &DSize);
|
|
|
|
/// Allocates \p RSize bytes (rounded up to page size) and hints the driver to
|
|
/// map it to \p VAddr. The obtained address is stored in \p Addr. At return
|
|
/// \p RSize contains the actual size which can be equal or larger than the
|
|
/// requested size.
|
|
virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize);
|
|
|
|
/// De-allocates device memory and unmaps the virtual address \p VAddr
|
|
virtual Error memoryVAUnMap(void *VAddr, size_t Size);
|
|
|
|
/// Allocate data on the device or involving the device.
|
|
Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);
|
|
|
|
/// Deallocate data from the device or involving the device.
|
|
Error dataDelete(void *TgtPtr, TargetAllocTy Kind);
|
|
|
|
/// Pin host memory to optimize transfers and return the device accessible
|
|
/// pointer that devices should use for memory transfers involving the host
|
|
/// pinned allocation.
|
|
Expected<void *> dataLock(void *HstPtr, int64_t Size) {
|
|
return PinnedAllocs.lockHostBuffer(HstPtr, Size);
|
|
}
|
|
|
|
/// Unpin a host memory buffer that was previously pinned.
|
|
Error dataUnlock(void *HstPtr) {
|
|
return PinnedAllocs.unlockHostBuffer(HstPtr);
|
|
}
|
|
|
|
/// Lock the host buffer \p HstPtr with \p Size bytes with the vendor-specific
|
|
/// API and return the device accessible pointer.
|
|
virtual Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) = 0;
|
|
|
|
/// Unlock a previously locked host buffer starting at \p HstPtr.
|
|
virtual Error dataUnlockImpl(void *HstPtr) = 0;
|
|
|
|
/// Mark the host buffer with address \p HstPtr and \p Size bytes as a mapped
|
|
/// buffer. This means that libomptarget created a new mapping of that host
|
|
/// buffer (e.g., because a user OpenMP target map) and the buffer may be used
|
|
/// as source/destination of memory transfers. We can use this information to
|
|
/// lock the host buffer and optimize its memory transfers.
|
|
Error notifyDataMapped(void *HstPtr, int64_t Size) {
|
|
return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size);
|
|
}
|
|
|
|
/// Mark the host buffer with address \p HstPtr as unmapped. This means that
|
|
/// libomptarget removed an existing mapping. If the plugin locked the buffer
|
|
/// in notifyDataMapped, this function should unlock it.
|
|
Error notifyDataUnmapped(void *HstPtr) {
|
|
return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr);
|
|
}
|
|
|
|
/// Check whether the host buffer with address \p HstPtr is pinned by the
|
|
/// underlying vendor-specific runtime (if any). Retrieve the host pointer,
|
|
/// the device accessible pointer and the size of the original pinned buffer.
|
|
virtual Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
|
|
void *&BaseDevAccessiblePtr,
|
|
size_t &BaseSize) const = 0;
|
|
|
|
/// Submit data to the device (host to device transfer).
|
|
Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size,
|
|
__tgt_async_info *AsyncInfo);
|
|
virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
|
|
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
|
|
|
|
/// Retrieve data from the device (device to host transfer).
|
|
Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
|
|
__tgt_async_info *AsyncInfo);
|
|
virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
|
|
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
|
|
|
|
/// Exchange data between devices (device to device transfer). Calling this
|
|
/// function is only valid if GenericPlugin::isDataExchangable() passing the
|
|
/// two devices returns true.
|
|
Error dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr,
|
|
int64_t Size, __tgt_async_info *AsyncInfo);
|
|
virtual Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
|
|
void *DstPtr, int64_t Size,
|
|
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
|
|
|
|
/// Run the kernel associated with \p EntryPtr
|
|
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
|
|
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
|
|
|
|
/// Initialize a __tgt_async_info structure. Related to interop features.
|
|
Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr);
|
|
virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
|
|
|
|
/// Initialize a __tgt_device_info structure. Related to interop features.
|
|
Error initDeviceInfo(__tgt_device_info *DeviceInfo);
|
|
virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0;
|
|
|
|
/// Create an event.
|
|
Error createEvent(void **EventPtrStorage);
|
|
virtual Error createEventImpl(void **EventPtrStorage) = 0;
|
|
|
|
/// Destroy an event.
|
|
Error destroyEvent(void *Event);
|
|
virtual Error destroyEventImpl(void *EventPtr) = 0;
|
|
|
|
/// Start the recording of the event.
|
|
Error recordEvent(void *Event, __tgt_async_info *AsyncInfo);
|
|
virtual Error recordEventImpl(void *EventPtr,
|
|
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
|
|
|
|
/// Wait for an event to finish. Notice this wait is asynchronous if the
|
|
/// __tgt_async_info is not nullptr.
|
|
Error waitEvent(void *Event, __tgt_async_info *AsyncInfo);
|
|
virtual Error waitEventImpl(void *EventPtr,
|
|
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
|
|
|
|
/// Synchronize the current thread with the event.
|
|
Error syncEvent(void *EventPtr);
|
|
virtual Error syncEventImpl(void *EventPtr) = 0;
|
|
|
|
/// Print information about the device.
|
|
Error printInfo();
|
|
virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0;
|
|
|
|
/// Getters of the grid values.
|
|
uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
|
|
uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; }
|
|
uint32_t getBlockLimit() const { return GridValues.GV_Max_Teams; }
|
|
uint32_t getDefaultNumThreads() const {
|
|
return GridValues.GV_Default_WG_Size;
|
|
}
|
|
uint32_t getDefaultNumBlocks() const {
|
|
return GridValues.GV_Default_Num_Teams;
|
|
}
|
|
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
|
|
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
|
|
|
|
/// Get target compute unit kind (e.g., sm_80, or gfx908).
|
|
virtual std::string getComputeUnitKind() const { return "unknown"; }
|
|
|
|
/// Post processing after jit backend. The ownership of \p MB will be taken.
|
|
virtual Expected<std::unique_ptr<MemoryBuffer>>
|
|
doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const {
|
|
return std::move(MB);
|
|
}
|
|
|
|
/// The minimum number of threads we use for a low-trip count combined loop.
|
|
/// Instead of using more threads we increase the outer (block/team)
|
|
/// parallelism.
|
|
/// @see OMPX_MinThreadsForLowTripCount
|
|
virtual uint32_t getMinThreadsForLowTripCountLoop() {
|
|
return OMPX_MinThreadsForLowTripCount;
|
|
}
|
|
|
|
/// Whether or not to reuse blocks for high trip count loops.
|
|
/// @see OMPX_ReuseBlocksForHighTripCount
|
|
bool getReuseBlocksForHighTripCount() {
|
|
return OMPX_ReuseBlocksForHighTripCount;
|
|
}
|
|
|
|
/// Get the total amount of hardware parallelism supported by the target
|
|
/// device. This is the total amount of warps or wavefronts that can be
|
|
/// resident on the device simultaneously.
|
|
virtual uint64_t getHardwareParallelism() const { return 0; }
|
|
|
|
/// Get the RPC server running on this device.
|
|
RPCServerTy *getRPCServer() const { return RPCServer; }
|
|
|
|
/// The number of parallel RPC ports to use on the device. In general, this
|
|
/// should be roughly equivalent to the amount of hardware parallelism the
|
|
/// device can support. This is because GPUs in general do not have forward
|
|
/// progress guarantees, so we minimize thread level dependencies by
|
|
/// allocating enough space such that each device thread can have a port. This
|
|
/// is likely overly pessimistic in the average case, but guarantees no
|
|
/// deadlocks at the cost of memory. This must be overloaded by targets
|
|
/// expecting to use the RPC server.
|
|
virtual uint64_t requestedRPCPortCount() const {
|
|
assert(!shouldSetupRPCServer() && "Default implementation cannot be used");
|
|
return 0;
|
|
}
|
|
|
|
virtual Error getDeviceStackSize(uint64_t &V) = 0;
|
|
|
|
/// Returns true if current plugin architecture is an APU
|
|
/// and unified_shared_memory was not requested by the program.
|
|
bool useAutoZeroCopy();
|
|
virtual bool useAutoZeroCopyImpl() { return false; }
|
|
|
|
/// Allocate and construct a kernel object.
|
|
virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
|
|
|
|
/// Reference to the underlying plugin that created this device.
|
|
GenericPluginTy &Plugin;
|
|
|
|
/// Map to record when allocations have been performed, and when they have
|
|
/// been deallocated, both for error reporting purposes.
|
|
ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;
|
|
|
|
/// Return the allocation trace info for a device pointer, that is the
|
|
/// allocation into which this device pointer points to (or pointed into).
|
|
AllocationTraceInfoTy *getAllocationTraceInfoForAddr(void *DevicePtr) {
|
|
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
|
|
for (auto &It : *AllocationTraceMap) {
|
|
if (It.first <= DevicePtr &&
|
|
utils::advancePtr(It.first, It.second->Size) > DevicePtr)
|
|
return It.second;
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
/// Return the allocation trace info for a device pointer, that is the
|
|
/// allocation into which this device pointer points to (or pointed into).
|
|
AllocationTraceInfoTy *
|
|
getClosestAllocationTraceInfoForAddr(void *DevicePtr, uintptr_t &Distance) {
|
|
Distance = 0;
|
|
if (auto *ATI = getAllocationTraceInfoForAddr(DevicePtr)) {
|
|
return ATI;
|
|
}
|
|
|
|
AllocationTraceInfoTy *ATI = nullptr;
|
|
uintptr_t DevicePtrI = uintptr_t(DevicePtr);
|
|
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
|
|
for (auto &It : *AllocationTraceMap) {
|
|
uintptr_t Begin = uintptr_t(It.second->DevicePtr);
|
|
uintptr_t End = Begin + It.second->Size - 1;
|
|
uintptr_t ItDistance = std::min(Begin - DevicePtrI, DevicePtrI - End);
|
|
if (ATI && ItDistance > Distance)
|
|
continue;
|
|
ATI = It.second;
|
|
Distance = ItDistance;
|
|
}
|
|
return ATI;
|
|
}
|
|
|
|
/// Map to record kernel have been launchedl, for error reporting purposes.
|
|
ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;
|
|
|
|
/// Environment variable to determine if stack traces for kernel launches are
|
|
/// tracked.
|
|
UInt32Envar OMPX_TrackNumKernelLaunches =
|
|
UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
|
|
|
|
/// Environment variable to determine if stack traces for allocations and
|
|
/// deallocations are tracked.
|
|
BoolEnvar OMPX_TrackAllocationTraces =
|
|
BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);
|
|
|
|
private:
|
|
/// Get and set the stack size and heap size for the device. If not used, the
|
|
/// plugin can implement the setters as no-op and setting the output
|
|
/// value to zero for the getters.
|
|
virtual Error setDeviceStackSize(uint64_t V) = 0;
|
|
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
|
|
virtual Error setDeviceHeapSize(uint64_t V) = 0;
|
|
|
|
/// Indicate whether the device should setup the device environment. Notice
|
|
/// that returning false in this function will change the behavior of the
|
|
/// setupDeviceEnvironment() function.
|
|
virtual bool shouldSetupDeviceEnvironment() const { return true; }
|
|
|
|
/// Indicate whether the device should setup the global device memory pool. If
|
|
/// false is return the value on the device will be uninitialized.
|
|
virtual bool shouldSetupDeviceMemoryPool() const { return true; }
|
|
|
|
/// Indicate whether or not the device should setup the RPC server. This is
|
|
/// only necessary for unhosted targets like the GPU.
|
|
virtual bool shouldSetupRPCServer() const { return false; }
|
|
|
|
/// Pointer to the memory manager or nullptr if not available.
|
|
MemoryManagerTy *MemoryManager;
|
|
|
|
/// Environment variables defined by the OpenMP standard.
|
|
Int32Envar OMP_TeamLimit;
|
|
Int32Envar OMP_NumTeams;
|
|
Int32Envar OMP_TeamsThreadLimit;
|
|
|
|
/// Environment variables defined by the LLVM OpenMP implementation.
|
|
Int32Envar OMPX_DebugKind;
|
|
UInt32Envar OMPX_SharedMemorySize;
|
|
UInt64Envar OMPX_TargetStackSize;
|
|
UInt64Envar OMPX_TargetHeapSize;
|
|
|
|
/// Environment flag to set the minimum number of threads we use for a
|
|
/// low-trip count combined loop. Instead of using more threads we increase
|
|
/// the outer (block/team) parallelism.
|
|
UInt32Envar OMPX_MinThreadsForLowTripCount =
|
|
UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
|
|
|
|
BoolEnvar OMPX_ReuseBlocksForHighTripCount =
|
|
BoolEnvar("LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT", true);
|
|
|
|
protected:
|
|
/// Environment variables defined by the LLVM OpenMP implementation
|
|
/// regarding the initial number of streams and events.
|
|
UInt32Envar OMPX_InitialNumStreams;
|
|
UInt32Envar OMPX_InitialNumEvents;
|
|
|
|
/// Array of images loaded into the device. Images are automatically
|
|
/// deallocated by the allocator.
|
|
llvm::SmallVector<DeviceImageTy *> LoadedImages;
|
|
|
|
/// The identifier of the device within the plugin. Notice this is not a
|
|
/// global device id and is not the device id visible to the OpenMP user.
|
|
const int32_t DeviceId;
|
|
|
|
/// The default grid values used for this device.
|
|
llvm::omp::GV GridValues;
|
|
|
|
/// Enumeration used for representing the current state between two devices
|
|
/// two devices (both under the same plugin) for the peer access between them.
|
|
/// The states can be a) PENDING when the state has not been queried and needs
|
|
/// to be queried, b) AVAILABLE when the peer access is available to be used,
|
|
/// and c) UNAVAILABLE if the system does not allow it.
|
|
enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };
|
|
|
|
/// Array of peer access states with the rest of devices. This means that if
|
|
/// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
|
|
/// the device I can access device J's memory directly. However, notice this
|
|
/// does not mean that device J can access device I's memory directly.
|
|
llvm::SmallVector<PeerAccessState> PeerAccesses;
|
|
std::mutex PeerAccessesLock;
|
|
|
|
/// Map of host pinned allocations used for optimize device transfers.
|
|
PinnedAllocationMapTy PinnedAllocs;
|
|
|
|
/// A pointer to an RPC server instance attached to this device if present.
|
|
/// This is used to run the RPC server during task synchronization.
|
|
RPCServerTy *RPCServer;
|
|
|
|
#ifdef OMPT_SUPPORT
|
|
/// OMPT callback functions
|
|
#define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr;
|
|
FOREACH_OMPT_DEVICE_EVENT(defineOmptCallback)
|
|
#undef defineOmptCallback
|
|
|
|
/// Internal representation for OMPT device (initialize & finalize)
|
|
std::atomic<bool> OmptInitialized;
|
|
#endif
|
|
|
|
private:
|
|
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
|
|
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
|
|
};
|
|
|
|
/// Class implementing common functionalities of offload plugins. Each plugin
|
|
/// should define the specific plugin class, derive from this generic one, and
|
|
/// implement the necessary virtual function members.
|
|
struct GenericPluginTy {
|
|
|
|
/// Construct a plugin instance.
|
|
GenericPluginTy(Triple::ArchType TA)
|
|
: GlobalHandler(nullptr), JIT(TA), RPCServer(nullptr),
|
|
RecordReplay(nullptr) {}
|
|
|
|
virtual ~GenericPluginTy() {}
|
|
|
|
/// Initialize the plugin.
|
|
Error init();
|
|
|
|
/// Initialize the plugin and return the number of available devices.
|
|
virtual Expected<int32_t> initImpl() = 0;
|
|
|
|
/// Deinitialize the plugin and release the resources.
|
|
Error deinit();
|
|
virtual Error deinitImpl() = 0;
|
|
|
|
/// Create a new device for the underlying plugin.
|
|
virtual GenericDeviceTy *createDevice(GenericPluginTy &Plugin,
|
|
int32_t DeviceID,
|
|
int32_t NumDevices) = 0;
|
|
|
|
/// Create a new global handler for the underlying plugin.
|
|
virtual GenericGlobalHandlerTy *createGlobalHandler() = 0;
|
|
|
|
/// Get the reference to the device with a certain device id.
|
|
GenericDeviceTy &getDevice(int32_t DeviceId) {
|
|
assert(isValidDeviceId(DeviceId) && "Invalid device id");
|
|
assert(Devices[DeviceId] && "Device is uninitialized");
|
|
|
|
return *Devices[DeviceId];
|
|
}
|
|
|
|
/// Get the number of active devices.
|
|
int32_t getNumDevices() const { return NumDevices; }
|
|
|
|
/// Get the plugin-specific device identifier.
|
|
int32_t getUserId(int32_t DeviceId) const {
|
|
assert(UserDeviceIds.contains(DeviceId) && "No user-id registered");
|
|
return UserDeviceIds.at(DeviceId);
|
|
}
|
|
|
|
/// Get the ELF code to recognize the binary image of this plugin.
|
|
virtual uint16_t getMagicElfBits() const = 0;
|
|
|
|
/// Get the target triple of this plugin.
|
|
virtual Triple::ArchType getTripleArch() const = 0;
|
|
|
|
/// Get the constant name identifier for this plugin.
|
|
virtual const char *getName() const = 0;
|
|
|
|
/// Allocate a structure using the internal allocator.
|
|
template <typename Ty> Ty *allocate() {
|
|
return reinterpret_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty)));
|
|
}
|
|
|
|
/// Get the reference to the global handler of this plugin.
|
|
GenericGlobalHandlerTy &getGlobalHandler() {
|
|
assert(GlobalHandler && "Global handler not initialized");
|
|
return *GlobalHandler;
|
|
}
|
|
|
|
/// Get the reference to the JIT used for all devices connected to this
|
|
/// plugin.
|
|
JITEngine &getJIT() { return JIT; }
|
|
|
|
/// Get a reference to the RPC server used to provide host services.
|
|
RPCServerTy &getRPCServer() {
|
|
assert(RPCServer && "RPC server not initialized");
|
|
return *RPCServer;
|
|
}
|
|
|
|
/// Get a reference to the record and replay interface for the plugin.
|
|
RecordReplayTy &getRecordReplay() {
|
|
assert(RecordReplay && "RR interface not initialized");
|
|
return *RecordReplay;
|
|
}
|
|
|
|
/// Initialize a device within the plugin.
|
|
Error initDevice(int32_t DeviceId);
|
|
|
|
/// Deinitialize a device within the plugin and release its resources.
|
|
Error deinitDevice(int32_t DeviceId);
|
|
|
|
/// Indicate whether data can be exchanged directly between two devices under
|
|
/// this same plugin. If this function returns true, it's safe to call the
|
|
/// GenericDeviceTy::exchangeData() function on the source device.
|
|
virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) {
|
|
return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId);
|
|
}
|
|
|
|
/// Top level interface to verify if a given ELF image can be executed on a
|
|
/// given target. Returns true if the \p Image is compatible with the plugin.
|
|
Expected<bool> checkELFImage(StringRef Image) const;
|
|
|
|
/// Return true if the \p Image can be compiled to run on the platform's
|
|
/// target architecture.
|
|
Expected<bool> checkBitcodeImage(StringRef Image) const;
|
|
|
|
/// Indicate if an image is compatible with the plugin devices. Notice that
|
|
/// this function may be called before actually initializing the devices. So
|
|
/// we could not move this function into GenericDeviceTy.
|
|
virtual Expected<bool> isELFCompatible(uint32_t DeviceID,
|
|
StringRef Image) const = 0;
|
|
|
|
protected:
|
|
/// Indicate whether a device id is valid.
|
|
bool isValidDeviceId(int32_t DeviceId) const {
|
|
return (DeviceId >= 0 && DeviceId < getNumDevices());
|
|
}
|
|
|
|
public:
|
|
// TODO: This plugin interface needs to be cleaned up.
|
|
|
|
/// Returns non-zero if the plugin runtime has been initialized.
|
|
int32_t is_initialized() const;
|
|
|
|
/// Returns non-zero if the \p Image is compatible with the plugin. This
|
|
/// function does not require the plugin to be initialized before use.
|
|
int32_t is_plugin_compatible(__tgt_device_image *Image);
|
|
|
|
/// Returns non-zero if the \p Image is compatible with the device.
|
|
int32_t is_device_compatible(int32_t DeviceId, __tgt_device_image *Image);
|
|
|
|
/// Returns non-zero if the plugin device has been initialized.
|
|
int32_t is_device_initialized(int32_t DeviceId) const;
|
|
|
|
/// Initialize the device inside of the plugin.
|
|
int32_t init_device(int32_t DeviceId);
|
|
|
|
/// Return the number of devices this plugin can support.
|
|
int32_t number_of_devices();
|
|
|
|
/// Returns non-zero if the data can be exchanged between the two devices.
|
|
int32_t is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId);
|
|
|
|
/// Initializes the record and replay mechanism inside the plugin.
|
|
int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
|
|
void *VAddr, bool isRecord, bool SaveOutput,
|
|
uint64_t &ReqPtrArgOffset);
|
|
|
|
/// Loads the associated binary into the plugin and returns a handle to it.
|
|
int32_t load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
|
|
__tgt_device_binary *Binary);
|
|
|
|
/// Allocates memory that is accessively to the given device.
|
|
void *data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, int32_t Kind);
|
|
|
|
/// Deallocates memory on the given device.
|
|
int32_t data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind);
|
|
|
|
/// Locks / pins host memory using the plugin runtime.
|
|
int32_t data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
|
|
void **LockedPtr);
|
|
|
|
/// Unlocks / unpins host memory using the plugin runtime.
|
|
int32_t data_unlock(int32_t DeviceId, void *Ptr);
|
|
|
|
/// Notify the runtime about a new mapping that has been created outside.
|
|
int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size);
|
|
|
|
/// Notify t he runtime about a mapping that has been deleted.
|
|
int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr);
|
|
|
|
/// Copy data to the given device.
|
|
int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
|
|
int64_t Size);
|
|
|
|
/// Copy data to the given device asynchronously.
|
|
int32_t data_submit_async(int32_t DeviceId, void *TgtPtr, void *HstPtr,
|
|
int64_t Size, __tgt_async_info *AsyncInfoPtr);
|
|
|
|
/// Copy data from the given device.
|
|
int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
|
|
int64_t Size);
|
|
|
|
/// Copy data from the given device asynchronously.
|
|
int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr, void *TgtPtr,
|
|
int64_t Size, __tgt_async_info *AsyncInfoPtr);
|
|
|
|
/// Exchange memory addresses between two devices.
|
|
int32_t data_exchange(int32_t SrcDeviceId, void *SrcPtr, int32_t DstDeviceId,
|
|
void *DstPtr, int64_t Size);
|
|
|
|
/// Exchange memory addresses between two devices asynchronously.
|
|
int32_t data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
|
|
int DstDeviceId, void *DstPtr, int64_t Size,
|
|
__tgt_async_info *AsyncInfo);
|
|
|
|
/// Begin executing a kernel on the given device.
|
|
int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
|
|
ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs,
|
|
__tgt_async_info *AsyncInfoPtr);
|
|
|
|
/// Synchronize an asyncrhonous queue with the plugin runtime.
|
|
int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
|
|
|
|
/// Query the current state of an asynchronous queue.
|
|
int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
|
|
|
|
/// Prints information about the given devices supported by the plugin.
|
|
void print_device_info(int32_t DeviceId);
|
|
|
|
/// Creates an event in the given plugin if supported.
|
|
int32_t create_event(int32_t DeviceId, void **EventPtr);
|
|
|
|
/// Records an event that has occurred.
|
|
int32_t record_event(int32_t DeviceId, void *EventPtr,
|
|
__tgt_async_info *AsyncInfoPtr);
|
|
|
|
/// Wait until an event has occurred.
|
|
int32_t wait_event(int32_t DeviceId, void *EventPtr,
|
|
__tgt_async_info *AsyncInfoPtr);
|
|
|
|
/// Synchronize execution until an event is done.
|
|
int32_t sync_event(int32_t DeviceId, void *EventPtr);
|
|
|
|
/// Remove the event from the plugin.
|
|
int32_t destroy_event(int32_t DeviceId, void *EventPtr);
|
|
|
|
/// Remove the event from the plugin.
|
|
void set_info_flag(uint32_t NewInfoLevel);
|
|
|
|
/// Creates an asynchronous queue for the given plugin.
|
|
int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr);
|
|
|
|
/// Creates device information to be used for diagnostics.
|
|
int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo,
|
|
const char **ErrStr);
|
|
|
|
/// Sets the offset into the devices for use by OMPT.
|
|
int32_t set_device_identifier(int32_t UserId, int32_t DeviceId);
|
|
|
|
/// Returns if the plugin can support automatic copy.
|
|
int32_t use_auto_zero_copy(int32_t DeviceId);
|
|
|
|
/// Look up a global symbol in the given binary.
|
|
int32_t get_global(__tgt_device_binary Binary, uint64_t Size,
|
|
const char *Name, void **DevicePtr);
|
|
|
|
/// Look up a kernel function in the given binary.
|
|
int32_t get_function(__tgt_device_binary Binary, const char *Name,
|
|
void **KernelPtr);
|
|
|
|
private:
|
|
/// Indicates if the platform runtime has been fully initialized.
|
|
bool Initialized = false;
|
|
|
|
/// Number of devices available for the plugin.
|
|
int32_t NumDevices = 0;
|
|
|
|
/// Map of plugin device identifiers to the user device identifier.
|
|
llvm::DenseMap<int32_t, int32_t> UserDeviceIds;
|
|
|
|
/// Array of pointers to the devices. Initially, they are all set to nullptr.
|
|
/// Once a device is initialized, the pointer is stored in the position given
|
|
/// by its device id. A position with nullptr means that the corresponding
|
|
/// device was not initialized yet.
|
|
llvm::SmallVector<GenericDeviceTy *> Devices;
|
|
|
|
/// Pointer to the global handler for this plugin.
|
|
GenericGlobalHandlerTy *GlobalHandler;
|
|
|
|
/// Internal allocator for different structures.
|
|
BumpPtrAllocator Allocator;
|
|
|
|
/// The JIT engine shared by all devices connected to this plugin.
|
|
JITEngine JIT;
|
|
|
|
/// The interface between the plugin and the GPU for host services.
|
|
RPCServerTy *RPCServer;
|
|
|
|
/// The interface between the plugin and the GPU for host services.
|
|
RecordReplayTy *RecordReplay;
|
|
};
|
|
|
|
namespace Plugin {
|
|
/// Create a success error. This is the same as calling Error::success(), but
|
|
/// it is recommended to use this one for consistency with Plugin::error() and
|
|
/// Plugin::check().
|
|
static inline Error success() { return Error::success(); }
|
|
|
|
/// Create an Offload error.
|
|
template <typename... ArgsTy>
|
|
static Error error(error::ErrorCode Code, const char *ErrFmt, ArgsTy... Args) {
|
|
return error::createOffloadError(Code, ErrFmt, Args...);
|
|
}
|
|
|
|
inline Error error(error::ErrorCode Code, const char *S) {
|
|
return make_error<error::OffloadError>(Code, S);
|
|
}
|
|
|
|
inline Error error(error::ErrorCode Code, Error &&OtherError,
|
|
const char *Context) {
|
|
return error::createOffloadError(Code, std::move(OtherError), Context);
|
|
}
|
|
|
|
/// Check the plugin-specific error code and return an error or success
|
|
/// accordingly. In case of an error, create a string error with the error
|
|
/// description. The ErrFmt should follow the format:
|
|
/// "Error in <function name>[<optional info>]: %s"
|
|
/// The last format specifier "%s" is mandatory and will be used to place the
|
|
/// error code's description. Notice this function should be only called from
|
|
/// the plugin-specific code.
|
|
/// TODO: Refactor this, must be defined individually by each plugin.
|
|
template <typename... ArgsTy>
|
|
static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args);
|
|
} // namespace Plugin
|
|
|
|
/// Auxiliary interface class for GenericDeviceResourceManagerTy. This class
|
|
/// acts as a reference to a device resource, such as a stream, and requires
|
|
/// some basic functions to be implemented. The derived class should define an
|
|
/// empty constructor that creates an empty and invalid resource reference. Do
|
|
/// not create a new resource on the ctor, but on the create() function instead.
|
|
///
|
|
/// The derived class should also define the type HandleTy as the underlying
|
|
/// resource handle type. For instance, in a CUDA stream it would be:
|
|
/// using HandleTy = CUstream;
|
|
struct GenericDeviceResourceRef {
|
|
/// Create a new resource and stores a reference.
|
|
virtual Error create(GenericDeviceTy &Device) = 0;
|
|
|
|
/// Destroy and release the resources pointed by the reference.
|
|
virtual Error destroy(GenericDeviceTy &Device) = 0;
|
|
|
|
protected:
|
|
~GenericDeviceResourceRef() = default;
|
|
};
|
|
|
|
/// Class that implements a resource pool belonging to a device. This class
|
|
/// operates with references to the actual resources. These reference must
|
|
/// derive from the GenericDeviceResourceRef class and implement the create
|
|
/// and destroy virtual functions.
|
|
template <typename ResourceRef> class GenericDeviceResourceManagerTy {
|
|
using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
|
|
using ResourceHandleTy = typename ResourceRef::HandleTy;
|
|
|
|
public:
|
|
/// Create an empty resource pool for a specific device.
|
|
GenericDeviceResourceManagerTy(GenericDeviceTy &Device)
|
|
: Device(Device), NextAvailable(0) {}
|
|
|
|
/// Destroy the resource pool. At this point, the deinit() function should
|
|
/// already have been executed so the resource pool should be empty.
|
|
virtual ~GenericDeviceResourceManagerTy() {
|
|
assert(ResourcePool.empty() && "Resource pool not empty");
|
|
}
|
|
|
|
/// Initialize the resource pool.
|
|
Error init(uint32_t InitialSize) {
|
|
assert(ResourcePool.empty() && "Resource pool already initialized");
|
|
return ResourcePoolTy::resizeResourcePool(InitialSize);
|
|
}
|
|
|
|
/// Deinitialize the resource pool and delete all resources. This function
|
|
/// must be called before the destructor.
|
|
virtual Error deinit() {
|
|
if (NextAvailable)
|
|
DP("Missing %d resources to be returned\n", NextAvailable);
|
|
|
|
// TODO: This prevents a bug on libomptarget to make the plugins fail. There
|
|
// may be some resources not returned. Do not destroy these ones.
|
|
if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable))
|
|
return Err;
|
|
|
|
ResourcePool.clear();
|
|
|
|
return Plugin::success();
|
|
}
|
|
|
|
/// Get a resource from the pool or create new ones. If the function
|
|
/// succeeds, the handle to the resource is saved in \p Handle.
|
|
virtual Error getResource(ResourceHandleTy &Handle) {
|
|
// Get a resource with an empty resource processor.
|
|
return getResourcesImpl(1, &Handle,
|
|
[](ResourceHandleTy) { return Plugin::success(); });
|
|
}
|
|
|
|
/// Get multiple resources from the pool or create new ones. If the function
|
|
/// succeeds, the handles to the resources are saved in \p Handles.
|
|
virtual Error getResources(uint32_t Num, ResourceHandleTy *Handles) {
|
|
// Get resources with an empty resource processor.
|
|
return getResourcesImpl(Num, Handles,
|
|
[](ResourceHandleTy) { return Plugin::success(); });
|
|
}
|
|
|
|
/// Return resource to the pool.
|
|
virtual Error returnResource(ResourceHandleTy Handle) {
|
|
// Return a resource with an empty resource processor.
|
|
return returnResourceImpl(
|
|
Handle, [](ResourceHandleTy) { return Plugin::success(); });
|
|
}
|
|
|
|
protected:
|
|
/// Get multiple resources from the pool or create new ones. If the function
|
|
/// succeeds, the handles to the resources are saved in \p Handles. Also
|
|
/// process each of the obtained resources with \p Processor.
|
|
template <typename FuncTy>
|
|
Error getResourcesImpl(uint32_t Num, ResourceHandleTy *Handles,
|
|
FuncTy Processor) {
|
|
const std::lock_guard<std::mutex> Lock(Mutex);
|
|
|
|
assert(NextAvailable <= ResourcePool.size() &&
|
|
"Resource pool is corrupted");
|
|
|
|
if (NextAvailable + Num > ResourcePool.size())
|
|
// Double the resource pool or resize it to provide the requested ones.
|
|
if (auto Err = ResourcePoolTy::resizeResourcePool(
|
|
std::max(NextAvailable * 2, NextAvailable + Num)))
|
|
return Err;
|
|
|
|
// Save the handles in the output array parameter.
|
|
for (uint32_t r = 0; r < Num; ++r)
|
|
Handles[r] = ResourcePool[NextAvailable + r];
|
|
|
|
// Process all obtained resources.
|
|
for (uint32_t r = 0; r < Num; ++r)
|
|
if (auto Err = Processor(Handles[r]))
|
|
return Err;
|
|
|
|
NextAvailable += Num;
|
|
|
|
return Plugin::success();
|
|
}
|
|
|
|
/// Return resource to the pool and process the resource with \p Processor.
|
|
template <typename FuncTy>
|
|
Error returnResourceImpl(ResourceHandleTy Handle, FuncTy Processor) {
|
|
const std::lock_guard<std::mutex> Lock(Mutex);
|
|
|
|
// Process the returned resource.
|
|
if (auto Err = Processor(Handle))
|
|
return Err;
|
|
|
|
assert(NextAvailable > 0 && "Resource pool is corrupted");
|
|
ResourcePool[--NextAvailable] = Handle;
|
|
|
|
return Plugin::success();
|
|
}
|
|
|
|
protected:
|
|
/// The resources between \p OldSize and \p NewSize need to be created or
|
|
/// destroyed. The mutex is locked when this function is called.
|
|
Error resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) {
|
|
assert(OldSize != NewSize && "Resizing to the same size");
|
|
|
|
if (auto Err = Device.setContext())
|
|
return Err;
|
|
|
|
if (OldSize < NewSize) {
|
|
// Create new resources.
|
|
for (uint32_t I = OldSize; I < NewSize; ++I) {
|
|
if (auto Err = ResourcePool[I].create(Device))
|
|
return Err;
|
|
}
|
|
} else {
|
|
// Destroy the obsolete resources.
|
|
for (uint32_t I = NewSize; I < OldSize; ++I) {
|
|
if (auto Err = ResourcePool[I].destroy(Device))
|
|
return Err;
|
|
}
|
|
}
|
|
return Plugin::success();
|
|
}
|
|
|
|
/// Increase or decrease the number of resources. This function should
|
|
/// be called with the mutex acquired.
|
|
Error resizeResourcePool(uint32_t NewSize) {
|
|
uint32_t OldSize = ResourcePool.size();
|
|
|
|
// Nothing to do.
|
|
if (OldSize == NewSize)
|
|
return Plugin::success();
|
|
|
|
if (OldSize < NewSize) {
|
|
// Increase the number of resources.
|
|
ResourcePool.resize(NewSize);
|
|
return ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
|
|
}
|
|
|
|
// Decrease the number of resources otherwise.
|
|
auto Err = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
|
|
ResourcePool.resize(NewSize);
|
|
|
|
return Err;
|
|
}
|
|
|
|
/// The device to which the resources belong
|
|
GenericDeviceTy &Device;
|
|
|
|
/// Mutex for the resource pool.
|
|
std::mutex Mutex;
|
|
|
|
/// The next available resource in the pool.
|
|
uint32_t NextAvailable;
|
|
|
|
/// The actual resource pool.
|
|
std::deque<ResourceRef> ResourcePool;
|
|
};
|
|
|
|
} // namespace plugin
|
|
} // namespace target
|
|
} // namespace omp
|
|
} // namespace llvm
|
|
|
|
#endif // OPENMP_LIBOMPTARGET_PLUGINS_COMMON_PLUGININTERFACE_H
|