[flang][cuda] Add async id to allocators (#134724)
Add async id to allocators in preparation for stream allocation.
This commit is contained in:
committed by
GitHub
parent
7117dea043
commit
5ebe22a35d
@@ -11,6 +11,7 @@
|
||||
|
||||
#include "flang/Common/api-attrs.h"
|
||||
#include "flang/Runtime/allocator-registry-consts.h"
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
|
||||
@@ -18,7 +19,7 @@
|
||||
|
||||
namespace Fortran::runtime {
|
||||
|
||||
using AllocFct = void *(*)(std::size_t);
|
||||
using AllocFct = void *(*)(std::size_t, std::int64_t);
|
||||
using FreeFct = void (*)(void *);
|
||||
|
||||
typedef struct Allocator_t {
|
||||
@@ -26,10 +27,11 @@ typedef struct Allocator_t {
|
||||
FreeFct free{nullptr};
|
||||
} Allocator_t;
|
||||
|
||||
#ifdef RT_DEVICE_COMPILATION
|
||||
static RT_API_ATTRS void *MallocWrapper(std::size_t size) {
|
||||
static RT_API_ATTRS void *MallocWrapper(
|
||||
std::size_t size, [[maybe_unused]] std::int64_t) {
|
||||
return std::malloc(size);
|
||||
}
|
||||
#ifdef RT_DEVICE_COMPILATION
|
||||
static RT_API_ATTRS void FreeWrapper(void *p) { return std::free(p); }
|
||||
#endif
|
||||
|
||||
@@ -39,7 +41,7 @@ struct AllocatorRegistry {
|
||||
: allocators{{&MallocWrapper, &FreeWrapper}} {}
|
||||
#else
|
||||
constexpr AllocatorRegistry() {
|
||||
allocators[kDefaultAllocator] = {&std::malloc, &std::free};
|
||||
allocators[kDefaultAllocator] = {&MallocWrapper, &std::free};
|
||||
};
|
||||
#endif
|
||||
RT_API_ATTRS void Register(int, Allocator_t);
|
||||
|
||||
@@ -34,7 +34,8 @@ void RTDEF(CUFRegisterAllocator)() {
|
||||
}
|
||||
}
|
||||
|
||||
void *CUFAllocPinned(std::size_t sizeInBytes) {
|
||||
void *CUFAllocPinned(
|
||||
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
|
||||
void *p;
|
||||
CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
|
||||
return p;
|
||||
@@ -42,7 +43,8 @@ void *CUFAllocPinned(std::size_t sizeInBytes) {
|
||||
|
||||
void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
|
||||
|
||||
void *CUFAllocDevice(std::size_t sizeInBytes) {
|
||||
void *CUFAllocDevice(
|
||||
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
|
||||
void *p;
|
||||
if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
|
||||
CUDA_REPORT_IF_ERROR(
|
||||
@@ -55,7 +57,8 @@ void *CUFAllocDevice(std::size_t sizeInBytes) {
|
||||
|
||||
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
|
||||
|
||||
void *CUFAllocManaged(std::size_t sizeInBytes) {
|
||||
void *CUFAllocManaged(
|
||||
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
|
||||
void *p;
|
||||
CUDA_REPORT_IF_ERROR(
|
||||
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
|
||||
@@ -64,9 +67,10 @@ void *CUFAllocManaged(std::size_t sizeInBytes) {
|
||||
|
||||
void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
|
||||
|
||||
void *CUFAllocUnified(std::size_t sizeInBytes) {
|
||||
void *CUFAllocUnified(
|
||||
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
|
||||
// Call alloc managed for the time being.
|
||||
return CUFAllocManaged(sizeInBytes);
|
||||
return CUFAllocManaged(sizeInBytes, asyncId);
|
||||
}
|
||||
|
||||
void CUFFreeUnified(void *p) {
|
||||
|
||||
@@ -20,7 +20,8 @@ RT_EXT_API_GROUP_BEGIN
|
||||
|
||||
Descriptor *RTDEF(CUFAllocDescriptor)(
|
||||
std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
|
||||
return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
|
||||
return reinterpret_cast<Descriptor *>(
|
||||
CUFAllocManaged(sizeInBytes, /*asyncId*/ -1));
|
||||
}
|
||||
|
||||
void RTDEF(CUFFreeDescriptor)(
|
||||
|
||||
@@ -170,7 +170,7 @@ RT_API_ATTRS int Descriptor::Allocate() {
|
||||
// Zero size allocation is possible in Fortran and the resulting
|
||||
// descriptor must be allocated/associated. Since std::malloc(0)
|
||||
// result is implementation defined, always allocate at least one byte.
|
||||
void *p{alloc(byteSize ? byteSize : 1)};
|
||||
void *p{alloc(byteSize ? byteSize : 1, /*asyncId=*/-1)};
|
||||
if (!p) {
|
||||
return CFI_ERROR_MEM_ALLOCATION;
|
||||
}
|
||||
|
||||
@@ -129,7 +129,7 @@ RT_API_ATTRS void *AllocateValidatedPointerPayload(
|
||||
byteSize = ((byteSize + align - 1) / align) * align;
|
||||
std::size_t total{byteSize + sizeof(std::uintptr_t)};
|
||||
AllocFct alloc{allocatorRegistry.GetAllocator(allocatorIdx)};
|
||||
void *p{alloc(total)};
|
||||
void *p{alloc(total, /*asyncId=*/-1)};
|
||||
if (p && allocatorIdx == 0) {
|
||||
// Fill the footer word with the XOR of the ones' complement of
|
||||
// the base address, which is a value that would be highly unlikely
|
||||
|
||||
@@ -20,16 +20,16 @@ extern "C" {
|
||||
void RTDECL(CUFRegisterAllocator)();
|
||||
}
|
||||
|
||||
void *CUFAllocPinned(std::size_t);
|
||||
void *CUFAllocPinned(std::size_t, std::int64_t);
|
||||
void CUFFreePinned(void *);
|
||||
|
||||
void *CUFAllocDevice(std::size_t);
|
||||
void *CUFAllocDevice(std::size_t, std::int64_t);
|
||||
void CUFFreeDevice(void *);
|
||||
|
||||
void *CUFAllocManaged(std::size_t);
|
||||
void *CUFAllocManaged(std::size_t, std::int64_t);
|
||||
void CUFFreeManaged(void *);
|
||||
|
||||
void *CUFAllocUnified(std::size_t);
|
||||
void *CUFAllocUnified(std::size_t, std::int64_t);
|
||||
void CUFFreeUnified(void *);
|
||||
|
||||
} // namespace Fortran::runtime::cuda
|
||||
|
||||
Reference in New Issue
Block a user