[flang][cuda] Add async id to allocators (#134724)

Add async id to allocators in preparation for stream allocation.
This commit is contained in:
Valentin Clement (バレンタイン クレメン)
2025-04-08 10:16:59 -07:00
committed by GitHub
parent 7117dea043
commit 5ebe22a35d
6 changed files with 23 additions and 16 deletions

View File

@@ -11,6 +11,7 @@
#include "flang/Common/api-attrs.h"
#include "flang/Runtime/allocator-registry-consts.h"
#include <cstdint>
#include <cstdlib>
#include <vector>
@@ -18,7 +19,7 @@
namespace Fortran::runtime {
using AllocFct = void *(*)(std::size_t);
using AllocFct = void *(*)(std::size_t, std::int64_t);
using FreeFct = void (*)(void *);
typedef struct Allocator_t {
@@ -26,10 +27,11 @@ typedef struct Allocator_t {
FreeFct free{nullptr};
} Allocator_t;
#ifdef RT_DEVICE_COMPILATION
static RT_API_ATTRS void *MallocWrapper(std::size_t size) {
static RT_API_ATTRS void *MallocWrapper(
std::size_t size, [[maybe_unused]] std::int64_t) {
return std::malloc(size);
}
#ifdef RT_DEVICE_COMPILATION
static RT_API_ATTRS void FreeWrapper(void *p) { return std::free(p); }
#endif
@@ -39,7 +41,7 @@ struct AllocatorRegistry {
: allocators{{&MallocWrapper, &FreeWrapper}} {}
#else
constexpr AllocatorRegistry() {
allocators[kDefaultAllocator] = {&std::malloc, &std::free};
allocators[kDefaultAllocator] = {&MallocWrapper, &std::free};
};
#endif
RT_API_ATTRS void Register(int, Allocator_t);

View File

@@ -34,7 +34,8 @@ void RTDEF(CUFRegisterAllocator)() {
}
}
void *CUFAllocPinned(std::size_t sizeInBytes) {
void *CUFAllocPinned(
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
void *p;
CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
return p;
@@ -42,7 +43,8 @@ void *CUFAllocPinned(std::size_t sizeInBytes) {
void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
void *CUFAllocDevice(std::size_t sizeInBytes) {
void *CUFAllocDevice(
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
void *p;
if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
CUDA_REPORT_IF_ERROR(
@@ -55,7 +57,8 @@ void *CUFAllocDevice(std::size_t sizeInBytes) {
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
void *CUFAllocManaged(std::size_t sizeInBytes) {
void *CUFAllocManaged(
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
void *p;
CUDA_REPORT_IF_ERROR(
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -64,9 +67,10 @@ void *CUFAllocManaged(std::size_t sizeInBytes) {
void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
void *CUFAllocUnified(std::size_t sizeInBytes) {
void *CUFAllocUnified(
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
// Call alloc managed for the time being.
return CUFAllocManaged(sizeInBytes);
return CUFAllocManaged(sizeInBytes, asyncId);
}
void CUFFreeUnified(void *p) {

View File

@@ -20,7 +20,8 @@ RT_EXT_API_GROUP_BEGIN
Descriptor *RTDEF(CUFAllocDescriptor)(
std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
return reinterpret_cast<Descriptor *>(
CUFAllocManaged(sizeInBytes, /*asyncId*/ -1));
}
void RTDEF(CUFFreeDescriptor)(

View File

@@ -170,7 +170,7 @@ RT_API_ATTRS int Descriptor::Allocate() {
// Zero size allocation is possible in Fortran and the resulting
// descriptor must be allocated/associated. Since std::malloc(0)
// result is implementation defined, always allocate at least one byte.
void *p{alloc(byteSize ? byteSize : 1)};
void *p{alloc(byteSize ? byteSize : 1, /*asyncId=*/-1)};
if (!p) {
return CFI_ERROR_MEM_ALLOCATION;
}

View File

@@ -129,7 +129,7 @@ RT_API_ATTRS void *AllocateValidatedPointerPayload(
byteSize = ((byteSize + align - 1) / align) * align;
std::size_t total{byteSize + sizeof(std::uintptr_t)};
AllocFct alloc{allocatorRegistry.GetAllocator(allocatorIdx)};
void *p{alloc(total)};
void *p{alloc(total, /*asyncId=*/-1)};
if (p && allocatorIdx == 0) {
// Fill the footer word with the XOR of the ones' complement of
// the base address, which is a value that would be highly unlikely

View File

@@ -20,16 +20,16 @@ extern "C" {
void RTDECL(CUFRegisterAllocator)();
}
void *CUFAllocPinned(std::size_t);
void *CUFAllocPinned(std::size_t, std::int64_t);
void CUFFreePinned(void *);
void *CUFAllocDevice(std::size_t);
void *CUFAllocDevice(std::size_t, std::int64_t);
void CUFFreeDevice(void *);
void *CUFAllocManaged(std::size_t);
void *CUFAllocManaged(std::size_t, std::int64_t);
void CUFFreeManaged(void *);
void *CUFAllocUnified(std::size_t);
void *CUFAllocUnified(std::size_t, std::int64_t);
void CUFFreeUnified(void *);
} // namespace Fortran::runtime::cuda