[flang][cuda] Add async id to allocators (#134724)

Add async id to allocators in preparation for stream allocation.
2025-04-08 10:16:59 -07:00
parent 7117dea043
commit 5ebe22a35d
6 changed files with 23 additions and 16 deletions
--- a/flang-rt/include/flang-rt/runtime/allocator-registry.h
+++ b/flang-rt/include/flang-rt/runtime/allocator-registry.h
@@ -11,6 +11,7 @@

 #include "flang/Common/api-attrs.h"
 #include "flang/Runtime/allocator-registry-consts.h"
+#include <cstdint>
 #include <cstdlib>
 #include <vector>

@@ -18,7 +19,7 @@

 namespace Fortran::runtime {

-using AllocFct = void *(*)(std::size_t);
+using AllocFct = void *(*)(std::size_t, std::int64_t);
 using FreeFct = void (*)(void *);

 typedef struct Allocator_t {
@@ -26,10 +27,11 @@ typedef struct Allocator_t {
  FreeFct free{nullptr};
 } Allocator_t;

-#ifdef RT_DEVICE_COMPILATION
-static RT_API_ATTRS void *MallocWrapper(std::size_t size) {
+static RT_API_ATTRS void *MallocWrapper(
+    std::size_t size, [[maybe_unused]] std::int64_t) {
  return std::malloc(size);
 }
+#ifdef RT_DEVICE_COMPILATION
 static RT_API_ATTRS void FreeWrapper(void *p) { return std::free(p); }
 #endif

@@ -39,7 +41,7 @@ struct AllocatorRegistry {
      : allocators{{&MallocWrapper, &FreeWrapper}} {}
 #else
  constexpr AllocatorRegistry() {
-    allocators[kDefaultAllocator] = {&std::malloc, &std::free};
+    allocators[kDefaultAllocator] = {&MallocWrapper, &std::free};
  };
 #endif
  RT_API_ATTRS void Register(int, Allocator_t);
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -34,7 +34,8 @@ void RTDEF(CUFRegisterAllocator)() {
 }
 }

-void *CUFAllocPinned(std::size_t sizeInBytes) {
+void *CUFAllocPinned(
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
  void *p;
  CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
  return p;
@@ -42,7 +43,8 @@ void *CUFAllocPinned(std::size_t sizeInBytes) {

 void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }

-void *CUFAllocDevice(std::size_t sizeInBytes) {
+void *CUFAllocDevice(
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
  void *p;
  if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
    CUDA_REPORT_IF_ERROR(
@@ -55,7 +57,8 @@ void *CUFAllocDevice(std::size_t sizeInBytes) {

 void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }

-void *CUFAllocManaged(std::size_t sizeInBytes) {
+void *CUFAllocManaged(
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
  void *p;
  CUDA_REPORT_IF_ERROR(
      cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -64,9 +67,10 @@ void *CUFAllocManaged(std::size_t sizeInBytes) {

 void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }

-void *CUFAllocUnified(std::size_t sizeInBytes) {
+void *CUFAllocUnified(
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
  // Call alloc managed for the time being.
-  return CUFAllocManaged(sizeInBytes);
+  return CUFAllocManaged(sizeInBytes, asyncId);
 }

 void CUFFreeUnified(void *p) {
--- a/flang-rt/lib/cuda/descriptor.cpp
+++ b/flang-rt/lib/cuda/descriptor.cpp
@@ -20,7 +20,8 @@ RT_EXT_API_GROUP_BEGIN

 Descriptor *RTDEF(CUFAllocDescriptor)(
    std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
-  return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
+  return reinterpret_cast<Descriptor *>(
+      CUFAllocManaged(sizeInBytes, /*asyncId*/ -1));
 }

 void RTDEF(CUFFreeDescriptor)(
--- a/flang-rt/lib/runtime/descriptor.cpp
+++ b/flang-rt/lib/runtime/descriptor.cpp
@@ -170,7 +170,7 @@ RT_API_ATTRS int Descriptor::Allocate() {
  // Zero size allocation is possible in Fortran and the resulting
  // descriptor must be allocated/associated. Since std::malloc(0)
  // result is implementation defined, always allocate at least one byte.
-  void *p{alloc(byteSize ? byteSize : 1)};
+  void *p{alloc(byteSize ? byteSize : 1, /*asyncId=*/-1)};
  if (!p) {
    return CFI_ERROR_MEM_ALLOCATION;
  }
--- a/flang-rt/lib/runtime/pointer.cpp
+++ b/flang-rt/lib/runtime/pointer.cpp
@@ -129,7 +129,7 @@ RT_API_ATTRS void *AllocateValidatedPointerPayload(
  byteSize = ((byteSize + align - 1) / align) * align;
  std::size_t total{byteSize + sizeof(std::uintptr_t)};
  AllocFct alloc{allocatorRegistry.GetAllocator(allocatorIdx)};
-  void *p{alloc(total)};
+  void *p{alloc(total, /*asyncId=*/-1)};
  if (p && allocatorIdx == 0) {
    // Fill the footer word with the XOR of the ones' complement of
    // the base address, which is a value that would be highly unlikely
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -20,16 +20,16 @@ extern "C" {
 void RTDECL(CUFRegisterAllocator)();
 }

-void *CUFAllocPinned(std::size_t);
+void *CUFAllocPinned(std::size_t, std::int64_t);
 void CUFFreePinned(void *);

-void *CUFAllocDevice(std::size_t);
+void *CUFAllocDevice(std::size_t, std::int64_t);
 void CUFFreeDevice(void *);

-void *CUFAllocManaged(std::size_t);
+void *CUFAllocManaged(std::size_t, std::int64_t);
 void CUFFreeManaged(void *);

-void *CUFAllocUnified(std::size_t);
+void *CUFAllocUnified(std::size_t, std::int64_t);
 void CUFFreeUnified(void *);

 } // namespace Fortran::runtime::cuda