From 0b31f08537746beff4d5e0df44221cbe5a9237c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Mon, 31 Mar 2025 13:17:21 -0700
Subject: [PATCH] [flang][cuda] Add support for NV_CUDAFOR_DEVICE_IS_MANAGED
 (#133778)

Add support for the environment variable `NV_CUDAFOR_DEVICE_IS_MANAGED`
as described in the documentation:
https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#controlling-device-data-is-managed.

This mainly switch device allocation to managed allocation.
---
 flang-rt/include/flang-rt/runtime/environment.h |  1 +
 flang-rt/lib/cuda/allocator.cpp                 |  8 +++++++-
 flang-rt/lib/cuda/memory.cpp                    |  8 +++++++-
 flang-rt/lib/runtime/environment.cpp            | 13 +++++++++++++
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/flang-rt/include/flang-rt/runtime/environment.h b/flang-rt/include/flang-rt/runtime/environment.h
index 142add432b5f..ca6c2a7d4448 100644
--- a/flang-rt/include/flang-rt/runtime/environment.h
+++ b/flang-rt/include/flang-rt/runtime/environment.h
@@ -59,6 +59,7 @@ struct ExecutionEnvironment {
 
   // CUDA related variables
   std::size_t cudaStackLimit{0}; // ACC_OFFLOAD_STACK_SIZE
+  bool cudaDeviceIsManaged{false}; // NV_CUDAFOR_DEVICE_IS_MANAGED
 };
 
 RT_OFFLOAD_VAR_GROUP_BEGIN
diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index 4199bf04b33f..d6529957bc93 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -9,6 +9,7 @@
 #include "flang/Runtime/CUDA/allocator.h"
 #include "flang-rt/runtime/allocator-registry.h"
 #include "flang-rt/runtime/derived.h"
+#include "flang-rt/runtime/environment.h"
 #include "flang-rt/runtime/stat.h"
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/type-info.h"
@@ -43,7 +44,12 @@ void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
 
 void *CUFAllocDevice(std::size_t sizeInBytes) {
   void *p;
-  CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
+  if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
+    CUDA_REPORT_IF_ERROR(
+        cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
+  } else {
+    CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
+  }
   return p;
 }
 
diff --git a/flang-rt/lib/cuda/memory.cpp b/flang-rt/lib/cuda/memory.cpp
index adc24ff22372..766f6847946c 100644
--- a/flang-rt/lib/cuda/memory.cpp
+++ b/flang-rt/lib/cuda/memory.cpp
@@ -9,6 +9,7 @@
 #include "flang/Runtime/CUDA/memory.h"
 #include "flang-rt/runtime/assign-impl.h"
 #include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/environment.h"
 #include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
@@ -26,7 +27,12 @@ void *RTDEF(CUFMemAlloc)(
   void *ptr = nullptr;
   if (bytes != 0) {
     if (type == kMemTypeDevice) {
-      CUDA_REPORT_IF_ERROR(cudaMalloc((void **)&ptr, bytes));
+      if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
+        CUDA_REPORT_IF_ERROR(
+            cudaMallocManaged((void **)&ptr, bytes, cudaMemAttachGlobal));
+      } else {
+        CUDA_REPORT_IF_ERROR(cudaMalloc((void **)&ptr, bytes));
+      }
     } else if (type == kMemTypeManaged || type == kMemTypeUnified) {
       CUDA_REPORT_IF_ERROR(
           cudaMallocManaged((void **)&ptr, bytes, cudaMemAttachGlobal));
diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp
index 15380ba148df..cf2c65dd4fac 100644
--- a/flang-rt/lib/runtime/environment.cpp
+++ b/flang-rt/lib/runtime/environment.cpp
@@ -155,6 +155,19 @@ void ExecutionEnvironment::Configure(int ac, const char *av[],
     }
   }
 
+  if (auto *x{std::getenv("NV_CUDAFOR_DEVICE_IS_MANAGED")}) {
+    char *end;
+    auto n{std::strtol(x, &end, 10)};
+    if (n >= 0 && n <= 1 && *end == '\0') {
+      cudaDeviceIsManaged = n != 0;
+    } else {
+      std::fprintf(stderr,
+          "Fortran runtime: NV_CUDAFOR_DEVICE_IS_MANAGED=%s is invalid; "
+          "ignored\n",
+          x);
+    }
+  }
+
   // TODO: Set RP/ROUND='PROCESSOR_DEFINED' from environment
 }