From 0b31f08537746beff4d5e0df44221cbe5a9237c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 31 Mar 2025 13:17:21 -0700 Subject: [PATCH] [flang][cuda] Add support for NV_CUDAFOR_DEVICE_IS_MANAGED (#133778) Add support for the environment variable `NV_CUDAFOR_DEVICE_IS_MANAGED` as described in the documentation: https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#controlling-device-data-is-managed. This mainly switch device allocation to managed allocation. --- flang-rt/include/flang-rt/runtime/environment.h | 1 + flang-rt/lib/cuda/allocator.cpp | 8 +++++++- flang-rt/lib/cuda/memory.cpp | 8 +++++++- flang-rt/lib/runtime/environment.cpp | 13 +++++++++++++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/flang-rt/include/flang-rt/runtime/environment.h b/flang-rt/include/flang-rt/runtime/environment.h index 142add432b5f..ca6c2a7d4448 100644 --- a/flang-rt/include/flang-rt/runtime/environment.h +++ b/flang-rt/include/flang-rt/runtime/environment.h @@ -59,6 +59,7 @@ struct ExecutionEnvironment { // CUDA related variables std::size_t cudaStackLimit{0}; // ACC_OFFLOAD_STACK_SIZE + bool cudaDeviceIsManaged{false}; // NV_CUDAFOR_DEVICE_IS_MANAGED }; RT_OFFLOAD_VAR_GROUP_BEGIN diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp index 4199bf04b33f..d6529957bc93 100644 --- a/flang-rt/lib/cuda/allocator.cpp +++ b/flang-rt/lib/cuda/allocator.cpp @@ -9,6 +9,7 @@ #include "flang/Runtime/CUDA/allocator.h" #include "flang-rt/runtime/allocator-registry.h" #include "flang-rt/runtime/derived.h" +#include "flang-rt/runtime/environment.h" #include "flang-rt/runtime/stat.h" #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/type-info.h" @@ -43,7 +44,12 @@ void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); } void *CUFAllocDevice(std::size_t sizeInBytes) { void *p; - CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes)); + if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) { + CUDA_REPORT_IF_ERROR( + cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal)); + } else { + CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes)); + } return p; } diff --git a/flang-rt/lib/cuda/memory.cpp b/flang-rt/lib/cuda/memory.cpp index adc24ff22372..766f6847946c 100644 --- a/flang-rt/lib/cuda/memory.cpp +++ b/flang-rt/lib/cuda/memory.cpp @@ -9,6 +9,7 @@ #include "flang/Runtime/CUDA/memory.h" #include "flang-rt/runtime/assign-impl.h" #include "flang-rt/runtime/descriptor.h" +#include "flang-rt/runtime/environment.h" #include "flang-rt/runtime/terminator.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" @@ -26,7 +27,12 @@ void *RTDEF(CUFMemAlloc)( void *ptr = nullptr; if (bytes != 0) { if (type == kMemTypeDevice) { - CUDA_REPORT_IF_ERROR(cudaMalloc((void **)&ptr, bytes)); + if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) { + CUDA_REPORT_IF_ERROR( + cudaMallocManaged((void **)&ptr, bytes, cudaMemAttachGlobal)); + } else { + CUDA_REPORT_IF_ERROR(cudaMalloc((void **)&ptr, bytes)); + } } else if (type == kMemTypeManaged || type == kMemTypeUnified) { CUDA_REPORT_IF_ERROR( cudaMallocManaged((void **)&ptr, bytes, cudaMemAttachGlobal)); diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp index 15380ba148df..cf2c65dd4fac 100644 --- a/flang-rt/lib/runtime/environment.cpp +++ b/flang-rt/lib/runtime/environment.cpp @@ -155,6 +155,19 @@ void ExecutionEnvironment::Configure(int ac, const char *av[], } } + if (auto *x{std::getenv("NV_CUDAFOR_DEVICE_IS_MANAGED")}) { + char *end; + auto n{std::strtol(x, &end, 10)}; + if (n >= 0 && n <= 1 && *end == '\0') { + cudaDeviceIsManaged = n != 0; + } else { + std::fprintf(stderr, + "Fortran runtime: NV_CUDAFOR_DEVICE_IS_MANAGED=%s is invalid; " + "ignored\n", + x); + } + } + // TODO: Set RP/ROUND='PROCESSOR_DEFINED' from environment }