From 478e5161406a781afc41e15bf942fb5df6672067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 6 Mar 2025 19:19:51 -0800 Subject: [PATCH] [flang][cuda] Sync double descriptor after c_f_pointer call (#130194) After a global device pointer is set through `c_f_pointer`, we need to sync the double descriptor so the version on the device is also up to date. --- flang/include/flang/Lower/Cuda.h | 21 --------- .../flang/Optimizer/Builder/CUFCommon.h | 4 +- .../Builder/Runtime/CUDA/Descriptor.h | 31 +++++++++++++ flang/lib/Lower/Allocatable.cpp | 4 +- flang/lib/Lower/Bridge.cpp | 2 +- flang/lib/Optimizer/Builder/CMakeLists.txt | 1 + flang/lib/Optimizer/Builder/CUFCommon.cpp | 44 +++++++++++++------ flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 14 ++++++ .../Builder/Runtime/CUDA/Descriptor.cpp | 34 ++++++++++++++ .../Optimizer/Transforms/CUFOpConversion.cpp | 12 +---- .../Transforms/SimplifyIntrinsics.cpp | 2 +- flang/test/Lower/CUDA/cuda-pointer.cuf | 23 +++++++++- 12 files changed, 141 insertions(+), 51 deletions(-) create mode 100644 flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h create mode 100644 flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp diff --git a/flang/include/flang/Lower/Cuda.h b/flang/include/flang/Lower/Cuda.h index d97045383d19..b6f849e3d63f 100644 --- a/flang/include/flang/Lower/Cuda.h +++ b/flang/include/flang/Lower/Cuda.h @@ -20,27 +20,6 @@ #include "mlir/Dialect/OpenACC/OpenACC.h" namespace Fortran::lower { -// Check if the insertion point is currently in a device context. HostDevice -// subprogram are not considered fully device context so it will return false -// for it. -// If the insertion point is inside an OpenACC region op, it is considered -// device context. 
-static bool inline isCudaDeviceContext(fir::FirOpBuilder &builder) { - if (builder.getRegion().getParentOfType()) - return true; - if (builder.getRegion() - .getParentOfType()) - return true; - if (auto funcOp = builder.getRegion().getParentOfType()) { - if (auto cudaProcAttr = - funcOp.getOperation()->getAttrOfType( - cuf::getProcAttrName())) { - return cudaProcAttr.getValue() != cuf::ProcAttribute::Host && - cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice; - } - } - return false; -} static inline unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) { std::optional cudaAttr = diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h index b99e33042962..e3c7b5098b83 100644 --- a/flang/include/flang/Optimizer/Builder/CUFCommon.h +++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h @@ -25,8 +25,10 @@ namespace cuf { mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod, mlir::SymbolTable &symTab); -bool isInCUDADeviceContext(mlir::Operation *op); +bool isCUDADeviceContext(mlir::Operation *op); +bool isCUDADeviceContext(mlir::Region &); bool isRegisteredDeviceGlobal(fir::GlobalOp op); +bool isRegisteredDeviceAttr(std::optional attr); void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h new file mode 100644 index 000000000000..14d262bf22a7 --- /dev/null +++ b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h @@ -0,0 +1,31 @@ +//===-- Descriptor.h - CUDA descriptor runtime API calls --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_ +#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_ + +#include "mlir/IR/Value.h" + +namespace mlir { +class Location; +} // namespace mlir + +namespace fir { +class FirOpBuilder; +} + +namespace fir::runtime::cuda { + +/// Generate runtime call to sync the double descriptor referenced by +/// \p hostPtr. +void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value hostPtr); + +} // namespace fir::runtime::cuda + +#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_ diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp index 3d21e7a3fa8d..9938bd573d1f 100644 --- a/flang/lib/Lower/Allocatable.cpp +++ b/flang/lib/Lower/Allocatable.cpp @@ -470,7 +470,7 @@ private: void genSimpleAllocation(const Allocation &alloc, const fir::MutableBoxValue &box) { bool isCudaSymbol = Fortran::semantics::HasCUDAAttr(alloc.getSymbol()); - bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder); + bool isCudaDeviceContext = cuf::isCUDADeviceContext(builder.getRegion()); bool inlineAllocation = !box.isDerived() && !errorManager.hasStatSpec() && !alloc.type.IsPolymorphic() && !alloc.hasCoarraySpec() && !useAllocateRuntime && @@ -862,7 +862,7 @@ genDeallocate(fir::FirOpBuilder &builder, mlir::Value declaredTypeDesc = {}, const Fortran::semantics::Symbol *symbol = nullptr) { bool isCudaSymbol = symbol && Fortran::semantics::HasCUDAAttr(*symbol); - bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder); + bool isCudaDeviceContext = cuf::isCUDADeviceContext(builder.getRegion()); bool inlineDeallocation = !box.isDerived() && !box.isPolymorphic() && !box.hasAssumedRank() && !box.isUnlimitedPolymorphic() && !errorManager.hasStatSpec() && diff --git a/flang/lib/Lower/Bridge.cpp 
b/flang/lib/Lower/Bridge.cpp index 95f431983d44..e368974c92a3 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -4689,7 +4689,7 @@ private: mlir::Location loc = getCurrentLocation(); fir::FirOpBuilder &builder = getFirOpBuilder(); - bool isInDeviceContext = Fortran::lower::isCudaDeviceContext(builder); + bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion()); bool isCUDATransfer = IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext; diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt index f0563d092e3d..31ae395805fa 100644 --- a/flang/lib/Optimizer/Builder/CMakeLists.txt +++ b/flang/lib/Optimizer/Builder/CMakeLists.txt @@ -18,6 +18,7 @@ add_flang_library(FIRBuilder Runtime/Assign.cpp Runtime/Character.cpp Runtime/Command.cpp + Runtime/CUDA/Descriptor.cpp Runtime/Derived.cpp Runtime/EnvironmentDefaults.cpp Runtime/Exceptions.cpp diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp index 39848205f47a..5f286c04a7ca 100644 --- a/flang/lib/Optimizer/Builder/CUFCommon.cpp +++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp @@ -12,6 +12,7 @@ #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" /// Retrieve or create the CUDA Fortran GPU module in the give in \p mod. mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod, @@ -31,30 +32,45 @@ mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod, return gpuMod; } -bool cuf::isInCUDADeviceContext(mlir::Operation *op) { - if (!op) +bool cuf::isCUDADeviceContext(mlir::Operation *op) { + if (!op || !op->getParentRegion()) return false; - if (op->getParentOfType() || - op->getParentOfType()) + return isCUDADeviceContext(*op->getParentRegion()); +} + +// Check if the insertion point is currently in a device context. 
HostDevice +// subprogram are not considered fully device context so it will return false +// for it. +// If the insertion point is inside an OpenACC region op, it is considered +// device context. +bool cuf::isCUDADeviceContext(mlir::Region &region) { + if (region.getParentOfType()) return true; - if (auto funcOp = op->getParentOfType()) { - if (auto cudaProcAttr = funcOp->getAttrOfType( - cuf::getProcAttrName())) { - return cudaProcAttr.getValue() != cuf::ProcAttribute::Host; + if (region.getParentOfType()) + return true; + if (auto funcOp = region.getParentOfType()) { + if (auto cudaProcAttr = + funcOp.getOperation()->getAttrOfType( + cuf::getProcAttrName())) { + return cudaProcAttr.getValue() != cuf::ProcAttribute::Host && + cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice; } } return false; } +bool cuf::isRegisteredDeviceAttr(std::optional attr) { + if (attr && (*attr == cuf::DataAttribute::Device || + *attr == cuf::DataAttribute::Managed || + *attr == cuf::DataAttribute::Constant)) + return true; + return false; +} + bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) { if (op.getConstant()) return false; - auto attr = op.getDataAttr(); - if (attr && (*attr == cuf::DataAttribute::Device || - *attr == cuf::DataAttribute::Managed || - *attr == cuf::DataAttribute::Constant)) - return true; - return false; + return isRegisteredDeviceAttr(op.getDataAttr()); } void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) { diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index c52b0cbaf201..ede3be074a82 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -16,12 +16,14 @@ #include "flang/Optimizer/Builder/IntrinsicCall.h" #include "flang/Common/static-multimap-view.h" #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/Character.h" #include 
"flang/Optimizer/Builder/Complex.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/MutableBox.h" #include "flang/Optimizer/Builder/PPCIntrinsicCall.h" #include "flang/Optimizer/Builder/Runtime/Allocatable.h" +#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h" #include "flang/Optimizer/Builder/Runtime/Character.h" #include "flang/Optimizer/Builder/Runtime/Command.h" #include "flang/Optimizer/Builder/Runtime/Derived.h" @@ -38,6 +40,7 @@ #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/FatalError.h" #include "flang/Optimizer/Support/Utils.h" #include "flang/Runtime/entry-names.h" @@ -3254,6 +3257,17 @@ void IntrinsicLibrary::genCFPointer(llvm::ArrayRef args) { fir::factory::associateMutableBox(builder, loc, *fPtr, getCPtrExtVal(*fPtr), /*lbounds=*/mlir::ValueRange{}); + + // If the pointer is a registered CUDA Fortran variable, the descriptor needs + // to be synced. + if (auto declare = mlir::dyn_cast_or_null( + fPtr->getAddr().getDefiningOp())) + if (declare.getMemref().getDefiningOp() && + mlir::isa(declare.getMemref().getDefiningOp())) + if (cuf::isRegisteredDeviceAttr(declare.getDataAttr()) && + !cuf::isCUDADeviceContext(builder.getRegion())) + fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc, + declare.getMemref()); } // C_F_PROCPOINTER diff --git a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp new file mode 100644 index 000000000000..90662c094c65 --- /dev/null +++ b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp @@ -0,0 +1,34 @@ + +//===-- Descriptor.cpp -- CUDA descriptor runtime API calls ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Runtime/RTBuilder.h" +#include "flang/Runtime/CUDA/descriptor.h" + +using namespace Fortran::runtime::cuda; + +void fir::runtime::cuda::genSyncGlobalDescriptor(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value hostPtr) { + mlir::func::FuncOp callee = + fir::runtime::getRuntimeFunc(loc, + builder); + auto fTy = callee.getFunctionType(); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); + llvm::SmallVector args{fir::runtime::createArguments( + builder, loc, fTy, hostPtr, sourceFile, sourceLine)}; + builder.create(loc, callee, args); +} diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 2ab2d84f1643..0fbec8a204b8 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -8,6 +8,7 @@ #include "flang/Optimizer/Transforms/CUFOpConversion.h" #include "flang/Optimizer/Builder/CUFCommon.h" +#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/CodeGen/TypeConverter.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" @@ -904,16 +905,7 @@ struct CUFSyncDescriptorOpConversion auto hostAddr = builder.create( loc, fir::ReferenceType::get(globalOp.getType()), op.getGlobalName()); - mlir::func::FuncOp callee = - fir::runtime::getRuntimeFunc(loc, - builder); - auto fTy = 
callee.getFunctionType(); - mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); - mlir::Value sourceLine = - fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); - llvm::SmallVector args{fir::runtime::createArguments( - builder, loc, fTy, hostAddr, sourceFile, sourceLine)}; - builder.create(loc, callee, args); + fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc, hostAddr); op.erase(); return mlir::success(); } diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index 3baac62dd69f..dd4d1783dac3 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -1279,7 +1279,7 @@ void SimplifyIntrinsicsPass::runOnOperation() { fir::KindMapping kindMap = fir::getKindMapping(module); module.walk([&](mlir::Operation *op) { if (auto call = mlir::dyn_cast(op)) { - if (cuf::isInCUDADeviceContext(op)) + if (cuf::isCUDADeviceContext(op)) return; if (mlir::SymbolRefAttr callee = call.getCalleeAttr()) { mlir::StringRef funcName = callee.getLeafReference().getValue(); diff --git a/flang/test/Lower/CUDA/cuda-pointer.cuf b/flang/test/Lower/CUDA/cuda-pointer.cuf index 2a9dbe54c292..e9614751673e 100644 --- a/flang/test/Lower/CUDA/cuda-pointer.cuf +++ b/flang/test/Lower/CUDA/cuda-pointer.cuf @@ -2,10 +2,31 @@ ! Test lowering of CUDA pointers. +module mod1 + +integer, device, pointer :: x(:) + +contains + subroutine allocate_pointer real, device, pointer :: pr(:) allocate(pr(10)) end -! CHECK-LABEL: func.func @_QPallocate_pointer() +! CHECK-LABEL: func.func @_QMmod1Pallocate_pointer() ! 
CHECK-COUNT-2: fir.embox %{{.*}} {allocator_idx = 2 : i32} : (!fir.ptr>, !fir.shape<1>) -> !fir.box>> + +subroutine c_f_pointer_sync + use iso_c_binding + use, intrinsic :: __fortran_builtins, only: c_devptr => __builtin_c_devptr + type(c_devptr) :: cd1 + integer, parameter :: N = 2000 + call c_f_pointer(cd1, x, (/ 2000 /)) +end + +! CHECK-LABEL: func.func @_QMmod1Pc_f_pointer_sync() +! CHECK: %[[ADDR_X:.*]] = fir.address_of(@_QMmod1Ex) : !fir.ref>>> +! CHECK: %[[CONV:.*]] = fir.convert %[[ADDR_X]] : (!fir.ref>>>) -> !fir.llvm_ptr +! CHECK: fir.call @_FortranACUFSyncGlobalDescriptor(%[[CONV]], %{{.*}}, %{{.*}}) fastmath : (!fir.llvm_ptr, !fir.ref, i32) -> () + +end module