diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 70c90fae3408..205807eab403 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -1830,7 +1830,9 @@ static bool isDeviceAllocation(mlir::Value val, mlir::Value adaptorVal) {
         (callOp.getCallee().value().getRootReference().getValue().starts_with(
              RTNAME_STRING(CUFMemAlloc)) ||
          callOp.getCallee().value().getRootReference().getValue().starts_with(
-             RTNAME_STRING(CUFAllocDescriptor))))
+             RTNAME_STRING(CUFAllocDescriptor)) ||
+         callOp.getCallee().value().getRootReference().getValue() ==
+             "__tgt_acc_get_deviceptr"))
       return true;
   return false;
 }
@@ -3253,8 +3255,9 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
       if (auto callOp = mlir::dyn_cast_or_null<fir::CallOp>(
              inputBoxStorage.getDefiningOp())) {
         if (callOp.getCallee() &&
-            (*callOp.getCallee())
-                .starts_with(RTNAME_STRING(CUFAllocDescriptor))) {
+            ((*callOp.getCallee())
+                 .starts_with(RTNAME_STRING(CUFAllocDescriptor)) ||
+             (*callOp.getCallee()).starts_with("__tgt_acc_get_deviceptr"))) {
           // CUDA Fortran local descriptor are allocated in managed memory. So
           // new storage must be allocated the same way.
          auto mod = load->getParentOfType<mlir::ModuleOp>();
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index fdd9f1ac12b1..672be13beae2 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -204,3 +204,20 @@ func.func @_QMm1Psub1(%arg0: !fir.box<!fir.array<?xf32>> {cuf.data_attr = #cuf.c
 fir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
 
 // CHECK: llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+  func.func @_QQmain() attributes {fir.bindc_name = "cufkernel_global"} {
+    %c0 = arith.constant 0 : index
+    %3 = fir.call @__tgt_acc_get_deviceptr() : () -> !fir.ref<!fir.box<none>>
+    %4 = fir.convert %3 : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+    %5 = fir.load %4 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+    return
+  }
+
+  // CHECK-LABEL: llvm.func @_QQmain()
+  // CHECK: llvm.call @_FortranACUFAllocDescriptor
+
+  func.func private @__tgt_acc_get_deviceptr() -> !fir.ref<!fir.box<none>>
+}