[flang][cuda] Allocate extra descriptor in managed memory when it is coming from device (#140818)

This commit is contained in:
Valentin Clement (バレンタイン クレメン)
2025-05-20 18:55:13 -07:00
committed by GitHub
parent 7b51339387
commit 6811a3bedf
2 changed files with 23 additions and 3 deletions

View File

@@ -1830,7 +1830,9 @@ static bool isDeviceAllocation(mlir::Value val, mlir::Value adaptorVal) {
(callOp.getCallee().value().getRootReference().getValue().starts_with(
RTNAME_STRING(CUFMemAlloc)) ||
callOp.getCallee().value().getRootReference().getValue().starts_with(
RTNAME_STRING(CUFAllocDescriptor))))
RTNAME_STRING(CUFAllocDescriptor)) ||
callOp.getCallee().value().getRootReference().getValue() ==
"__tgt_acc_get_deviceptr"))
return true;
return false;
}
@@ -3253,8 +3255,9 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
if (auto callOp = mlir::dyn_cast_or_null<mlir::LLVM::CallOp>(
inputBoxStorage.getDefiningOp())) {
if (callOp.getCallee() &&
(*callOp.getCallee())
.starts_with(RTNAME_STRING(CUFAllocDescriptor))) {
((*callOp.getCallee())
.starts_with(RTNAME_STRING(CUFAllocDescriptor)) ||
(*callOp.getCallee()).starts_with("__tgt_acc_get_deviceptr"))) {
// CUDA Fortran local descriptors are allocated in managed memory, so
// new storage must be allocated the same way.
auto mod = load->getParentOfType<mlir::ModuleOp>();

View File

@@ -204,3 +204,20 @@ func.func @_QMm1Psub1(%arg0: !fir.box<!fir.array<?xi32>> {cuf.data_attr = #cuf.c
fir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
// CHECK: llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
// -----
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
// Test input: a descriptor reference obtained from @__tgt_acc_get_deviceptr is
// retyped and then loaded. The FileCheck directives that follow this function
// expect the lowered output to contain a call to _FortranACUFAllocDescriptor.
func.func @_QQmain() attributes {fir.bindc_name = "cufkernel_global"} {
// NOTE(review): %c0 is not referenced by any later operation in this function.
%c0 = arith.constant 0 : index
// Call whose result is the storage of the box that gets loaded below.
%3 = fir.call @__tgt_acc_get_deviceptr() : () -> !fir.ref<!fir.box<none>>
// Retype the untyped box reference as a reference to a box of a heap i32 array.
%4 = fir.convert %3 : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
// Load of the box; this is the operation the test exercises. Its result is unused.
%5 = fir.load %4 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
return
}
// CHECK-LABEL: llvm.func @_QQmain()
// CHECK: llvm.call @_FortranACUFAllocDescriptor
func.func private @__tgt_acc_get_deviceptr() -> !fir.ref<!fir.box<none>>
}