diff --git a/flang-rt/lib/cuda/kernel.cpp b/flang-rt/lib/cuda/kernel.cpp
index 75eb639817b9..6b60b72630a1 100644
--- a/flang-rt/lib/cuda/kernel.cpp
+++ b/flang-rt/lib/cuda/kernel.cpp
@@ -16,7 +16,7 @@ extern "C" {
 
 void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
-    int32_t smem, void **params, void **extra) {
+    intptr_t stream, int32_t smem, void **params, void **extra) {
   dim3 gridDim;
   gridDim.x = gridX;
   gridDim.y = gridY;
@@ -74,15 +74,15 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
     Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
     terminator.Crash("Too many invalid grid dimensions");
   }
-  cudaStream_t stream = 0; // TODO stream managment
+  cudaStream_t cuStream = 0; // TODO stream managment
   CUDA_REPORT_IF_ERROR(
-      cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, stream));
+      cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, cuStream));
 }
 
 void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
     intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
-    int32_t smem, void **params, void **extra) {
+    intptr_t stream, int32_t smem, void **params, void **extra) {
   cudaLaunchConfig_t config;
   config.gridDim.x = gridX;
   config.gridDim.y = gridY;
@@ -153,7 +153,8 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
 
 void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
     intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
-    intptr_t blockZ, int32_t smem, void **params, void **extra) {
+    intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
+    void **extra) {
   dim3 gridDim;
   gridDim.x = gridX;
   gridDim.y = gridY;
@@ -211,9 +212,9 @@ void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
     Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
     terminator.Crash("Too many invalid grid dimensions");
   }
-  cudaStream_t stream = 0; // TODO stream managment
+  cudaStream_t cuStream = 0; // TODO stream managment
   CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(
-      kernel, gridDim, blockDim, params, smem, stream));
+      kernel, gridDim, blockDim, params, smem, cuStream));
 }
 
 } // extern "C"
diff --git a/flang/include/flang/Runtime/CUDA/kernel.h b/flang/include/flang/Runtime/CUDA/kernel.h
index 1f812b580327..eb9135868fde 100644
--- a/flang/include/flang/Runtime/CUDA/kernel.h
+++ b/flang/include/flang/Runtime/CUDA/kernel.h
@@ -21,16 +21,18 @@ extern "C" {
 
 void RTDEF(CUFLaunchKernel)(const void *kernelName, intptr_t gridX,
     intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
-    intptr_t blockZ, int32_t smem, void **params, void **extra);
+    intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
+    void **extra);
 
 void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
     intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
-    int32_t smem, void **params, void **extra);
+    intptr_t stream, int32_t smem, void **params, void **extra);
 
 void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX,
     intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
-    intptr_t blockZ, int32_t smem, void **params, void **extra);
+    intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
+    void **extra);
 
 } // extern "C"
 
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 74260c5b5c2a..205acbfea22b 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -121,7 +121,7 @@ struct GPULaunchKernelConversion
           voidTy,
           {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
-          llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+          llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
           /*isVarArg=*/false);
       auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
           mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
@@ -133,6 +133,10 @@ struct GPULaunchKernelConversion
         launchKernelFuncOp.setVisibility(
             mlir::SymbolTable::Visibility::Private);
       }
+      mlir::Value stream = adaptor.getAsyncObject();
+      if (!stream)
+        stream = rewriter.create<mlir::LLVM::ConstantOp>(
+            loc, llvmIntPtrType, rewriter.getIntegerAttr(llvmIntPtrType, -1));
       rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
           op, funcTy, cufLaunchClusterKernel,
           mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
@@ -140,7 +144,7 @@ struct GPULaunchKernelConversion
               adaptor.getGridSizeX(), adaptor.getGridSizeY(),
               adaptor.getGridSizeZ(), adaptor.getBlockSizeX(),
               adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
-              dynamicMemorySize, kernelArgs, nullPtr});
+              stream, dynamicMemorySize, kernelArgs, nullPtr});
     } else {
       auto procAttr =
           op->getAttrOfType<cuf::ProcAttr>(cuf::getProcAttrName());
@@ -153,7 +157,8 @@ struct GPULaunchKernelConversion
       auto funcTy = mlir::LLVM::LLVMFunctionType::get(
           voidTy,
           {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
-           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+           i32Ty, ptrTy, ptrTy},
           /*isVarArg=*/false);
       auto cufLaunchKernel =
           mlir::SymbolRefAttr::get(mod.getContext(), fctName);
@@ -165,12 +170,18 @@ struct GPULaunchKernelConversion
         launchKernelFuncOp.setVisibility(
             mlir::SymbolTable::Visibility::Private);
       }
+
+      mlir::Value stream = adaptor.getAsyncObject();
+      if (!stream)
+        stream = rewriter.create<mlir::LLVM::ConstantOp>(
+            loc, llvmIntPtrType, rewriter.getIntegerAttr(llvmIntPtrType, -1));
+
       rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
           op, funcTy, cufLaunchKernel,
           mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
              adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
              adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
-             adaptor.getBlockSizeZ(), dynamicMemorySize,
+             adaptor.getBlockSizeZ(), stream, dynamicMemorySize,
              kernelArgs, nullPtr});
     }
 
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
index 0827e378c7c0..85266f17bb67 100644
--- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -113,7 +113,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : ve
 // -----
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
-  llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
+  llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
   llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
     llvm.return
   }