//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements a pass to convert gpu.launch_func op into a sequence of // GPU runtime calls. As most of GPU runtimes does not have a stable published // ABI, this pass uses a slim runtime layer that builds on top of the public // API from GPU runtime headers. // //===----------------------------------------------------------------------===// #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "../PassDetail.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Function.h" #include "mlir/IR/Module.h" #include "mlir/IR/StandardTypes.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" using namespace mlir; // To avoid name mangling, these are defined in the mini-runtime file. static constexpr const char *kGpuModuleLoadName = "mgpuModuleLoad"; static constexpr const char *kGpuModuleGetFunctionName = "mgpuModuleGetFunction"; static constexpr const char *kGpuLaunchKernelName = "mgpuLaunchKernel"; static constexpr const char *kGpuStreamCreateName = "mgpuStreamCreate"; static constexpr const char *kGpuStreamSynchronizeName = "mgpuStreamSynchronize"; static constexpr const char *kGpuMemHostRegisterName = "mgpuMemHostRegister"; static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst"; namespace { /// A pass to convert gpu.launch_func operations into a sequence of GPU /// runtime calls. Currently it supports CUDA and ROCm (HIP). /// /// In essence, a gpu.launch_func operations gets compiled into the following /// sequence of runtime calls: /// /// * moduleLoad -- loads the module given the cubin / hsaco data /// * moduleGetFunction -- gets a handle to the actual kernel function /// * getStreamHelper -- initializes a new compute stream on GPU /// * launchKernel -- launches the kernel on a stream /// * streamSynchronize -- waits for operations on the stream to finish /// /// Intermediate data structures are allocated on the stack. class GpuLaunchFuncToGpuRuntimeCallsPass : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase< GpuLaunchFuncToGpuRuntimeCallsPass> { private: LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; } llvm::LLVMContext &getLLVMContext() { return getLLVMDialect()->getLLVMContext(); } void initializeCachedTypes() { const llvm::Module &module = llvmDialect->getLLVMModule(); llvmVoidType = LLVM::LLVMType::getVoidTy(llvmDialect); llvmPointerType = LLVM::LLVMType::getInt8PtrTy(llvmDialect); llvmPointerPointerType = llvmPointerType.getPointerTo(); llvmInt8Type = LLVM::LLVMType::getInt8Ty(llvmDialect); llvmInt32Type = LLVM::LLVMType::getInt32Ty(llvmDialect); llvmInt64Type = LLVM::LLVMType::getInt64Ty(llvmDialect); llvmIntPtrType = LLVM::LLVMType::getIntNTy( llvmDialect, module.getDataLayout().getPointerSizeInBits()); } LLVM::LLVMType getVoidType() { return llvmVoidType; } LLVM::LLVMType getPointerType() { return llvmPointerType; } LLVM::LLVMType getPointerPointerType() { return llvmPointerPointerType; } LLVM::LLVMType getInt8Type() { return llvmInt8Type; } LLVM::LLVMType getInt32Type() { return llvmInt32Type; } LLVM::LLVMType getInt64Type() { return llvmInt64Type; } LLVM::LLVMType getIntPtrType() { const llvm::Module &module = getLLVMDialect()->getLLVMModule(); return LLVM::LLVMType::getIntNTy( getLLVMDialect(), module.getDataLayout().getPointerSizeInBits()); } // Allocate a void pointer on the stack. Value allocatePointer(OpBuilder &builder, Location loc) { auto one = builder.create(loc, getInt32Type(), builder.getI32IntegerAttr(1)); return builder.create(loc, getPointerPointerType(), one, /*alignment=*/0); } void declareGpuRuntimeFunctions(Location loc); void addParamToList(OpBuilder &builder, Location loc, Value param, Value list, unsigned pos, Value one); Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder); Value generateKernelNameConstant(StringRef moduleName, StringRef name, Location loc, OpBuilder &builder); void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp); public: GpuLaunchFuncToGpuRuntimeCallsPass() = default; GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) { this->gpuBinaryAnnotation = gpuBinaryAnnotation.str(); } // Run the dialect converter on the module. void runOnOperation() override { // Cache the LLVMDialect for the current module. llvmDialect = getContext().getRegisteredDialect(); // Cache the used LLVM types. initializeCachedTypes(); getOperation().walk( [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); }); // GPU kernel modules are no longer necessary since we have a global // constant with the CUBIN, or HSACO data. for (auto m : llvm::make_early_inc_range(getOperation().getOps())) m.erase(); } private: LLVM::LLVMDialect *llvmDialect; LLVM::LLVMType llvmVoidType; LLVM::LLVMType llvmPointerType; LLVM::LLVMType llvmPointerPointerType; LLVM::LLVMType llvmInt8Type; LLVM::LLVMType llvmInt32Type; LLVM::LLVMType llvmInt64Type; LLVM::LLVMType llvmIntPtrType; }; } // anonymous namespace // Adds declarations for the needed helper functions from the runtime wrappers. // The types in comments give the actual types expected/returned but the API // uses void pointers. This is fine as they have the same linkage in C. void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions( Location loc) { ModuleOp module = getOperation(); OpBuilder builder(module.getBody()->getTerminator()); if (!module.lookupSymbol(kGpuModuleLoadName)) { builder.create( loc, kGpuModuleLoadName, LLVM::LLVMType::getFunctionTy(getPointerType(), {getPointerType()}, /* void *cubin */ /*isVarArg=*/false)); } if (!module.lookupSymbol(kGpuModuleGetFunctionName)) { // The helper uses void* instead of CUDA's opaque CUmodule and // CUfunction, or ROCm (HIP)'s opaque hipModule_t and hipFunction_t. builder.create( loc, kGpuModuleGetFunctionName, LLVM::LLVMType::getFunctionTy(getPointerType(), { getPointerType(), /* void *module */ getPointerType() /* char *name */ }, /*isVarArg=*/false)); } if (!module.lookupSymbol(kGpuLaunchKernelName)) { // Other than the CUDA or ROCm (HIP) api, the wrappers use uintptr_t to // match the LLVM type if MLIR's index type, which the GPU dialect uses. // Furthermore, they use void* instead of CUDA's opaque CUfunction and // CUstream, or ROCm (HIP)'s opaque hipFunction_t and hipStream_t. builder.create( loc, kGpuLaunchKernelName, LLVM::LLVMType::getFunctionTy( getVoidType(), { getPointerType(), /* void* f */ getIntPtrType(), /* intptr_t gridXDim */ getIntPtrType(), /* intptr_t gridyDim */ getIntPtrType(), /* intptr_t gridZDim */ getIntPtrType(), /* intptr_t blockXDim */ getIntPtrType(), /* intptr_t blockYDim */ getIntPtrType(), /* intptr_t blockZDim */ getInt32Type(), /* unsigned int sharedMemBytes */ getPointerType(), /* void *hstream */ getPointerPointerType(), /* void **kernelParams */ getPointerPointerType() /* void **extra */ }, /*isVarArg=*/false)); } if (!module.lookupSymbol(kGpuStreamCreateName)) { // Helper function to get the current GPU compute stream. Uses void* // instead of CUDA's opaque CUstream, or ROCm (HIP)'s opaque hipStream_t. builder.create( loc, kGpuStreamCreateName, LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false)); } if (!module.lookupSymbol(kGpuStreamSynchronizeName)) { builder.create( loc, kGpuStreamSynchronizeName, LLVM::LLVMType::getFunctionTy(getVoidType(), {getPointerType()}, /* void *stream */ /*isVarArg=*/false)); } if (!module.lookupSymbol(kGpuMemHostRegisterName)) { builder.create( loc, kGpuMemHostRegisterName, LLVM::LLVMType::getFunctionTy(getVoidType(), { getPointerType(), /* void *ptr */ getInt64Type() /* int64 sizeBytes*/ }, /*isVarArg=*/false)); } } /// Emits the IR with the following structure: /// /// %data = llvm.alloca 1 x type-of() /// llvm.store , %data /// %typeErased = llvm.bitcast %data to !llvm<"i8*"> /// %addr = llvm.getelementptr [] /// llvm.store %typeErased, %addr /// /// This is necessary to construct the list of arguments passed to the kernel /// function as accepted by cuLaunchKernel, i.e. as a void** that points to list /// of stack-allocated type-erased pointers to the actual arguments. void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder, Location loc, Value param, Value list, unsigned pos, Value one) { auto memLocation = builder.create( loc, param.getType().cast().getPointerTo(), one, /*alignment=*/1); builder.create(loc, param, memLocation); auto casted = builder.create(loc, getPointerType(), memLocation); auto index = builder.create(loc, getInt32Type(), builder.getI32IntegerAttr(pos)); auto gep = builder.create(loc, getPointerPointerType(), list, ArrayRef{index}); builder.create(loc, casted, gep); } // Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel // launch call. The arguments are extracted from the launchOp. // The generated code is essentially as follows: // // %array = alloca(numparams * sizeof(void *)) // for (i : [0, NumKernelOperands)) // %array[i] = cast(KernelOperand[i]) // return %array Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray( gpu::LaunchFuncOp launchOp, OpBuilder &builder) { // Get the launch target. auto gpuFunc = SymbolTable::lookupNearestSymbolFrom( launchOp, launchOp.kernel()); if (!gpuFunc) return {}; unsigned numArgs = gpuFunc.getNumArguments(); auto numKernelOperands = launchOp.getNumKernelOperands(); Location loc = launchOp.getLoc(); auto one = builder.create(loc, getInt32Type(), builder.getI32IntegerAttr(1)); auto arraySize = builder.create( loc, getInt32Type(), builder.getI32IntegerAttr(numArgs)); auto array = builder.create(loc, getPointerPointerType(), arraySize, /*alignment=*/0); unsigned pos = 0; for (unsigned idx = 0; idx < numKernelOperands; ++idx) { auto operand = launchOp.getKernelOperand(idx); auto llvmType = operand.getType().cast(); // Assume all struct arguments come from MemRef. If this assumption does not // hold anymore then we `launchOp` to lower from MemRefType and not after // LLVMConversion has taken place and the MemRef information is lost. if (!llvmType.isStructTy()) { addParamToList(builder, loc, operand, array, pos++, one); continue; } // Put individual components of a memref descriptor into the flat argument // list. We cannot use unpackMemref from LLVM lowering here because we have // no access to MemRefType that had been lowered away. for (int32_t j = 0, ej = llvmType.getStructNumElements(); j < ej; ++j) { auto elemType = llvmType.getStructElementType(j); if (elemType.isArrayTy()) { for (int32_t k = 0, ek = elemType.getArrayNumElements(); k < ek; ++k) { Value elem = builder.create( loc, elemType.getArrayElementType(), operand, builder.getI32ArrayAttr({j, k})); addParamToList(builder, loc, elem, array, pos++, one); } } else { assert((elemType.isIntegerTy() || elemType.isFloatTy() || elemType.isDoubleTy() || elemType.isPointerTy()) && "expected scalar type"); Value strct = builder.create( loc, elemType, operand, builder.getI32ArrayAttr(j)); addParamToList(builder, loc, strct, array, pos++, one); } } } return array; } // Generates an LLVM IR dialect global that contains the name of the given // kernel function as a C string, and returns a pointer to its beginning. // The code is essentially: // // llvm.global constant @kernel_name("function_name\00") // func(...) { // %0 = llvm.addressof @kernel_name // %1 = llvm.constant (0 : index) // %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*"> // } Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant( StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) { // Make sure the trailing zero is included in the constant. std::vector kernelName(name.begin(), name.end()); kernelName.push_back('\0'); std::string globalName = std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name)); return LLVM::createGlobalString( loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()), LLVM::Linkage::Internal, llvmDialect); } // Emits LLVM IR to launch a kernel function. Expects the module that contains // the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a // hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR. // // %0 = call %binarygetter // %1 = call %moduleLoad(%0) // %2 = // %3 = call %moduleGetFunction(%1, %2) // %4 = call %streamCreate() // %5 = // call %launchKernel(%3, , 0, %4, %5, nullptr) // call %streamSynchronize(%4) void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls( mlir::gpu::LaunchFuncOp launchOp) { OpBuilder builder(launchOp); Location loc = launchOp.getLoc(); declareGpuRuntimeFunctions(loc); auto zero = builder.create(loc, getInt32Type(), builder.getI32IntegerAttr(0)); // Create an LLVM global with CUBIN extracted from the kernel annotation and // obtain a pointer to the first byte in it. auto kernelModule = getOperation().lookupSymbol( launchOp.getKernelModuleName()); assert(kernelModule && "expected a kernel module"); auto binaryAttr = kernelModule.getAttrOfType(gpuBinaryAnnotation); if (!binaryAttr) { kernelModule.emitOpError() << "missing " << gpuBinaryAnnotation << " attribute"; return signalPassFailure(); } SmallString<128> nameBuffer(kernelModule.getName()); nameBuffer.append(kGpuBinaryStorageSuffix); Value data = LLVM::createGlobalString( loc, builder, nameBuffer.str(), binaryAttr.getValue(), LLVM::Linkage::Internal, getLLVMDialect()); // Emit the load module call to load the module data. Error checking is done // in the called helper function. auto gpuModuleLoad = getOperation().lookupSymbol(kGpuModuleLoadName); auto module = builder.create( loc, ArrayRef{getPointerType()}, builder.getSymbolRefAttr(gpuModuleLoad), ArrayRef{data}); // Get the function from the module. The name corresponds to the name of // the kernel function. auto kernelName = generateKernelNameConstant( launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder); auto gpuModuleGetFunction = getOperation().lookupSymbol(kGpuModuleGetFunctionName); auto function = builder.create( loc, ArrayRef{getPointerType()}, builder.getSymbolRefAttr(gpuModuleGetFunction), ArrayRef{module.getResult(0), kernelName}); // Grab the global stream needed for execution. auto gpuStreamCreate = getOperation().lookupSymbol(kGpuStreamCreateName); auto stream = builder.create( loc, ArrayRef{getPointerType()}, builder.getSymbolRefAttr(gpuStreamCreate), ArrayRef{}); // Invoke the function with required arguments. auto gpuLaunchKernel = getOperation().lookupSymbol(kGpuLaunchKernelName); auto paramsArray = setupParamsArray(launchOp, builder); if (!paramsArray) { launchOp.emitOpError() << "cannot pass given parameters to the kernel"; return signalPassFailure(); } auto nullpointer = builder.create(loc, getPointerPointerType(), zero); builder.create( loc, ArrayRef{getVoidType()}, builder.getSymbolRefAttr(gpuLaunchKernel), ArrayRef{function.getResult(0), launchOp.getOperand(0), launchOp.getOperand(1), launchOp.getOperand(2), launchOp.getOperand(3), launchOp.getOperand(4), launchOp.getOperand(5), zero, /* sharedMemBytes */ stream.getResult(0), /* stream */ paramsArray, /* kernel params */ nullpointer /* extra */}); // Sync on the stream to make it synchronous. auto gpuStreamSync = getOperation().lookupSymbol(kGpuStreamSynchronizeName); builder.create(loc, ArrayRef{getVoidType()}, builder.getSymbolRefAttr(gpuStreamSync), ArrayRef(stream.getResult(0))); launchOp.erase(); } std::unique_ptr> mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass( StringRef gpuBinaryAnnotation) { if (gpuBinaryAnnotation.empty()) return std::make_unique(); return std::make_unique( gpuBinaryAnnotation); }