Files
clang-p2996/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
Nicolas Vasilache 7c4e8c6a27 [mlir] Disentangle dialect and extension registrations.
This revision avoids the registration of dialect extensions in Pass::getDependentDialects.

Such registration of extensions can be dangerous because `DialectRegistry::isSubsetOf` is
always guaranteed to return false for extensions (i.e. there is no mechanism to track
whether a lambda is already in the list of registered extensions).
When the context is already in a multi-threaded mode, this is guaranteed to assert.

Arguably a more structured registration mechanism for extensions with a unique ExtensionID
could be envisioned in the future.

In the process of cleaning this up, multiple usage inconsistencies surfaced around the
registration of translation extensions that this revision also cleans up.

Reviewed By: springerm

Differential Revision: https://reviews.llvm.org/D157703
2023-08-22 00:40:09 +00:00

372 lines
13 KiB
C++

//===- ObjectHandler.cpp - Implements base ObjectManager attributes -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the `OffloadingLLVMTranslationAttrInterface` for the
// `SelectObject` attribute.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Export.h"
#include "mlir/Target/LLVMIR/ModuleTranslation.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/FormatVariadic.h"
using namespace mlir;
namespace {
// Implementation of the `OffloadingLLVMTranslationAttrInterface` model for the
// `#gpu.select_object` attribute. Attached as an external (fallback) model so
// the attribute itself stays free of LLVM translation dependencies.
class SelectObjectAttrImpl
    : public gpu::OffloadingLLVMTranslationAttrInterface::FallbackModel<
          SelectObjectAttrImpl> {
public:
  // Translates a `gpu.binary`, embedding the binary into a host LLVM module as
  // global binary string.
  LogicalResult embedBinary(Attribute attribute, Operation *operation,
                            llvm::IRBuilderBase &builder,
                            LLVM::ModuleTranslation &moduleTranslation) const;

  // Translates a `gpu.launch_func` to a sequence of LLVM instructions resulting
  // in a kernel launch call.
  LogicalResult launchKernel(Attribute attribute,
                             Operation *launchFuncOperation,
                             Operation *binaryOperation,
                             llvm::IRBuilderBase &builder,
                             LLVM::ModuleTranslation &moduleTranslation) const;
};

// Returns an identifier for the global string holding the binary; the same
// scheme is used by `embedBinary` (producer) and `createKernelLaunch`
// (consumer) to find the embedded object.
std::string getBinaryIdentifier(StringRef binaryName) {
  return binaryName.str() + "_bin_cst";
}
} // namespace
// Registers the external translation model on the `#gpu.select_object`
// attribute; the extension runs once the GPU dialect is loaded into the
// context.
void mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(
    DialectRegistry &registry) {
  auto attachModel = +[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
    SelectObjectAttr::attachInterface<SelectObjectAttrImpl>(*ctx);
  };
  registry.addExtension(attachModel);
}
// Embeds the selected GPU object from a `gpu.binary` op into the host LLVM
// module as an internal constant global string named `<binary>_bin_cst`.
// Returns failure if `operation` is not a `gpu.binary`, if the requested
// target cannot be found, or if the selected array entry is not an object
// attribute.
LogicalResult SelectObjectAttrImpl::embedBinary(
    Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
    LLVM::ModuleTranslation &moduleTranslation) const {
  assert(operation && "The binary operation must be non null.");
  if (!operation)
    return failure();

  auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
  if (!op) {
    operation->emitError("Operation must be a GPU binary.");
    return failure();
  }

  ArrayRef<Attribute> objects = op.getObjectsAttr().getValue();

  // Obtain the index of the object to select.
  int64_t index = -1;
  if (Attribute target = cast<gpu::SelectObjectAttr>(attribute).getTarget()) {
    // If the target attribute is a number it is the index. Otherwise compare
    // the attribute to every target inside the object array to find the index.
    if (auto indexAttr = mlir::dyn_cast<IntegerAttr>(target)) {
      index = indexAttr.getInt();
    } else {
      for (auto [i, attr] : llvm::enumerate(objects)) {
        // Guard the cast: skip entries that are not `gpu::ObjectAttr` instead
        // of dereferencing a null attribute (the original unconditionally
        // called `obj.getTarget()`).
        auto obj = mlir::dyn_cast<gpu::ObjectAttr>(attr);
        if (obj && obj.getTarget() == target) {
          index = i;
          break;
        }
      }
    }
  } else {
    // If the target attribute is null then it's selecting the first object in
    // the object array.
    index = 0;
  }

  if (index < 0 || index >= static_cast<int64_t>(objects.size())) {
    op->emitError("The requested target object couldn't be found.");
    return failure();
  }

  // Validate the selected entry before using it; a malformed objects array
  // would otherwise crash on a null `ObjectAttr`.
  auto object = mlir::dyn_cast<gpu::ObjectAttr>(objects[index]);
  if (!object) {
    op->emitError("The selected object is not a GPU object attribute.");
    return failure();
  }

  llvm::Module *module = moduleTranslation.getLLVMModule();
  // Embed the object as a global string (not null-terminated; the size is
  // carried by the constant's type).
  llvm::Constant *binary = llvm::ConstantDataArray::getString(
      builder.getContext(), object.getObject().getValue(), false);
  llvm::GlobalVariable *serializedObj =
      new llvm::GlobalVariable(*module, binary->getType(), true,
                               llvm::GlobalValue::LinkageTypes::InternalLinkage,
                               binary, getBinaryIdentifier(op.getName()));
  serializedObj->setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage);
  serializedObj->setAlignment(llvm::MaybeAlign(8));
  serializedObj->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
  return success();
}
namespace llvm {
namespace {
// Helper that emits the LLVM IR for a kernel launch in terms of the `mgpu*`
// runtime wrapper functions (declared on demand in the host module).
class LaunchKernel {
public:
  LaunchKernel(Module &module, IRBuilderBase &builder,
               mlir::LLVM::ModuleTranslation &moduleTranslation);

  // Get the kernel launch callee.
  FunctionCallee getKernelLaunchFn();
  // Get the module function callee.
  FunctionCallee getModuleFunctionFn();
  // Get the module load callee.
  FunctionCallee getModuleLoadFn();
  // Get the module unload callee.
  FunctionCallee getModuleUnloadFn();
  // Get the stream create callee.
  FunctionCallee getStreamCreateFn();
  // Get the stream destroy callee.
  FunctionCallee getStreamDestroyFn();
  // Get the stream sync callee.
  FunctionCallee getStreamSyncFn();
  // Get or create the function name global string.
  Value *getOrCreateFunctionName(StringRef moduleName, StringRef kernelName);
  // Create the void* kernel array for passing the arguments.
  Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op);
  // Create the full kernel launch.
  mlir::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op);

private:
  Module &module;
  IRBuilderBase &builder;
  mlir::LLVM::ModuleTranslation &moduleTranslation;
  // Frequently used LLVM types, cached by the constructor.
  Type *i32Ty{};
  Type *voidTy{};
  Type *intPtrTy{};
  PointerType *ptrTy{};
};
} // namespace
} // namespace llvm
// Lowers a `gpu.launch_func` to LLVM IR by delegating to the `LaunchKernel`
// helper; fails if the operation is not a launch-func op.
LogicalResult SelectObjectAttrImpl::launchKernel(
    Attribute attribute, Operation *launchFuncOperation,
    Operation *binaryOperation, llvm::IRBuilderBase &builder,
    LLVM::ModuleTranslation &moduleTranslation) const {
  assert(launchFuncOperation && "The launch func operation must be non null.");
  if (!launchFuncOperation)
    return failure();

  auto launchFuncOp = mlir::dyn_cast<gpu::LaunchFuncOp>(launchFuncOperation);
  if (!launchFuncOp) {
    launchFuncOperation->emitError("Operation must be a GPU launch func Op.");
    return failure();
  }

  // Build the helper against the host LLVM module and emit the launch.
  llvm::LaunchKernel launcher(*moduleTranslation.getLLVMModule(), builder,
                              moduleTranslation);
  return launcher.createKernelLaunch(launchFuncOp);
}
// Caches the LLVM types used by every emitted call so the getters and the
// launch builder don't have to recompute them.
llvm::LaunchKernel::LaunchKernel(
    Module &module, IRBuilderBase &builder,
    mlir::LLVM::ModuleTranslation &moduleTranslation)
    : module(module), builder(builder), moduleTranslation(moduleTranslation) {
  voidTy = builder.getVoidTy();
  i32Ty = builder.getInt32Ty();
  // intptr-sized integer follows the module's data layout.
  intPtrTy = builder.getIntPtrTy(module.getDataLayout());
  ptrTy = builder.getPtrTy(/*AddrSpace=*/0);
}
// Declares (or fetches) `mgpuLaunchKernel(function, gridXYZ, blockXYZ,
// dynSharedMem, stream, params, extra) -> void`.
llvm::FunctionCallee llvm::LaunchKernel::getKernelLaunchFn() {
  Type *params[] = {ptrTy,    intPtrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
                    intPtrTy, i32Ty,    ptrTy,    ptrTy,    ptrTy};
  return module.getOrInsertFunction(
      "mgpuLaunchKernel",
      FunctionType::get(voidTy, params, /*isVarArg=*/false));
}
// Declares (or fetches) `mgpuModuleGetFunction(module, name) -> function`.
llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() {
  Type *params[] = {ptrTy, ptrTy};
  return module.getOrInsertFunction(
      "mgpuModuleGetFunction",
      FunctionType::get(ptrTy, params, /*isVarArg=*/false));
}
// Declares (or fetches) `mgpuModuleLoad(binary) -> module`.
llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadFn() {
  Type *params[] = {ptrTy};
  return module.getOrInsertFunction(
      "mgpuModuleLoad", FunctionType::get(ptrTy, params, /*isVarArg=*/false));
}
// Declares (or fetches) `mgpuModuleUnload(module) -> void`.
llvm::FunctionCallee llvm::LaunchKernel::getModuleUnloadFn() {
  Type *params[] = {ptrTy};
  return module.getOrInsertFunction(
      "mgpuModuleUnload",
      FunctionType::get(voidTy, params, /*isVarArg=*/false));
}
// Declares (or fetches) `mgpuStreamCreate() -> stream`.
llvm::FunctionCallee llvm::LaunchKernel::getStreamCreateFn() {
  FunctionType *fnTy = FunctionType::get(ptrTy, /*isVarArg=*/false);
  return module.getOrInsertFunction("mgpuStreamCreate", fnTy);
}
// Declares (or fetches) `mgpuStreamDestroy(stream) -> void`.
llvm::FunctionCallee llvm::LaunchKernel::getStreamDestroyFn() {
  Type *params[] = {ptrTy};
  return module.getOrInsertFunction(
      "mgpuStreamDestroy",
      FunctionType::get(voidTy, params, /*isVarArg=*/false));
}
// Declares (or fetches) `mgpuStreamSynchronize(stream) -> void`.
llvm::FunctionCallee llvm::LaunchKernel::getStreamSyncFn() {
  Type *params[] = {ptrTy};
  return module.getOrInsertFunction(
      "mgpuStreamSynchronize",
      FunctionType::get(voidTy, params, /*isVarArg=*/false));
}
// Generates an LLVM IR dialect global that contains the name of the given
// kernel function as a C string, and returns a pointer to its beginning.
// The global is reused on subsequent calls for the same module/kernel pair.
llvm::Value *llvm::LaunchKernel::getOrCreateFunctionName(StringRef moduleName,
                                                         StringRef kernelName) {
  // Global name follows the "<module>_<kernel>_kernel_name" scheme.
  std::string globalName =
      formatv("{0}_{1}_kernel_name", moduleName, kernelName).str();
  if (GlobalVariable *existing = module.getGlobalVariable(globalName))
    return existing;
  return builder.CreateGlobalString(kernelName, globalName);
}
// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct. The array can
// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
// The generated code is essentially as follows:
//
// %struct = alloca(sizeof(struct { Parameters... }))
// %array = alloca(NumParameters * sizeof(void *))
// for (i : [0, NumParameters))
// %fieldPtr = llvm.getelementptr %struct[0, i]
// llvm.store parameters[i], %fieldPtr
// %elementPtr = llvm.getelementptr %array[i]
// llvm.store %fieldPtr, %elementPtr
// return %array
// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct, suitable for
// the CUDA / ROCm (HIP) kernel launch calls:
//
//   %struct = alloca({ Parameters... })
//   %array  = alloca(NumParameters x void*)
//   for each parameter i:
//     store parameters[i] into field i of %struct
//     store &field i into slot i of %array
//   return %array
llvm::Value *
llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) {
  SmallVector<Value *> kernelArgs =
      moduleTranslation.lookupValues(op.getKernelOperands());

  // Collect the LLVM type of every kernel operand.
  SmallVector<Type *> fieldTypes;
  fieldTypes.reserve(kernelArgs.size());
  for (Value *arg : kernelArgs)
    fieldTypes.push_back(arg->getType());

  Type *paramsStructTy = StructType::create(module.getContext(), fieldTypes);
  Value *paramsStruct = builder.CreateAlloca(paramsStructTy, 0u);
  Value *ptrArray = builder.CreateAlloca(
      ptrTy, ConstantInt::get(intPtrTy, fieldTypes.size()));

  for (auto [i, arg] : enumerate(kernelArgs)) {
    Value *fieldPtr = builder.CreateStructGEP(paramsStructTy, paramsStruct, i);
    builder.CreateStore(arg, fieldPtr);
    Value *slot = builder.CreateConstGEP1_32(ptrTy, ptrArray, i);
    builder.CreateStore(fieldPtr, slot);
  }
  return ptrArray;
}
// Emits LLVM IR to launch a kernel function:
// %0 = call %binarygetter
// %1 = call %moduleLoad(%0)
// %2 = <see generateKernelNameConstant>
// %3 = call %moduleGetFunction(%1, %2)
// %4 = call %streamCreate()
// %5 = <see generateParamsArray>
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
// call %streamSynchronize(%4)
// call %streamDestroy(%4)
// call %moduleUnload(%1)
// Emits the full launch sequence for `op`:
//   %module   = mgpuModuleLoad(<embedded binary global>)
//   %function = mgpuModuleGetFunction(%module, <kernel name global>)
//   %stream   = existing async object, or mgpuStreamCreate()
//   mgpuLaunchKernel(%function, grid..., block..., dynMem, %stream, args, null)
//   [mgpuStreamSynchronize + mgpuStreamDestroy if the stream was created here]
//   mgpuModuleUnload(%module)
// Fails if the binary global produced by `embedBinary` cannot be found.
mlir::LogicalResult
llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op) {
  // Maps an MLIR value to its already-translated LLVM counterpart.
  auto llvmValue = [&](mlir::Value value) -> Value * {
    Value *v = moduleTranslation.lookupValue(value);
    assert(v && "Value has not been translated.");
    return v;
  };

  // Get grid dimensions.
  mlir::gpu::KernelDim3 grid = op.getGridSizeOperandValues();
  Value *gx = llvmValue(grid.x), *gy = llvmValue(grid.y),
        *gz = llvmValue(grid.z);

  // Get block dimensions.
  mlir::gpu::KernelDim3 block = op.getBlockSizeOperandValues();
  Value *bx = llvmValue(block.x), *by = llvmValue(block.y),
        *bz = llvmValue(block.z);

  // Get dynamic shared memory size; defaults to 0 when not specified.
  Value *dynamicMemorySize = nullptr;
  if (mlir::Value dynSz = op.getDynamicSharedMemorySize())
    dynamicMemorySize = llvmValue(dynSz);
  else
    dynamicMemorySize = ConstantInt::get(i32Ty, 0);

  // Create the argument array.
  Value *argArray = createKernelArgArray(op);

  // Load the kernel module, looked up via the identifier `embedBinary` used
  // when it created the global.
  StringRef moduleName = op.getKernelModuleName().getValue();
  std::string binaryIdentifier = getBinaryIdentifier(moduleName);
  Value *binary = module.getGlobalVariable(binaryIdentifier, true);
  if (!binary)
    return op.emitError() << "Couldn't find the binary: " << binaryIdentifier;
  Value *moduleObject = builder.CreateCall(getModuleLoadFn(), {binary});

  // Load the kernel function.
  Value *moduleFunction = builder.CreateCall(
      getModuleFunctionFn(),
      {moduleObject,
       getOrCreateFunctionName(moduleName, op.getKernelName().getValue())});

  // Get the stream to use for execution. If there's no async object then create
  // a stream to make a synchronous kernel launch.
  Value *stream = nullptr;
  bool handleStream = false;
  if (mlir::Value asyncObject = op.getAsyncObject()) {
    stream = llvmValue(asyncObject);
  } else {
    handleStream = true;
    stream = builder.CreateCall(getStreamCreateFn(), {});
  }

  // Create the launch call; the trailing null pointer is the unused `extra`
  // parameter of the runtime wrapper.
  Value *nullPtr = ConstantPointerNull::get(ptrTy);
  builder.CreateCall(
      getKernelLaunchFn(),
      ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by, bz,
                         dynamicMemorySize, stream, argArray, nullPtr}));

  // Sync & destroy the stream, for synchronous launches.
  if (handleStream) {
    builder.CreateCall(getStreamSyncFn(), {stream});
    builder.CreateCall(getStreamDestroyFn(), {stream});
  }

  // Unload the kernel module.
  builder.CreateCall(getModuleUnloadFn(), {moduleObject});
  return success();
}