//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//
#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ArithmeticToLLVM/ArithmeticToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"
#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir
using namespace mlir;
/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
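/// Under that convention, each memref argument is passed as a single pointer
/// instead of the full memref descriptor, which is only sound for memrefs
/// with a static shape and an identity layout map.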
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
bool canBeBare = true;
for (Type type : func.getArgumentTypes())
if (auto memrefTy = type.dyn_cast<BaseMemRefType>())
canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
return canBeBare;
}
namespace {
/// Import the autogenerated GPU to ROCDL lowering patterns.
#include "GPUToROCDL.cpp.inc"
// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalents.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass
: public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
LowerGpuOpsToROCDLOpsPass() = default;
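// Constructor arguments act only as defaults: any option that was already
// set on the command line (getNumOccurrences() != 0) takes precedence.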
LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
bool useBarePtrCallConv,
gpu::amd::Runtime runtime) {
if (this->chipset.getNumOccurrences() == 0)
this->chipset = chipset;
if (this->indexBitwidth.getNumOccurrences() == 0)
this->indexBitwidth = indexBitwidth;
if (this->useBarePtrCallConv.getNumOccurrences() == 0)
this->useBarePtrCallConv = useBarePtrCallConv;
if (this->runtime.getNumOccurrences() == 0)
this->runtime = runtime;
}
void runOnOperation() override {
gpu::GPUModuleOp m = getOperation();
MLIRContext *ctx = m.getContext();
// Request C wrapper emission.
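// Each func.func is tagged with the "llvm.emit_c_interface" attribute, which
// makes FuncToLLVM emit an additional C-compatible _mlir_ciface_* wrapper.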
for (auto func : m.getOps<func::FuncOp>()) {
func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
UnitAttr::get(ctx));
}
FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
if (failed(maybeChipset)) {
emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
return signalPassFailure();
}
// Customize the bitwidth used for the device-side index computations.
LowerToLLVMOptions options(
ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
options.overrideIndexBitwidth(indexBitwidth);
if (useBarePtrCallConv) {
options.useBarePtrCallConv = true;
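// Check every kernel up front; the walk is interrupted (and the pass
// fails) as soon as one function cannot use bare pointers.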
WalkResult canUseBarePointers =
m.walk([](gpu::GPUFuncOp func) -> WalkResult {
if (canBeCalledWithBarePointers(func))
return WalkResult::advance();
return WalkResult::interrupt();
});
if (canUseBarePointers.wasInterrupted()) {
emitError(UnknownLoc::get(ctx),
"bare pointer calling convention requires all memrefs to "
"have static shape and use the identity map");
return signalPassFailure();
}
}
LLVMTypeConverter converter(ctx, options);
RewritePatternSet patterns(ctx);
RewritePatternSet llvmPatterns(ctx);
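// Lowering happens in two stages: in-dialect GPU rewrites (e.g. expanding
// gpu.all_reduce into simpler GPU ops) are applied greedily first, and the
// result is then lowered to LLVM/ROCDL in one partial dialect conversion.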
populateGpuRewritePatterns(patterns);
(void)applyPatternsAndFoldGreedily(m, std::move(patterns));
mlir::arith::populateArithmeticToLLVMConversionPatterns(converter,
llvmPatterns);
populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
*maybeChipset);
populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
populateMemRefToLLVMConversionPatterns(converter, llvmPatterns);
populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
LLVMConversionTarget target(getContext());
configureGpuToROCDLConversionLegality(target);
if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
signalPassFailure();
}
};
} // namespace
void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
target.addIllegalOp<func::FuncOp>();
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
target.addLegalDialect<ROCDL::ROCDLDialect>();
target.addIllegalDialect<gpu::GPUDialect>();
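// These LLVM math intrinsics have no lowering for the AMDGPU target, so they
// are marked illegal; the math dialect ops are instead lowered to __ocml_*
// device library calls in populateGpuToROCDLConversionPatterns.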
target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
LLVM::FCeilOp, LLVM::FFloorOp, LLVM::LogOp, LLVM::Log10Op,
LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp, LLVM::SqrtOp>();
// TODO: Remove once we support replacing non-root ops.
target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
}
void mlir::populateGpuToROCDLConversionPatterns(
LLVMTypeConverter &converter, RewritePatternSet &patterns,
mlir::gpu::amd::Runtime runtime) {
using mlir::gpu::amd::Runtime;
populateWithGenerated(patterns);
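// The gpu index ops are 3-D; each instantiation below selects the ROCDL
// intrinsic matching the requested dimension (x, y, or z).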
patterns
.add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
GPUReturnOpLowering>(converter);
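// On AMDGPU, private (scratch) allocations live in address space 5, and the
// rocdl.kernel attribute marks a function as a GPU kernel entry point.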
patterns.add<GPUFuncOpLowering>(
converter, /*allocaAddrSpace=*/5,
StringAttr::get(&converter.getContext(),
ROCDL::ROCDLDialect::getKernelFuncAttrName()));
if (Runtime::HIP == runtime) {
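// HIP provides hostcall-based printf support in the device library.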
patterns.add<GPUPrintfOpToHIPLowering>(converter);
} else if (Runtime::OpenCL == runtime) {
// Use address space = 4 to match the OpenCL definition of printf()
patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
}
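// Lower math dialect ops to calls into the OCML device library; each pattern
// picks the f32 or f64 symbol based on the operand's element type.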
patterns.add<OpToFuncCallLowering<math::AbsFOp>>(converter, "__ocml_fabs_f32",
"__ocml_fabs_f64");
patterns.add<OpToFuncCallLowering<math::AtanOp>>(converter, "__ocml_atan_f32",
"__ocml_atan_f64");
patterns.add<OpToFuncCallLowering<math::Atan2Op>>(
converter, "__ocml_atan2_f32", "__ocml_atan2_f64");
patterns.add<OpToFuncCallLowering<math::CeilOp>>(converter, "__ocml_ceil_f32",
"__ocml_ceil_f64");
patterns.add<OpToFuncCallLowering<math::CosOp>>(converter, "__ocml_cos_f32",
"__ocml_cos_f64");
patterns.add<OpToFuncCallLowering<math::ExpOp>>(converter, "__ocml_exp_f32",
"__ocml_exp_f64");
patterns.add<OpToFuncCallLowering<math::Exp2Op>>(converter, "__ocml_exp2_f32",
"__ocml_exp2_f64");
patterns.add<OpToFuncCallLowering<math::ExpM1Op>>(
converter, "__ocml_expm1_f32", "__ocml_expm1_f64");
patterns.add<OpToFuncCallLowering<math::FloorOp>>(
converter, "__ocml_floor_f32", "__ocml_floor_f64");
patterns.add<OpToFuncCallLowering<math::LogOp>>(converter, "__ocml_log_f32",
"__ocml_log_f64");
patterns.add<OpToFuncCallLowering<math::Log10Op>>(
converter, "__ocml_log10_f32", "__ocml_log10_f64");
patterns.add<OpToFuncCallLowering<math::Log1pOp>>(
converter, "__ocml_log1p_f32", "__ocml_log1p_f64");
patterns.add<OpToFuncCallLowering<math::Log2Op>>(converter, "__ocml_log2_f32",
"__ocml_log2_f64");
patterns.add<OpToFuncCallLowering<math::PowFOp>>(converter, "__ocml_pow_f32",
"__ocml_pow_f64");
patterns.add<OpToFuncCallLowering<math::RsqrtOp>>(
converter, "__ocml_rsqrt_f32", "__ocml_rsqrt_f64");
patterns.add<OpToFuncCallLowering<math::SinOp>>(converter, "__ocml_sin_f32",
"__ocml_sin_f64");
patterns.add<OpToFuncCallLowering<math::SqrtOp>>(converter, "__ocml_sqrt_f32",
"__ocml_sqrt_f64");
patterns.add<OpToFuncCallLowering<math::TanhOp>>(converter, "__ocml_tanh_f32",
"__ocml_tanh_f64");
}
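/// Factory for creating the pass with explicit options; options that were
/// not set on the command line fall back to the values supplied here.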
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
unsigned indexBitwidth,
bool useBarePtrCallConv,
gpu::amd::Runtime runtime) {
return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
chipset, indexBitwidth, useBarePtrCallConv, runtime);
}