Files
clang-p2996/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
Krzysztof Drewniak d6ef3d20b4 [mlir] Remove VectorToROCDL
Between issues such as
https://github.com/llvm/llvm-project/issues/56323, the fact that this
lowering (unlike the code in amdgpu-to-rocdl) does not correctly set
up bounds checks (and thus will cause page faults on reads that might
need to be padded instead), and that fixing these problems would,
essentially, involve replicating amdgpu-to-rocdl, remove
--vector-to-rocdl for being broken. In addition, the lowering does not
support many aspects of transfer_{read,write}, like supervectors, and
may not work correctly in their presence.

We (the MLIR-based convolution generator at AMD) do not use this
conversion pass, nor are we aware of any other clients.

Migration strategies:
- Use VectorToLLVM
- If buffer ops are particularly needed in your application, use
amdgpu.raw_buffer_{load,store}

A VectorToAMDGPU pass may be introduced in the future.

Reviewed By: ThomasRaoux

Differential Revision: https://reviews.llvm.org/D129308
2022-07-12 15:21:22 +00:00

196 lines
9.1 KiB
C++

//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//
#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ArithmeticToLLVM/ArithmeticToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"
#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include "../PassDetail.h"
using namespace mlir;
namespace {
/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"
// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass
: public ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
LowerGpuOpsToROCDLOpsPass() = default;
LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
gpu::amd::Runtime runtime) {
this->chipset = chipset;
this->indexBitwidth = indexBitwidth;
this->runtime = runtime;
}
void runOnOperation() override {
gpu::GPUModuleOp m = getOperation();
MLIRContext *ctx = m.getContext();
// Request C wrapper emission.
for (auto func : m.getOps<func::FuncOp>()) {
func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
UnitAttr::get(ctx));
}
FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
if (failed(maybeChipset)) {
emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
return signalPassFailure();
}
/// Customize the bitwidth used for the device side index computations.
LowerToLLVMOptions options(
ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
options.overrideIndexBitwidth(indexBitwidth);
LLVMTypeConverter converter(ctx, options);
RewritePatternSet patterns(ctx);
RewritePatternSet llvmPatterns(ctx);
populateGpuRewritePatterns(patterns);
(void)applyPatternsAndFoldGreedily(m, std::move(patterns));
mlir::arith::populateArithmeticToLLVMConversionPatterns(converter,
llvmPatterns);
populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
*maybeChipset);
populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
populateMemRefToLLVMConversionPatterns(converter, llvmPatterns);
populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
LLVMConversionTarget target(getContext());
configureGpuToROCDLConversionLegality(target);
if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
signalPassFailure();
}
};
} // namespace
void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
target.addIllegalOp<func::FuncOp>();
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
target.addLegalDialect<ROCDL::ROCDLDialect>();
target.addIllegalDialect<gpu::GPUDialect>();
target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
LLVM::FCeilOp, LLVM::FFloorOp, LLVM::LogOp, LLVM::Log10Op,
LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp, LLVM::SqrtOp>();
// TODO: Remove once we support replacing non-root ops.
target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
}
void mlir::populateGpuToROCDLConversionPatterns(
LLVMTypeConverter &converter, RewritePatternSet &patterns,
mlir::gpu::amd::Runtime runtime) {
using mlir::gpu::amd::Runtime;
populateWithGenerated(patterns);
patterns
.add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
GPUReturnOpLowering>(converter);
patterns.add<GPUFuncOpLowering>(
converter, /*allocaAddrSpace=*/5,
StringAttr::get(&converter.getContext(),
ROCDL::ROCDLDialect::getKernelFuncAttrName()));
if (Runtime::HIP == runtime) {
patterns.add<GPUPrintfOpToHIPLowering>(converter);
} else if (Runtime::OpenCL == runtime) {
// Use address space = 4 to match the OpenCL definition of printf()
patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
}
patterns.add<OpToFuncCallLowering<math::AbsOp>>(converter, "__ocml_fabs_f32",
"__ocml_fabs_f64");
patterns.add<OpToFuncCallLowering<math::AtanOp>>(converter, "__ocml_atan_f32",
"__ocml_atan_f64");
patterns.add<OpToFuncCallLowering<math::Atan2Op>>(
converter, "__ocml_atan2_f32", "__ocml_atan2_f64");
patterns.add<OpToFuncCallLowering<math::CeilOp>>(converter, "__ocml_ceil_f32",
"__ocml_ceil_f64");
patterns.add<OpToFuncCallLowering<math::CosOp>>(converter, "__ocml_cos_f32",
"__ocml_cos_f64");
patterns.add<OpToFuncCallLowering<math::ExpOp>>(converter, "__ocml_exp_f32",
"__ocml_exp_f64");
patterns.add<OpToFuncCallLowering<math::Exp2Op>>(converter, "__ocml_exp2_f32",
"__ocml_exp2_f64");
patterns.add<OpToFuncCallLowering<math::ExpM1Op>>(
converter, "__ocml_expm1_f32", "__ocml_expm1_f64");
patterns.add<OpToFuncCallLowering<math::FloorOp>>(
converter, "__ocml_floor_f32", "__ocml_floor_f64");
patterns.add<OpToFuncCallLowering<math::LogOp>>(converter, "__ocml_log_f32",
"__ocml_log_f64");
patterns.add<OpToFuncCallLowering<math::Log10Op>>(
converter, "__ocml_log10_f32", "__ocml_log10_f64");
patterns.add<OpToFuncCallLowering<math::Log1pOp>>(
converter, "__ocml_log1p_f32", "__ocml_log1p_f64");
patterns.add<OpToFuncCallLowering<math::Log2Op>>(converter, "__ocml_log2_f32",
"__ocml_log2_f64");
patterns.add<OpToFuncCallLowering<math::PowFOp>>(converter, "__ocml_pow_f32",
"__ocml_pow_f64");
patterns.add<OpToFuncCallLowering<math::RsqrtOp>>(
converter, "__ocml_rsqrt_f32", "__ocml_rsqrt_f64");
patterns.add<OpToFuncCallLowering<math::SinOp>>(converter, "__ocml_sin_f32",
"__ocml_sin_f64");
patterns.add<OpToFuncCallLowering<math::SqrtOp>>(converter, "__ocml_sqrt_f32",
"__ocml_sqrt_f64");
patterns.add<OpToFuncCallLowering<math::TanhOp>>(converter, "__ocml_tanh_f32",
"__ocml_tanh_f64");
}
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
unsigned indexBitwidth,
gpu::amd::Runtime runtime) {
return std::make_unique<LowerGpuOpsToROCDLOpsPass>(chipset, indexBitwidth,
runtime);
}