Files
clang-p2996/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
Alexander Richardson 07e2ba445d [AMDGPU] Set AS8 address width to 48 bits
Of the 128-bits of buffer descriptor only 48 bits are address bits, so
following the discussion on https://discourse.llvm.org/t/clarifiying-the-semantics-of-ptrtoint/83987/54,
the logic conclusion is to set the index width to 48 bits instead of
the current value of 128.

Most of the test changes are mechanical datalayout updates, but there
is one actual change: the ptrmask test now uses .i48 instead of .i128
and I had to update SelectionDAGBuilder to correctly extend the mask.

Reviewed By: krzysz00

Pull Request: https://github.com/llvm/llvm-project/pull/139419
2025-05-19 17:26:05 -07:00

473 lines
20 KiB
C++

//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
#include "mlir/Conversion/ConvertToLLVM/ToLLVMPass.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
#include "mlir/Conversion/MathToROCDL/MathToROCDL.h"
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"
#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir
using namespace mlir;
// Truncate or extend the result depending on the index bitwidth specified
// by the LLVMTypeConverter options.
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
Location loc, Value value,
const LLVMTypeConverter &converter) {
int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
int64_t indexBitwidth = converter.getIndexTypeBitwidth();
auto indexBitwidthType =
IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
// TODO: use <=> in C++20.
if (indexBitwidth > intWidth) {
return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
}
if (indexBitwidth < intWidth) {
return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
}
return value;
}
/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
bool canBeBare = true;
for (Type type : func.getArgumentTypes())
if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
return canBeBare;
}
static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
const unsigned indexBitwidth) {
auto int32Type = IntegerType::get(rewriter.getContext(), 32);
Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
ValueRange{minus1, zero});
Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
ValueRange{minus1, mbcntLo});
return laneId;
}
static constexpr StringLiteral amdgcnDataLayout =
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
"32-v32:"
"32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
"64-S32-A5-G1-ni:7:8:9";
namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;
LogicalResult
matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
auto loc = op->getLoc();
MLIRContext *context = rewriter.getContext();
// convert to: %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
// followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)
Type intTy = IntegerType::get(context, 32);
Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
Value mbcntLo =
rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
loc, intTy, ValueRange{minus1, mbcntLo});
// Truncate or extend the result depending on the index bitwidth specified
// by the LLVMTypeConverter options.
const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
if (indexBitwidth > 32) {
laneId = rewriter.create<LLVM::SExtOp>(
loc, IntegerType::get(context, indexBitwidth), laneId);
} else if (indexBitwidth < 32) {
laneId = rewriter.create<LLVM::TruncOp>(
loc, IntegerType::get(context, indexBitwidth), laneId);
}
rewriter.replaceOp(op, {laneId});
return success();
}
};
struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
amdgpu::Chipset chipset)
: ConvertOpToLLVMPattern<gpu::SubgroupSizeOp>(converter),
chipset(chipset) {}
LogicalResult
matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
LLVM::ConstantRangeAttr bounds = nullptr;
bool isBeforeGfx10 = chipset.majorVersion < 10;
if (auto upperBoundAttr = op.getUpperBoundAttr()) {
bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
/*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
/*upper=*/op.getUpperBoundAttr().getInt() + 1);
}
Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
op.getLoc(), rewriter.getI32Type(), bounds);
wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
*getTypeConverter());
rewriter.replaceOp(op, {wavefrontOp});
return success();
}
const amdgpu::Chipset chipset;
};
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
/// Lowers a shuffle to the corresponding ROCDL ops.
///
/// Use the `width` argument to see if src lane is participating.
/// If not the dstLane would be itself.
///
/// Shuffle with DS Bpermute:
/// let shflMode = [xor, up, down, idx]
/// let width = 32(usually warpsize), step = [1, 2, 4, 8, 16, ... , width].
/// 1. curLaneId = using mbcnt.lo + mbcnt.hi
/// 2. widthOrZeroIfOutside = (curLaneId + width) & -width
/// 3. dstLane = shflMode(curLaneId, step)
/// 4. isActiveSrcLane = dstLane < isActiveSrcLane
/// 5. dstLane = isActiveSrcLane ? dstLane : curLaneId
/// 6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
/// 7. bpermute(dwordAlignedDstLane, shfl_value).
///
LogicalResult
matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
Location loc = op->getLoc();
Value initShflValue = adaptor.getValue();
const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);
auto int32Type = IntegerType::get(rewriter.getContext(), 32);
Value width = adaptor.getWidth();
Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
Value widthOrZeroIfOutside =
rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
Value dstLane;
switch (op.getMode()) {
case gpu::ShuffleMode::UP:
dstLane = rewriter.create<LLVM::SubOp>(loc, int32Type, srcLaneId,
adaptor.getOffset());
break;
case gpu::ShuffleMode::DOWN:
dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
adaptor.getOffset());
break;
case gpu::ShuffleMode::XOR:
dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
adaptor.getOffset());
break;
case gpu::ShuffleMode::IDX:
dstLane = adaptor.getOffset();
break;
}
Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
dstLane, srcLaneId);
Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
Value dwordAlignedDstLane =
rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
SmallVector<Value> decomposed =
LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);
SmallVector<Value> swizzled;
for (Value v : decomposed) {
Value res = rewriter.create<ROCDL::DsBpermuteOp>(loc, int32Type,
dwordAlignedDstLane, v);
swizzled.emplace_back(res);
}
Value shflValue =
LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
return success();
}
};
/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"
// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass final
: public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
LowerGpuOpsToROCDLOpsPass() = default;
LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
bool useBarePtrCallConv,
gpu::amd::Runtime runtime) {
if (this->chipset.getNumOccurrences() == 0)
this->chipset = chipset;
if (this->indexBitwidth.getNumOccurrences() == 0)
this->indexBitwidth = indexBitwidth;
if (this->useBarePtrCallConv.getNumOccurrences() == 0)
this->useBarePtrCallConv = useBarePtrCallConv;
if (this->runtime.getNumOccurrences() == 0)
this->runtime = runtime;
}
void getDependentDialects(DialectRegistry &registry) const override {
Base::getDependentDialects(registry);
registerConvertToLLVMDependentDialectLoading(registry);
}
void runOnOperation() override {
gpu::GPUModuleOp m = getOperation();
MLIRContext *ctx = m.getContext();
auto llvmDataLayout = m->getAttrOfType<StringAttr>(
LLVM::LLVMDialect::getDataLayoutAttrName());
if (!llvmDataLayout) {
llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
}
// Request C wrapper emission.
for (auto func : m.getOps<func::FuncOp>()) {
func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
UnitAttr::get(ctx));
}
FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
if (failed(maybeChipset)) {
emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
return signalPassFailure();
}
/// Customize the bitwidth used for the device side index computations.
LowerToLLVMOptions options(
ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
options.overrideIndexBitwidth(indexBitwidth);
if (useBarePtrCallConv) {
options.useBarePtrCallConv = true;
WalkResult canUseBarePointers =
m.walk([](gpu::GPUFuncOp func) -> WalkResult {
if (canBeCalledWithBarePointers(func))
return WalkResult::advance();
return WalkResult::interrupt();
});
if (canUseBarePointers.wasInterrupted()) {
emitError(UnknownLoc::get(ctx),
"bare pointer calling convention requires all memrefs to "
"have static shape and use the identity map");
return signalPassFailure();
}
}
// Apply in-dialect lowering. In-dialect lowering will replace
// ops which need to be lowered further, which is not supported by a
// single conversion pass.
{
RewritePatternSet patterns(ctx);
populateGpuRewritePatterns(patterns);
populateGpuPromoteShuffleToAMDGPUPatterns(patterns);
(void)applyPatternsGreedily(m, std::move(patterns));
}
LLVMTypeConverter converter(ctx, options);
populateGpuMemorySpaceAttributeConversions(
converter, [](gpu::AddressSpace space) {
switch (space) {
case gpu::AddressSpace::Global:
return 1;
case gpu::AddressSpace::Workgroup:
return 3;
case gpu::AddressSpace::Private:
return 5;
}
llvm_unreachable("unknown address space enum value");
return 0;
});
RewritePatternSet llvmPatterns(ctx);
LLVMConversionTarget target(getContext());
llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
allowedDialects.end());
for (Dialect *dialect : ctx->getLoadedDialects()) {
bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
// Empty `allowedDialectsSet` means all dialects are allowed.
if (!allowedDialectsSet.empty() && !allowed)
continue;
auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
if (!iface) {
// Error out if dialect was explicily specified but doesn't implement
// conversion interface.
if (allowed) {
m.emitError()
<< "dialect does not implement ConvertToLLVMPatternInterface: "
<< dialect->getNamespace();
return signalPassFailure();
}
continue;
}
iface->populateConvertToLLVMConversionPatterns(target, converter,
llvmPatterns);
}
populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
*maybeChipset);
populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
*maybeChipset);
configureGpuToROCDLConversionLegality(target);
if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
signalPassFailure();
auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
auto reqdWorkGroupSizeAttrHelper =
rocdlDialect->getReqdWorkGroupSizeAttrHelper();
auto flatWorkGroupSizeAttrHelper =
rocdlDialect->getFlatWorkGroupSizeAttrHelper();
// Manually rewrite known block size attributes so the LLVMIR translation
// infrastructure can pick them up.
m.walk([&](LLVM::LLVMFuncOp op) {
if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
// Also set up the rocdl.flat_work_group_size attribute to prevent
// conflicting metadata.
uint32_t flatSize = 1;
for (uint32_t size : blockSizes.asArrayRef()) {
flatSize *= size;
}
StringAttr flatSizeAttr =
StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
}
});
}
};
} // namespace
void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
target.addIllegalOp<func::FuncOp>();
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
target.addLegalDialect<ROCDL::ROCDLDialect>();
target.addIllegalDialect<gpu::GPUDialect>();
target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
// These ops are legal for f32 type.
target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
});
// TODO: Remove once we support replacing non-root ops.
target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
}
void mlir::populateGpuToROCDLConversionPatterns(
const LLVMTypeConverter &converter, RewritePatternSet &patterns,
mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) {
using gpu::index_lowering::IndexKind;
using gpu::index_lowering::IntrType;
using mlir::gpu::amd::Runtime;
auto *rocdlDialect =
converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
populateWithGenerated(patterns);
patterns.add<
gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
converter, IndexKind::Block, IntrType::Id);
patterns.add<gpu::index_lowering::OpLowering<
gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
converter, IndexKind::Grid, IntrType::Id);
patterns.add<
gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
converter, IndexKind::Block, IntrType::Dim);
patterns.add<gpu::index_lowering::OpLowering<
gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
converter, IndexKind::Grid, IntrType::Dim);
patterns.add<GPUReturnOpLowering>(converter);
patterns.add<GPUFuncOpLowering>(
converter,
GPUFuncOpLoweringOptions{
/*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
/*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
rocdlDialect->getKernelAttrHelper().getName(),
rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
if (Runtime::HIP == runtime) {
patterns.add<GPUPrintfOpToHIPLowering>(converter);
} else if (Runtime::OpenCL == runtime) {
// Use address space = 4 to match the OpenCL definition of printf()
patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
}
// TODO: Add alignment for workgroup memory
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
populateMathToROCDLConversionPatterns(converter, patterns);
}
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
unsigned indexBitwidth,
bool useBarePtrCallConv,
gpu::amd::Runtime runtime) {
return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
chipset, indexBitwidth, useBarePtrCallConv, runtime);
}