[mlir][AMDGPU] Remove buffer ops that are statically out of bounds
When the bounds check attribute is true, the raw buffer load, store, and atomic operations have well-defined behavior (loads return 0 and stores are ignored) when the buffer access exceeds the bounds of the memory being accessed. Because LLVM currently implements these buffer operations as opaque intrinsics, the backend cannot exploit this known behavior to eliminate the memory operations. Therefore, use MLIR's canonicalization system to remove buffer accesses that are statically known to be out of bounds.

Reviewed By: nirvedhmeshram

Differential Revision: https://reviews.llvm.org/D138146
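As a quick illustration (condensed from the test file added below; %buf and %v are placeholder SSA names used only for this sketch), a bounds-checked load at a constant index that is provably past the end of a statically shaped buffer folds to a zero constant under -canonicalize, and the matching out-of-bounds store or atomic is erased outright:

  %c4 = arith.constant 4 : i32
  %v = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%c4] : memref<4xf32>, i32 -> f32
  // after -canonicalize, the load is replaced by:
  %v = arith.constant 0.000000e+00 : f32

The tests at the end of this patch exercise the same rewrite for multi-dimensional indices, indexOffset, and sgprOffset operands, and check that accesses that are dynamic, in bounds, or not bounds-checked are left alone.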
@@ -23,6 +23,10 @@ def AMDGPU_Dialect : Dialect {
    that will eventually be executed on AMD hardware.
  }];

  let dependentDialects = [
    "arith::ArithDialect"
  ];
  let useDefaultAttributePrinterParser = 1;
}
@@ -83,6 +87,7 @@ def AMDGPU_RawBufferLoadOp :
      (`sgprOffset` $sgprOffset^)? `:`
      type($memref) `,` type($indices) `->` type($value)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}
@@ -124,6 +129,7 @@ def AMDGPU_RawBufferStoreOp :
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) `,` type($indices)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}
@@ -162,6 +168,7 @@ def AMDGPU_RawBufferAtomicFaddOp :
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) `,` type($indices)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}
@@ -12,14 +12,19 @@

#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "llvm/ADT/TypeSwitch.h"

#include <limits>

using namespace mlir;
using namespace mlir::amdgpu;
@@ -62,6 +67,96 @@ LogicalResult RawBufferAtomicFaddOp::verify() {
  return verifyRawBufferOp(*this);
}

static Optional<uint32_t> getConstantUint32(Value v) {
  APInt cst;
  if (!v.getType().isInteger(32))
    return None;
  if (matchPattern(v, m_ConstantInt(&cst)))
    return cst.getZExtValue();
  return None;
}

template <typename OpType>
static bool staticallyOutOfBounds(OpType op) {
  if (!op.getBoundsCheck())
    return false;
  MemRefType bufferType = op.getMemref().getType();
  if (!bufferType.hasStaticShape())
    return false;
  int64_t offset;
  SmallVector<int64_t> strides;
  if (failed(getStridesAndOffset(bufferType, strides, offset)))
    return false;
  int64_t result = offset + op.getIndexOffset().value_or(0);
  if (op.getSgprOffset()) {
    Optional<uint32_t> sgprOffset = getConstantUint32(op.getSgprOffset());
    if (!sgprOffset)
      return false;
    result += *sgprOffset;
  }
  if (strides.size() != op.getIndices().size())
    return false;
  int64_t indexVal = 0;
  for (auto pair : llvm::zip(strides, op.getIndices())) {
    int64_t stride = std::get<0>(pair);
    Value idx = std::get<1>(pair);
    Optional<uint32_t> idxVal = getConstantUint32(idx);
    if (!idxVal)
      return false;
    indexVal += stride * idxVal.value();
  }
  result += indexVal;
  if (result > std::numeric_limits<uint32_t>::max())
    // Overflow means don't drop
    return false;
  return result >= bufferType.getNumElements();
}

namespace {
struct RemoveStaticallyOobBufferLoads final
    : public OpRewritePattern<RawBufferLoadOp> {
  using OpRewritePattern<RawBufferLoadOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(RawBufferLoadOp op,
                                PatternRewriter &rw) const override {
    if (!staticallyOutOfBounds(op))
      return failure();
    Type loadType = op.getResult().getType();
    rw.replaceOpWithNewOp<arith::ConstantOp>(op, loadType,
                                             rw.getZeroAttr(loadType));
    return success();
  }
};

template <typename OpType>
struct RemoveStaticallyOobBufferWrites final : public OpRewritePattern<OpType> {
  using OpRewritePattern<OpType>::OpRewritePattern;

  LogicalResult matchAndRewrite(OpType op, PatternRewriter &rw) const override {
    if (!staticallyOutOfBounds(op))
      return failure();

    rw.eraseOp(op);
    return success();
  }
};
} // end namespace

void RawBufferLoadOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                  MLIRContext *context) {
  results.add<RemoveStaticallyOobBufferLoads>(context);
}

void RawBufferStoreOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                   MLIRContext *context) {
  results.add<RemoveStaticallyOobBufferWrites<RawBufferStoreOp>>(context);
}

void RawBufferAtomicFaddOp::getCanonicalizationPatterns(
    RewritePatternSet &results, MLIRContext *context) {
  results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicFaddOp>>(context);
}

//===----------------------------------------------------------------------===//
// MFMAOp
//===----------------------------------------------------------------------===//
@@ -10,6 +10,7 @@ add_mlir_dialect_library(MLIRAMDGPUDialect
  MLIRAMDGPUIncGen

  LINK_LIBS PUBLIC
  MLIRArithDialect
  MLIRIR
  MLIRSideEffectInterfaces
)
132  mlir/test/Dialect/AMDGPU/canonicalize.mlir  Normal file
@@ -0,0 +1,132 @@
// RUN: mlir-opt %s -split-input-file -canonicalize | FileCheck %s

// CHECK-LABEL: func @known_oob_load
func.func @known_oob_load(%arg0: memref<4xf32>) -> f32 {
  // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
  // CHECK: return %[[zero]]
  %c4_i32 = arith.constant 4 : i32
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
  func.return %0 : f32
}

// -----

// CHECK-LABEL: func @known_oob_load_2d
func.func @known_oob_load_2d(%arg0: memref<4x4xf32>) -> f32 {
  // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
  // CHECK: return %[[zero]]
  %c0_i32 = arith.constant 0 : i32
  %c4_i32 = arith.constant 4 : i32
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32, %c0_i32] : memref<4x4xf32>, i32, i32 -> f32
  func.return %0 : f32
}

// -----

// CHECK-LABEL: func @known_oob_load_2d_on_last
func.func @known_oob_load_2d_on_last(%arg0: memref<4x4xf32>) -> f32 {
  // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
  // CHECK: return %[[zero]]
  %c0_i32 = arith.constant 0 : i32
  %c16_i32 = arith.constant 16 : i32
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c16_i32] : memref<4x4xf32>, i32, i32 -> f32
  func.return %0 : f32
}

// -----

// CHECK-LABEL: func @known_oob_load_index
func.func @known_oob_load_index(%arg0: memref<4xf32>) -> f32 {
  // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
  // CHECK: return %[[zero]]
  %c0_i32 = arith.constant 0 : i32
  %0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 4 : i32} %arg0[%c0_i32] : memref<4xf32>, i32 -> f32
  func.return %0 : f32
}

// -----

// CHECK-LABEL: func @known_oob_load_sgproffset
func.func @known_oob_load_sgproffset(%arg0: memref<4xf32>) -> f32 {
  // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
  // CHECK: return %[[zero]]
  %c2_i32 = arith.constant 2 : i32
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c2_i32] sgprOffset %c2_i32 : memref<4xf32>, i32 -> f32
  func.return %0 : f32
}

// -----

// CHECK-LABEL: func @unknown_load
func.func @unknown_load(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
  // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
  // CHECK: return %[[loaded]]
  %c4_i32 = arith.constant 4 : i32
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%arg1] sgprOffset %c4_i32 : memref<4xf32>, i32 -> f32
  func.return %0 : f32
}

// -----

// CHECK-LABEL: func @unknown_load_sgproffset
func.func @unknown_load_sgproffset(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
  // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
  // CHECK: return %[[loaded]]
  %c4_i32 = arith.constant 4 : i32
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] sgprOffset %arg1 : memref<4xf32>, i32 -> f32
  func.return %0 : f32
}

// -----

// CHECK-LABEL: func @unranked
func.func @unranked(%arg0: memref<?xf32>) -> f32 {
  // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
  // CHECK: return %[[loaded]]
  %c4_i32 = arith.constant 4 : i32
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<?xf32>, i32 -> f32
  func.return %0 : f32
}

// -----

// CHECK-LABEL: func @no_oob_check
func.func @no_oob_check(%arg0: memref<4xf32>) -> f32 {
  // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
  // CHECK: return %[[loaded]]
  %c4_i32 = arith.constant 4 : i32
  %0 = amdgpu.raw_buffer_load {boundsCheck = false} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
  func.return %0 : f32
}

// -----

// CHECK-LABEL: func @in_bounds_overall
func.func @in_bounds_overall(%arg0: memref<4x4xf32>) -> f32 {
  // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
  // CHECK: return %[[loaded]]
  %c0_i32 = arith.constant 0 : i32
  %c15_i32 = arith.constant 15 : i32
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c15_i32] : memref<4x4xf32>, i32, i32 -> f32
  func.return %0 : f32
}

// -----

// CHECK-LABEL: func @dead_store
func.func @dead_store(%arg0: memref<4xf32>, %arg1: f32) {
  // CHECK-NOT: amdgpu.raw_buffer_store
  %c4_i32 = arith.constant 4 : i32
  amdgpu.raw_buffer_store {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
  func.return
}

// -----

// CHECK-LABEL: func @dead_atomic_add
func.func @dead_atomic_add(%arg0: memref<4xf32>, %arg1: f32) {
  // CHECK-NOT: amdgpu.raw_buffer_atomic_fadd
  %c4_i32 = arith.constant 4 : i32
  amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
  func.return
}