[mlir][AMDGPU] Remove buffer ops that are statically out of bounds

When the bounds check attribute is true, the raw buffer load, store,
and atomic operations have well-defined behavior when an access falls
outside the bounds of the memory being accessed: loads return 0, and
stores and atomic updates are ignored.

Because LLVM currently implements these buffer operations as opaque
intrinsics, the backend cannot exploit this known behavior to eliminate
the memory operations itself. Therefore, use MLIR's canonicalization
system to remove buffer operations whose accesses are statically known
to be out of bounds.

Reviewed By: nirvedhmeshram

Differential Revision: https://reviews.llvm.org/D138146
Author: Krzysztof Drewniak
Date:   2022-11-09 22:26:33 +00:00
Parent: 3f9d64a2ad
Commit: d6abdf46bc

4 changed files with 235 additions and 0 deletions

@@ -23,6 +23,10 @@ def AMDGPU_Dialect : Dialect {
that will eventually be executed on AMD hardware.
}];
let dependentDialects = [
"arith::ArithDialect"
];
let useDefaultAttributePrinterParser = 1;
}
@@ -83,6 +87,7 @@ def AMDGPU_RawBufferLoadOp :
(`sgprOffset` $sgprOffset^)? `:`
type($memref) `,` type($indices) `->` type($value)
}];
let hasCanonicalizer = 1;
let hasVerifier = 1;
}
@@ -124,6 +129,7 @@ def AMDGPU_RawBufferStoreOp :
(`sgprOffset` $sgprOffset^)? `:`
type($value) `->` type($memref) `,` type($indices)
}];
let hasCanonicalizer = 1;
let hasVerifier = 1;
}
@@ -162,6 +168,7 @@ def AMDGPU_RawBufferAtomicFaddOp :
(`sgprOffset` $sgprOffset^)? `:`
type($value) `->` type($memref) `,` type($indices)
}];
let hasCanonicalizer = 1;
let hasVerifier = 1;
}

@@ -12,14 +12,19 @@
#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "llvm/ADT/TypeSwitch.h"
#include <limits>
using namespace mlir;
using namespace mlir::amdgpu;
@@ -62,6 +67,96 @@ LogicalResult RawBufferAtomicFaddOp::verify() {
  return verifyRawBufferOp(*this);
}

static Optional<uint32_t> getConstantUint32(Value v) {
  APInt cst;
  if (!v.getType().isInteger(32))
    return None;
  if (matchPattern(v, m_ConstantInt(&cst)))
    return cst.getZExtValue();
  return None;
}

template <typename OpType>
static bool staticallyOutOfBounds(OpType op) {
  if (!op.getBoundsCheck())
    return false;
  MemRefType bufferType = op.getMemref().getType();
  if (!bufferType.hasStaticShape())
    return false;
  int64_t offset;
  SmallVector<int64_t> strides;
  if (failed(getStridesAndOffset(bufferType, strides, offset)))
    return false;
  int64_t result = offset + op.getIndexOffset().value_or(0);
  if (op.getSgprOffset()) {
    Optional<uint32_t> sgprOffset = getConstantUint32(op.getSgprOffset());
    if (!sgprOffset)
      return false;
    result += *sgprOffset;
  }
  if (strides.size() != op.getIndices().size())
    return false;
  int64_t indexVal = 0;
  for (auto pair : llvm::zip(strides, op.getIndices())) {
    int64_t stride = std::get<0>(pair);
    Value idx = std::get<1>(pair);
    Optional<uint32_t> idxVal = getConstantUint32(idx);
    if (!idxVal)
      return false;
    indexVal += stride * idxVal.value();
  }
  result += indexVal;
  if (result > std::numeric_limits<uint32_t>::max())
    // Overflow means don't drop
    return false;
  return result >= bufferType.getNumElements();
}

namespace {
struct RemoveStaticallyOobBufferLoads final
    : public OpRewritePattern<RawBufferLoadOp> {
  using OpRewritePattern<RawBufferLoadOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(RawBufferLoadOp op,
                                PatternRewriter &rw) const override {
    if (!staticallyOutOfBounds(op))
      return failure();
    Type loadType = op.getResult().getType();
    rw.replaceOpWithNewOp<arith::ConstantOp>(op, loadType,
                                             rw.getZeroAttr(loadType));
    return success();
  }
};

template <typename OpType>
struct RemoveStaticallyOobBufferWrites final : public OpRewritePattern<OpType> {
  using OpRewritePattern<OpType>::OpRewritePattern;

  LogicalResult matchAndRewrite(OpType op, PatternRewriter &rw) const override {
    if (!staticallyOutOfBounds(op))
      return failure();
    rw.eraseOp(op);
    return success();
  }
};
} // end namespace

void RawBufferLoadOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                  MLIRContext *context) {
  results.add<RemoveStaticallyOobBufferLoads>(context);
}

void RawBufferStoreOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                   MLIRContext *context) {
  results.add<RemoveStaticallyOobBufferWrites<RawBufferStoreOp>>(context);
}

void RawBufferAtomicFaddOp::getCanonicalizationPatterns(
    RewritePatternSet &results, MLIRContext *context) {
  results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicFaddOp>>(context);
}
//===----------------------------------------------------------------------===//
// MFMAOp
//===----------------------------------------------------------------------===//
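
For intuition, the bound computation above mirrors what the known_oob_load_2d
test below exercises: a memref<4x4xf32> has strides [4, 1] and 16 elements, so
indices [4, 0] give an element offset of 4*4 + 1*0 = 16, which is >= 16 and
therefore statically out of bounds (a sketch; value names are illustrative):

  %c0 = arith.constant 0 : i32
  %c4 = arith.constant 4 : i32
  // Folded to arith.constant 0.0 : f32 by the pattern above.
  %v = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%c4, %c0] : memref<4x4xf32>, i32, i32 -> f32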

@@ -10,6 +10,7 @@ add_mlir_dialect_library(MLIRAMDGPUDialect
MLIRAMDGPUIncGen
LINK_LIBS PUBLIC
MLIRArithDialect
MLIRIR
MLIRSideEffectInterfaces
)

@@ -0,0 +1,132 @@
// RUN: mlir-opt %s -split-input-file -canonicalize | FileCheck %s
// CHECK-LABEL: func @known_oob_load
func.func @known_oob_load(%arg0: memref<4xf32>) -> f32 {
// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: return %[[zero]]
%c4_i32 = arith.constant 4 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @known_oob_load_2d
func.func @known_oob_load_2d(%arg0: memref<4x4xf32>) -> f32 {
// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: return %[[zero]]
%c0_i32 = arith.constant 0 : i32
%c4_i32 = arith.constant 4 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32, %c0_i32] : memref<4x4xf32>, i32, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @known_oob_load_2d_on_last
func.func @known_oob_load_2d_on_last(%arg0: memref<4x4xf32>) -> f32 {
// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: return %[[zero]]
%c0_i32 = arith.constant 0 : i32
%c16_i32 = arith.constant 16 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c16_i32] : memref<4x4xf32>, i32, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @known_oob_load_index
func.func @known_oob_load_index(%arg0: memref<4xf32>) -> f32 {
// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: return %[[zero]]
%c0_i32 = arith.constant 0 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 4 : i32} %arg0[%c0_i32] : memref<4xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @known_oob_load_sgproffset
func.func @known_oob_load_sgproffset(%arg0: memref<4xf32>) -> f32 {
// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: return %[[zero]]
%c2_i32 = arith.constant 2 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c2_i32] sgprOffset %c2_i32 : memref<4xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @unknown_load
func.func @unknown_load(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
// CHECK: return %[[loaded]]
%c4_i32 = arith.constant 4 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%arg1] sgprOffset %c4_i32 : memref<4xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @unknown_load_sgproffset
func.func @unknown_load_sgproffset(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
// CHECK: return %[[loaded]]
%c4_i32 = arith.constant 4 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] sgprOffset %arg1 : memref<4xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @unranked
func.func @unranked(%arg0: memref<?xf32>) -> f32 {
// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
// CHECK: return %[[loaded]]
%c4_i32 = arith.constant 4 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<?xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @no_oob_check
func.func @no_oob_check(%arg0: memref<4xf32>) -> f32 {
// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
// CHECK: return %[[loaded]]
%c4_i32 = arith.constant 4 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = false} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @in_bounds_overall
func.func @in_bounds_overall(%arg0: memref<4x4xf32>) -> f32 {
// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
// CHECK: return %[[loaded]]
%c0_i32 = arith.constant 0 : i32
%c15_i32 = arith.constant 15 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c15_i32] : memref<4x4xf32>, i32, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @dead_store
func.func @dead_store(%arg0: memref<4xf32>, %arg1: f32) {
// CHECK-NOT: amdgpu.raw_buffer_store
%c4_i32 = arith.constant 4 : i32
amdgpu.raw_buffer_store {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
func.return
}
// -----
// CHECK-LABEL: func @dead_atomic_add
func.func @dead_atomic_add(%arg0: memref<4xf32>, %arg1: f32) {
// CHECK-NOT: amdgpu.raw_buffer_atomic_fadd
%c4_i32 = arith.constant 4 : i32
amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
func.return
}