[mlir][AMDGPU] Remove buffer ops that are statically out of bounds

When the bounds check attribute is true, the raw buffer load, store,
and atomic operations have well-defined behavior when an access falls
outside the bounds of the memory being accessed: loads return 0, and
stores and atomic updates are ignored.

Because LLVM currently implements these buffer operations as opaque
intrinsics, the backend cannot exploit this known behavior to eliminate
the memory operations itself. Therefore, use MLIR's canonicalization
system to remove buffer operations whose accesses are statically known
to be out of bounds.

Reviewed By: nirvedhmeshram

Differential Revision: https://reviews.llvm.org/D138146
Author: Krzysztof Drewniak
Date:   2022-11-09 22:26:33 +00:00
Parent: 3f9d64a2ad
Commit: d6abdf46bc

4 changed files with 235 additions and 0 deletions

@@ -23,6 +23,10 @@ def AMDGPU_Dialect : Dialect {
that will eventually be executed on AMD hardware.
}];
let dependentDialects = [
"arith::ArithDialect"
];
let useDefaultAttributePrinterParser = 1;
}
@@ -83,6 +87,7 @@ def AMDGPU_RawBufferLoadOp :
(`sgprOffset` $sgprOffset^)? `:`
type($memref) `,` type($indices) `->` type($value)
}];
let hasCanonicalizer = 1;
let hasVerifier = 1;
}
@@ -124,6 +129,7 @@ def AMDGPU_RawBufferStoreOp :
(`sgprOffset` $sgprOffset^)? `:`
type($value) `->` type($memref) `,` type($indices)
}];
let hasCanonicalizer = 1;
let hasVerifier = 1;
}
@@ -162,6 +168,7 @@ def AMDGPU_RawBufferAtomicFaddOp :
(`sgprOffset` $sgprOffset^)? `:`
type($value) `->` type($memref) `,` type($indices)
}];
let hasCanonicalizer = 1;
let hasVerifier = 1;
}

@@ -12,14 +12,19 @@
#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "llvm/ADT/TypeSwitch.h"
#include <limits>
using namespace mlir;
using namespace mlir::amdgpu;
@@ -62,6 +67,96 @@ LogicalResult RawBufferAtomicFaddOp::verify() {
  return verifyRawBufferOp(*this);
}

static Optional<uint32_t> getConstantUint32(Value v) {
  APInt cst;
  if (!v.getType().isInteger(32))
    return None;
  if (matchPattern(v, m_ConstantInt(&cst)))
    return cst.getZExtValue();
  return None;
}

template <typename OpType>
static bool staticallyOutOfBounds(OpType op) {
  if (!op.getBoundsCheck())
    return false;
  MemRefType bufferType = op.getMemref().getType();
  if (!bufferType.hasStaticShape())
    return false;
  int64_t offset;
  SmallVector<int64_t> strides;
  if (failed(getStridesAndOffset(bufferType, strides, offset)))
    return false;
  int64_t result = offset + op.getIndexOffset().value_or(0);
  if (op.getSgprOffset()) {
    Optional<uint32_t> sgprOffset = getConstantUint32(op.getSgprOffset());
    if (!sgprOffset)
      return false;
    result += *sgprOffset;
  }
  if (strides.size() != op.getIndices().size())
    return false;
  int64_t indexVal = 0;
  for (auto pair : llvm::zip(strides, op.getIndices())) {
    int64_t stride = std::get<0>(pair);
    Value idx = std::get<1>(pair);
    Optional<uint32_t> idxVal = getConstantUint32(idx);
    if (!idxVal)
      return false;
    indexVal += stride * idxVal.value();
  }
  result += indexVal;
  if (result > std::numeric_limits<uint32_t>::max())
    // Overflow means don't drop
    return false;
  return result >= bufferType.getNumElements();
}

namespace {
struct RemoveStaticallyOobBufferLoads final
    : public OpRewritePattern<RawBufferLoadOp> {
  using OpRewritePattern<RawBufferLoadOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(RawBufferLoadOp op,
                                PatternRewriter &rw) const override {
    if (!staticallyOutOfBounds(op))
      return failure();
    Type loadType = op.getResult().getType();
    rw.replaceOpWithNewOp<arith::ConstantOp>(op, loadType,
                                             rw.getZeroAttr(loadType));
    return success();
  }
};

template <typename OpType>
struct RemoveStaticallyOobBufferWrites final : public OpRewritePattern<OpType> {
  using OpRewritePattern<OpType>::OpRewritePattern;

  LogicalResult matchAndRewrite(OpType op, PatternRewriter &rw) const override {
    if (!staticallyOutOfBounds(op))
      return failure();
    rw.eraseOp(op);
    return success();
  }
};
} // end namespace

void RawBufferLoadOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                  MLIRContext *context) {
  results.add<RemoveStaticallyOobBufferLoads>(context);
}

void RawBufferStoreOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                   MLIRContext *context) {
  results.add<RemoveStaticallyOobBufferWrites<RawBufferStoreOp>>(context);
}

void RawBufferAtomicFaddOp::getCanonicalizationPatterns(
    RewritePatternSet &results, MLIRContext *context) {
  results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicFaddOp>>(context);
}
//===----------------------------------------------------------------------===//
// MFMAOp
//===----------------------------------------------------------------------===//
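
For intuition, the bound computation above mirrors what the known_oob_load_2d
test below exercises: a memref<4x4xf32> has strides [4, 1] and 16 elements, so
indices [4, 0] give an element offset of 4*4 + 1*0 = 16, which is >= 16 and
therefore statically out of bounds (a sketch; value names are illustrative):

  %c0 = arith.constant 0 : i32
  %c4 = arith.constant 4 : i32
  // Folded to arith.constant 0.0 : f32 by the pattern above.
  %v = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%c4, %c0] : memref<4x4xf32>, i32, i32 -> f32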

@@ -10,6 +10,7 @@ add_mlir_dialect_library(MLIRAMDGPUDialect
MLIRAMDGPUIncGen
LINK_LIBS PUBLIC
MLIRArithDialect
MLIRIR
MLIRSideEffectInterfaces
)

@@ -0,0 +1,132 @@
// RUN: mlir-opt %s -split-input-file -canonicalize | FileCheck %s
// CHECK-LABEL: func @known_oob_load
func.func @known_oob_load(%arg0: memref<4xf32>) -> f32 {
// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: return %[[zero]]
%c4_i32 = arith.constant 4 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @known_oob_load_2d
func.func @known_oob_load_2d(%arg0: memref<4x4xf32>) -> f32 {
// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: return %[[zero]]
%c0_i32 = arith.constant 0 : i32
%c4_i32 = arith.constant 4 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32, %c0_i32] : memref<4x4xf32>, i32, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @known_oob_load_2d_on_last
func.func @known_oob_load_2d_on_last(%arg0: memref<4x4xf32>) -> f32 {
// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: return %[[zero]]
%c0_i32 = arith.constant 0 : i32
%c16_i32 = arith.constant 16 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c16_i32] : memref<4x4xf32>, i32, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @known_oob_load_index
func.func @known_oob_load_index(%arg0: memref<4xf32>) -> f32 {
// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: return %[[zero]]
%c0_i32 = arith.constant 0 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 4 : i32} %arg0[%c0_i32] : memref<4xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @known_oob_load_sgproffset
func.func @known_oob_load_sgproffset(%arg0: memref<4xf32>) -> f32 {
// CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: return %[[zero]]
%c2_i32 = arith.constant 2 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c2_i32] sgprOffset %c2_i32 : memref<4xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @unknown_load
func.func @unknown_load(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
// CHECK: return %[[loaded]]
%c4_i32 = arith.constant 4 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%arg1] sgprOffset %c4_i32 : memref<4xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @unknown_load_sgproffset
func.func @unknown_load_sgproffset(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
// CHECK: return %[[loaded]]
%c4_i32 = arith.constant 4 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] sgprOffset %arg1 : memref<4xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @unranked
func.func @unranked(%arg0: memref<?xf32>) -> f32 {
// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
// CHECK: return %[[loaded]]
%c4_i32 = arith.constant 4 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<?xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @no_oob_check
func.func @no_oob_check(%arg0: memref<4xf32>) -> f32 {
// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
// CHECK: return %[[loaded]]
%c4_i32 = arith.constant 4 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = false} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @in_bounds_overall
func.func @in_bounds_overall(%arg0: memref<4x4xf32>) -> f32 {
// CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
// CHECK: return %[[loaded]]
%c0_i32 = arith.constant 0 : i32
%c15_i32 = arith.constant 15 : i32
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c15_i32] : memref<4x4xf32>, i32, i32 -> f32
func.return %0 : f32
}
// -----
// CHECK-LABEL: func @dead_store
func.func @dead_store(%arg0: memref<4xf32>, %arg1: f32) {
// CHECK-NOT: amdgpu.raw_buffer_store
%c4_i32 = arith.constant 4 : i32
amdgpu.raw_buffer_store {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
func.return
}
// -----
// CHECK-LABEL: func @dead_atomic_add
func.func @dead_atomic_add(%arg0: memref<4xf32>, %arg1: f32) {
// CHECK-NOT: amdgpu.raw_buffer_atomic_fadd
%c4_i32 = arith.constant 4 : i32
amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
func.return
}