[mlir:Transforms] Move out the remaining non-dialect independent transforms and utilities
This has been a major TODO for a very long time, and is necessary for establishing a proper dialect-free dependency layering for the Transforms library. Code was moved to effectively two main locations:

* Affine/
  There was quite a bit of affine dialect related code in Transforms/ due to historical reasons (dating back to MLIR's early days). The following headers were moved:
  Transforms/LoopFusionUtils.h -> Dialect/Affine/LoopFusionUtils.h
  Transforms/LoopUtils.h -> Dialect/Affine/LoopUtils.h
  Transforms/Utils.h -> Dialect/Affine/Utils.h

  The following transforms were also moved: AffineLoopFusion, AffinePipelineDataTransfer, LoopCoalescing

* SCF/
  Only one SCF pass was in Transforms/ (likely placed there accidentally): ParallelLoopCollapsing.
  The SCF-specific utilities in LoopUtils have been moved to SCF/Utils.h.

* Misc:
  mlir::moveLoopInvariantCode was also moved to LoopLikeInterface.h, given that it is a simple utility defined in terms of LoopLikeOpInterface.

Differential Revision: https://reviews.llvm.org/D117848
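For downstream code, the practical effect of the move is an include-path update. The sketch below is illustrative only: the new paths are inferred from the header guards introduced in this commit, and the exact spelling may differ in a particular tree.

```c++
// Before this change, these utilities lived under Transforms/:
//   #include "mlir/Transforms/LoopFusionUtils.h"
//   #include "mlir/Transforms/LoopUtils.h"
//   #include "mlir/Transforms/Utils.h"
// After this change, the affine-specific utilities live under the Affine dialect:
#include "mlir/Dialect/Affine/LoopFusionUtils.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Utils.h"
// The SCF-specific loop utilities moved into the SCF dialect:
#include "mlir/Dialect/SCF/Utils.h"
// moveLoopInvariantCode now lives next to the interface it is defined on:
#include "mlir/Interfaces/LoopLikeInterface.h"
```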
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
#define MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
#ifndef MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
#define MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H

#include "mlir/IR/Value.h"
#include "mlir/Support/LLVM.h"
@@ -167,4 +167,4 @@ void gatherProducerConsumerMemrefs(ArrayRef<Operation *> srcOps,
DenseSet<Value> &producerConsumerMemrefs);
} // namespace mlir

#endif // MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
#endif // MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
@@ -12,8 +12,8 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef MLIR_TRANSFORMS_LOOPUTILS_H
|
||||
#define MLIR_TRANSFORMS_LOOPUTILS_H
|
||||
#ifndef MLIR_DIALECT_AFFINE_LOOPUTILS_H
|
||||
#define MLIR_DIALECT_AFFINE_LOOPUTILS_H
|
||||
|
||||
#include "mlir/IR/Block.h"
|
||||
#include "mlir/Support/LLVM.h"
|
||||
@@ -45,9 +45,6 @@ LogicalResult loopUnrollFull(AffineForOp forOp);
|
||||
LogicalResult loopUnrollByFactor(
|
||||
AffineForOp forOp, uint64_t unrollFactor,
|
||||
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr);
|
||||
LogicalResult loopUnrollByFactor(
|
||||
scf::ForOp forOp, uint64_t unrollFactor,
|
||||
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr);
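As a rough usage sketch (not part of this change), the optional `annotateFn` callback can be used to tag each unrolled copy; the helper name and attribute name below are hypothetical.

```c++
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"

// Unroll an affine.for by 4 and record which unrolled iteration each op came
// from. Assumes `forOp` has positive bounds and step, as required above.
static mlir::LogicalResult unrollAndTag(mlir::AffineForOp forOp) {
  return mlir::loopUnrollByFactor(
      forOp, /*unrollFactor=*/4,
      [](unsigned iteration, mlir::Operation *op, mlir::OpBuilder b) {
        op->setAttr("unrolled_iteration", b.getI64IntegerAttr(iteration));
      });
}
```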
|
||||
|
||||
/// Unrolls this loop by the specified unroll factor or its trip count,
|
||||
/// whichever is lower.
|
||||
@@ -63,8 +60,6 @@ bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef<AffineForOp> loops);
|
||||
/// AffineForOp, and the second op is a terminator).
|
||||
void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
|
||||
AffineForOp root);
|
||||
void getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
|
||||
scf::ForOp root);
|
||||
|
||||
/// Unrolls and jams this loop by the specified factor. `forOp` can be a loop
|
||||
/// with iteration arguments performing supported reductions and its inner loops
|
||||
@@ -78,10 +73,9 @@ LogicalResult loopUnrollJamByFactor(AffineForOp forOp,
|
||||
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
|
||||
uint64_t unrollJamFactor);
|
||||
|
||||
/// Promotes the loop body of a AffineForOp/scf::ForOp to its containing block
|
||||
/// if the loop was known to have a single iteration.
|
||||
/// Promotes the loop body of a AffineForOp to its containing block if the loop
|
||||
/// was known to have a single iteration.
|
||||
LogicalResult promoteIfSingleIteration(AffineForOp forOp);
|
||||
LogicalResult promoteIfSingleIteration(scf::ForOp forOp);
|
||||
|
||||
/// Promotes all single iteration AffineForOp's in the Function, i.e., moves
|
||||
/// their body into the containing Block.
|
||||
@@ -146,13 +140,9 @@ AffineForOp sinkSequentialLoops(AffineForOp forOp);
|
||||
/// occurrence in `forOps`, under each of the `targets`.
|
||||
/// Returns the new AffineForOps, one per each of (`forOps`, `targets`) pair,
|
||||
/// nested immediately under each of `targets`.
|
||||
using Loops = SmallVector<scf::ForOp, 8>;
|
||||
using TileLoops = std::pair<Loops, Loops>;
|
||||
SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
|
||||
ArrayRef<uint64_t> sizes,
|
||||
ArrayRef<AffineForOp> targets);
|
||||
SmallVector<Loops, 8> tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
|
||||
ArrayRef<scf::ForOp> targets);
|
||||
|
||||
/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
|
||||
/// and sinking them, in their order of occurrence in `forOps`, under `target`.
|
||||
@@ -160,15 +150,6 @@ SmallVector<Loops, 8> tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
|
||||
/// `target`.
|
||||
SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
|
||||
ArrayRef<uint64_t> sizes, AffineForOp target);
|
||||
Loops tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
|
||||
scf::ForOp target);
|
||||
|
||||
/// Tile a nest of scf::ForOp loops rooted at `rootForOp` with the given
|
||||
/// (parametric) sizes. Sizes are expected to be strictly positive values at
|
||||
/// runtime. If more sizes than loops are provided, discard the trailing values
|
||||
/// in sizes. Assumes the loop nest is permutable.
|
||||
/// Returns the newly created intra-tile loops.
|
||||
Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes);
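A minimal sketch of the parametric-tiling entry point above (after this commit the scf::ForOp variants live in SCF/Utils.h); the helper name and the constant tile sizes are illustrative.

```c++
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"

// Tile a permutable 2-D scf.for nest rooted at `rootForOp` by 4x4 and return
// the newly created intra-tile loops.
static mlir::Loops tileByFour(mlir::scf::ForOp rootForOp) {
  mlir::OpBuilder b(rootForOp.getOperation());
  mlir::Location loc = rootForOp.getLoc();
  mlir::Value four = b.create<mlir::arith::ConstantIndexOp>(loc, 4);
  return mlir::tilePerfectlyNested(rootForOp, {four, four});
}
```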
|
||||
|
||||
/// Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
|
||||
struct AffineCopyOptions {
|
||||
@@ -236,16 +217,6 @@ LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
|
||||
const AffineCopyOptions ©Options,
|
||||
CopyGenerateResult &result);
|
||||
|
||||
/// Tile a nest of standard for loops rooted at `rootForOp` by finding such
|
||||
/// parametric tile sizes that the outer loops have a fixed number of iterations
|
||||
/// as defined in `sizes`.
|
||||
TileLoops extractFixedOuterLoops(scf::ForOp rootFOrOp, ArrayRef<int64_t> sizes);
|
||||
|
||||
/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
|
||||
/// `loops` contains a list of perfectly nested loops with bounds and steps
|
||||
/// independent of any loop induction variable involved in the nest.
|
||||
void coalesceLoops(MutableArrayRef<scf::ForOp> loops);
|
||||
|
||||
/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
|
||||
/// `loops` contains a list of perfectly nested loops outermost to innermost
|
||||
/// that are normalized (step one and lower bound of zero) and with bounds and
|
||||
@@ -254,12 +225,6 @@ void coalesceLoops(MutableArrayRef<scf::ForOp> loops);
|
||||
/// be representable using affine.for.
|
||||
LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);
|
||||
|
||||
/// Take the ParallelLoop and for each set of dimension indices, combine them
|
||||
/// into a single dimension. combinedDimensions must contain each index into
|
||||
/// loops exactly once.
|
||||
void collapseParallelLoops(scf::ParallelOp loops,
|
||||
ArrayRef<std::vector<unsigned>> combinedDimensions);
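For illustration, collapsing a 3-D scf.parallel down to two induction variables might look like the sketch below; the grouping of dimensions is an arbitrary example, and after this commit the utility is declared in SCF/Utils.h.

```c++
#include <vector>
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"

// Combine dimensions {0, 1} into one induction variable and keep {2} separate.
// Every dimension index must appear exactly once across `combined`.
static void collapseFirstTwoDims(mlir::scf::ParallelOp parallelOp) {
  std::vector<std::vector<unsigned>> combined = {{0, 1}, {2}};
  mlir::collapseParallelLoops(parallelOp, combined);
}
```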
|
||||
|
||||
/// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
|
||||
/// size given by `numProcessors`. This is achieved by embedding the SSA values
|
||||
/// corresponding to `processorIds` and `numProcessors` into the bounds and step
|
||||
@@ -321,9 +286,6 @@ LogicalResult
|
||||
separateFullTiles(MutableArrayRef<AffineForOp> nest,
|
||||
SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);
|
||||
|
||||
/// Move loop invariant code out of `looplike`.
|
||||
LogicalResult moveLoopInvariantCode(LoopLikeOpInterface looplike);
|
||||
|
||||
} // namespace mlir
|
||||
|
||||
#endif // MLIR_TRANSFORMS_LOOPUTILS_H
|
||||
#endif // MLIR_DIALECT_AFFINE_LOOPUTILS_H
|
||||
@@ -21,6 +21,10 @@ namespace mlir {
|
||||
|
||||
class AffineForOp;
|
||||
|
||||
/// Fusion mode to attempt. The default mode `Greedy` does both
|
||||
/// producer-consumer and sibling fusion.
|
||||
enum FusionMode { Greedy, ProducerConsumer, Sibling };
|
||||
|
||||
/// Creates a simplification pass for affine structures (maps and sets). In
|
||||
/// addition, this pass also normalizes memrefs to have the trivial (identity)
|
||||
/// layout map.
|
||||
@@ -53,6 +57,19 @@ std::unique_ptr<OperationPass<FuncOp>> createAffineDataCopyGenerationPass();
|
||||
/// dead allocs.
|
||||
std::unique_ptr<OperationPass<FuncOp>> createAffineScalarReplacementPass();
|
||||
|
||||
/// Creates a pass that transforms perfectly nested loops with independent
|
||||
/// bounds into a single loop.
|
||||
std::unique_ptr<OperationPass<FuncOp>> createLoopCoalescingPass();
|
||||
|
||||
/// Creates a loop fusion pass which fuses loops according to type of fusion
|
||||
/// specified in `fusionMode`. Buffers of size less than or equal to
|
||||
/// `localBufSizeThreshold` are promoted to memory space `fastMemorySpace`.
|
||||
std::unique_ptr<OperationPass<FuncOp>>
|
||||
createLoopFusionPass(unsigned fastMemorySpace = 0,
|
||||
uint64_t localBufSizeThreshold = 0,
|
||||
bool maximalFusion = false,
|
||||
enum FusionMode fusionMode = FusionMode::Greedy);
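A hedged sketch of how the relocated pass might be added to a FuncOp-nested pass pipeline, assuming the declarations above land in the Affine dialect's Passes.h as this diff suggests; the pipeline function itself is hypothetical.

```c++
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Pass/PassManager.h"

// Run only producer-consumer fusion, keeping the default buffer thresholds.
static void addFusionToPipeline(mlir::OpPassManager &funcPM) {
  funcPM.addPass(mlir::createLoopFusionPass(
      /*fastMemorySpace=*/0, /*localBufSizeThreshold=*/0,
      /*maximalFusion=*/false, mlir::FusionMode::ProducerConsumer));
}
```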
|
||||
|
||||
/// Creates a pass to perform tiling on loop nests.
|
||||
std::unique_ptr<OperationPass<FuncOp>>
|
||||
createLoopTilingPass(uint64_t cacheSizeBytes);
|
||||
@@ -76,6 +93,10 @@ std::unique_ptr<OperationPass<FuncOp>> createLoopUnrollPass(
|
||||
std::unique_ptr<OperationPass<FuncOp>>
|
||||
createLoopUnrollAndJamPass(int unrollJamFactor = -1);
|
||||
|
||||
/// Creates a pass to pipeline explicit movement of data across levels of the
|
||||
/// memory hierarchy.
|
||||
std::unique_ptr<OperationPass<FuncOp>> createPipelineDataTransferPass();
|
||||
|
||||
/// Creates a pass to vectorize loops, operations and data types using a
|
||||
/// target-independent, n-D super-vector abstraction.
|
||||
std::unique_ptr<OperationPass<FuncOp>>
|
||||
|
||||
@@ -43,6 +43,138 @@ def AffineDataCopyGeneration : Pass<"affine-data-copy-generate", "FuncOp"> {
|
||||
];
|
||||
}
|
||||
|
||||
def AffineLoopFusion : Pass<"affine-loop-fusion", "FuncOp"> {
|
||||
let summary = "Fuse affine loop nests";
|
||||
let description = [{
|
||||
This pass performs fusion of loop nests using a slicing-based approach. It
|
||||
combines two fusion strategies: producer-consumer fusion and sibling fusion.
|
||||
Producer-consumer fusion is aimed at fusing pairs of loops where the first
|
||||
one writes to a memref that the second reads. Sibling fusion targets pairs
|
||||
of loops that share no dependences between them but that load from the same
|
||||
memref. The fused loop nests, when possible, are rewritten to access
|
||||
significantly smaller local buffers instead of the original memref's, and
|
||||
the latter are often either completely optimized away or contracted. This
|
||||
transformation leads to enhanced locality and lower memory footprint through
|
||||
the elimination or contraction of temporaries/intermediate memref's. These
|
||||
benefits are sometimes achieved at the expense of redundant computation
|
||||
through a cost model that evaluates available choices such as the depth at
|
||||
which a source slice should be materialized in the destination slice.
|
||||
|
||||
Example 1: Producer-consumer fusion.
|
||||
Input:
|
||||
```mlir
|
||||
func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
|
||||
%0 = memref.alloc() : memref<10xf32>
|
||||
%1 = memref.alloc() : memref<10xf32>
|
||||
%cst = arith.constant 0.000000e+00 : f32
|
||||
affine.for %arg2 = 0 to 10 {
|
||||
affine.store %cst, %0[%arg2] : memref<10xf32>
|
||||
affine.store %cst, %1[%arg2] : memref<10xf32>
|
||||
}
|
||||
affine.for %arg2 = 0 to 10 {
|
||||
%2 = affine.load %0[%arg2] : memref<10xf32>
|
||||
%3 = arith.addf %2, %2 : f32
|
||||
affine.store %3, %arg0[%arg2] : memref<10xf32>
|
||||
}
|
||||
affine.for %arg2 = 0 to 10 {
|
||||
%2 = affine.load %1[%arg2] : memref<10xf32>
|
||||
%3 = arith.mulf %2, %2 : f32
|
||||
affine.store %3, %arg1[%arg2] : memref<10xf32>
|
||||
}
|
||||
return
|
||||
}
|
||||
```
|
||||
Output:
|
||||
```mlir
|
||||
func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
|
||||
%0 = memref.alloc() : memref<1xf32>
|
||||
%1 = memref.alloc() : memref<1xf32>
|
||||
%cst = arith.constant 0.000000e+00 : f32
|
||||
affine.for %arg2 = 0 to 10 {
|
||||
affine.store %cst, %0[0] : memref<1xf32>
|
||||
affine.store %cst, %1[0] : memref<1xf32>
|
||||
%2 = affine.load %1[0] : memref<1xf32>
|
||||
%3 = arith.mulf %2, %2 : f32
|
||||
affine.store %3, %arg1[%arg2] : memref<10xf32>
|
||||
%4 = affine.load %0[0] : memref<1xf32>
|
||||
%5 = arith.addf %4, %4 : f32
|
||||
affine.store %5, %arg0[%arg2] : memref<10xf32>
|
||||
}
|
||||
return
|
||||
}
|
||||
```
|
||||
|
||||
Example 2: Sibling fusion.
|
||||
Input:
|
||||
```mlir
|
||||
func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
|
||||
%arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
|
||||
%arg4: memref<10x10xf32>) {
|
||||
affine.for %arg5 = 0 to 3 {
|
||||
affine.for %arg6 = 0 to 3 {
|
||||
%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
|
||||
%1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
|
||||
%2 = arith.mulf %0, %1 : f32
|
||||
affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
|
||||
}
|
||||
}
|
||||
affine.for %arg5 = 0 to 3 {
|
||||
affine.for %arg6 = 0 to 3 {
|
||||
%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
|
||||
%1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
|
||||
%2 = arith.addf %0, %1 : f32
|
||||
affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32>
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
```
|
||||
Output:
|
||||
```mlir
|
||||
func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
|
||||
%arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
|
||||
%arg4: memref<10x10xf32>) {
|
||||
affine.for %arg5 = 0 to 3 {
|
||||
affine.for %arg6 = 0 to 3 {
|
||||
%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
|
||||
%1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
|
||||
%2 = arith.mulf %0, %1 : f32
|
||||
affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
|
||||
%3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
|
||||
%4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
|
||||
%5 = arith.addf %3, %4 : f32
|
||||
affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32>
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
```
|
||||
}];
|
||||
let constructor = "mlir::createLoopFusionPass()";
|
||||
let options = [
|
||||
Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",
|
||||
/*default=*/"0.30f", "Fractional increase in additional computation "
|
||||
"tolerated while fusing">,
|
||||
Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned",
|
||||
/*default=*/"0",
|
||||
"Faster memory space number to promote fusion buffers to">,
|
||||
Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t",
|
||||
/*default=*/"0", "Threshold size (KiB) for promoting local buffers "
|
||||
"to fast memory space">,
|
||||
Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
|
||||
"Enables maximal loop fusion">,
|
||||
Option<"affineFusionMode", "mode", "enum FusionMode",
|
||||
"mlir::FusionMode::Greedy", "fusion mode to attempt",
|
||||
"llvm::cl::values(clEnumValN(mlir::FusionMode::Greedy,"
|
||||
" \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), "
|
||||
"clEnumValN( mlir::FusionMode::ProducerConsumer, "
|
||||
"\"producer\", \"Perform only producer-consumer fusion\"), "
|
||||
"clEnumValN( mlir::FusionMode::Sibling, "
|
||||
"\"sibling\", \"Perform only sibling fusion\"))">,
|
||||
];
|
||||
let dependentDialects = ["memref::MemRefDialect"];
|
||||
}
|
||||
|
||||
def AffineLoopInvariantCodeMotion
|
||||
: Pass<"affine-loop-invariant-code-motion", "FuncOp"> {
|
||||
let summary = "Hoist loop invariant instructions outside of affine loops";
|
||||
@@ -94,6 +226,75 @@ def AffineLoopUnrollAndJam : Pass<"affine-loop-unroll-jam", "FuncOp"> {
|
||||
];
|
||||
}
|
||||
|
||||
def AffinePipelineDataTransfer
|
||||
: Pass<"affine-pipeline-data-transfer", "FuncOp"> {
|
||||
let summary = "Pipeline non-blocking data transfers between explicitly "
|
||||
"managed levels of the memory hierarchy";
|
||||
let description = [{
|
||||
This pass performs a transformation to overlap non-blocking DMA operations
|
||||
in a loop with computations through double buffering. This is achieved by
|
||||
advancing dma_start operations with respect to other operations.
|
||||
|
||||
Input
|
||||
|
||||
```mlir
|
||||
func @pipelinedatatransfer() {
|
||||
%0 = memref.alloc() : memref<256xf32>
|
||||
%1 = memref.alloc() : memref<32xf32, 1>
|
||||
%2 = memref.alloc() : memref<1xf32>
|
||||
%c0 = arith.constant 0 : index
|
||||
%c128 = arith.constant 128 : index
|
||||
affine.for %i0 = 0 to 8 {
|
||||
affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32>
|
||||
affine.dma_wait %2[%c0], %c128 : memref<1xf32>
|
||||
%3 = affine.load %1[%i0] : memref<32xf32, 1>
|
||||
%4 = "compute"(%3) : (f32) -> f32
|
||||
affine.store %4, %1[%i0] : memref<32xf32, 1>
|
||||
}
|
||||
return
|
||||
}
|
||||
```
|
||||
|
||||
Output
|
||||
|
||||
```mlir
|
||||
module {
|
||||
func @pipelinedatatransfer() {
|
||||
%c8 = arith.constant 8 : index
|
||||
%c0 = arith.constant 0 : index
|
||||
%0 = memref.alloc() : memref<256xf32>
|
||||
%c0_0 = arith.constant 0 : index
|
||||
%c128 = arith.constant 128 : index
|
||||
%1 = memref.alloc() : memref<2x32xf32, 1>
|
||||
%2 = memref.alloc() : memref<2x1xf32>
|
||||
affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
|
||||
affine.for %arg0 = 1 to 8 {
|
||||
affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
|
||||
%8 = affine.apply #map3(%arg0)
|
||||
%9 = affine.apply #map4(%8)
|
||||
%10 = affine.apply #map4(%8)
|
||||
affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
|
||||
%11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1>
|
||||
%12 = "compute"(%11) : (f32) -> f32
|
||||
affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1>
|
||||
}
|
||||
%3 = affine.apply #map3(%c8)
|
||||
%4 = affine.apply #map4(%3)
|
||||
%5 = affine.apply #map4(%3)
|
||||
affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
|
||||
%6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1>
|
||||
%7 = "compute"(%6) : (f32) -> f32
|
||||
affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1>
|
||||
memref.dealloc %2 : memref<2x1xf32>
|
||||
memref.dealloc %1 : memref<2x32xf32, 1>
|
||||
return
|
||||
}
|
||||
}
|
||||
```
|
||||
}];
|
||||
let constructor = "mlir::createPipelineDataTransferPass()";
|
||||
}
|
||||
|
||||
def AffineScalarReplacement : Pass<"affine-scalrep", "FuncOp"> {
|
||||
let summary = "Replace affine memref acceses by scalars by forwarding stores "
|
||||
"to loads and eliminating redundant loads";
|
||||
@@ -184,6 +385,13 @@ def AffineLoopNormalize : Pass<"affine-loop-normalize", "FuncOp"> {
|
||||
let constructor = "mlir::createAffineLoopNormalizePass()";
|
||||
}
|
||||
|
||||
def LoopCoalescing : Pass<"loop-coalescing", "FuncOp"> {
|
||||
let summary = "Coalesce nested loops with independent bounds into a single "
|
||||
"loop";
|
||||
let constructor = "mlir::createLoopCoalescingPass()";
|
||||
let dependentDialects = ["arith::ArithmeticDialect"];
|
||||
}
|
||||
|
||||
def SimplifyAffineStructures : Pass<"simplify-affine-structures", "FuncOp"> {
|
||||
let summary = "Simplify affine expressions in maps/sets and normalize "
|
||||
"memrefs";
|
||||
|
||||
@@ -24,6 +24,10 @@ class DominanceInfo;
|
||||
class Operation;
|
||||
class PostDominanceInfo;
|
||||
|
||||
namespace memref {
|
||||
class AllocOp;
|
||||
} // namespace memref
|
||||
|
||||
struct LogicalResult;
|
||||
|
||||
using ReductionLoopMap = DenseMap<Operation *, SmallVector<LoopReduction, 2>>;
|
||||
@@ -168,6 +172,121 @@ void normalizeAffineFor(AffineForOp op);
|
||||
AffineExpr substWithMin(AffineExpr e, AffineExpr dim, AffineExpr min,
|
||||
AffineExpr max, bool positivePath = true);
|
||||
|
||||
/// Replaces all "dereferencing" uses of `oldMemRef` with `newMemRef` while
|
||||
/// optionally remapping the old memref's indices using the supplied affine map,
|
||||
/// `indexRemap`. The new memref could be of a different shape or rank.
|
||||
/// `extraIndices` provides any additional access indices to be added to the
|
||||
/// start.
|
||||
///
|
||||
/// `indexRemap` remaps indices of the old memref access to a new set of indices
|
||||
/// that are used to index the memref. Additional input operands to indexRemap
|
||||
/// can be optionally provided in `extraOperands`, and they occupy the start
|
||||
/// of its input list. `indexRemap`'s dimensional inputs are expected to
|
||||
/// correspond to memref's indices, and its symbolic inputs if any should be
|
||||
/// provided in `symbolOperands`.
|
||||
///
|
||||
/// `domOpFilter`, if non-null, restricts the replacement to only those
|
||||
/// operations that are dominated by the former; similarly, `postDomOpFilter`
|
||||
/// restricts replacement to only those operations that are postdominated by it.
|
||||
///
|
||||
/// 'allowNonDereferencingOps', if set, allows replacement of non-dereferencing
|
||||
/// uses of a memref without any requirement for access index rewrites as long
|
||||
/// as the user operation has the MemRefsNormalizable trait. The default value
|
||||
/// of this flag is false.
|
||||
///
|
||||
/// 'replaceInDeallocOp', if set, lets DeallocOp, a non-dereferencing user, to
|
||||
/// also be a candidate for replacement. The default value of this flag is
|
||||
/// false.
|
||||
///
|
||||
/// Returns true on success and false if the replacement is not possible,
|
||||
/// whenever a memref is used as an operand in a non-dereferencing context and
|
||||
/// 'allowNonDereferencingOps' is false, except for dealloc's on the memref
|
||||
/// which are left untouched. See comments at function definition for an
|
||||
/// example.
|
||||
//
|
||||
// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
|
||||
// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
|
||||
// index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap = (d0,
|
||||
// d1, d2) -> (d0 - d1, d2), and %ii will be the extra operand. Without any
|
||||
// extra operands, note that 'indexRemap' would just be applied to existing
|
||||
// indices (%i, %j).
|
||||
// TODO: allow extraIndices to be added at any position.
|
||||
LogicalResult replaceAllMemRefUsesWith(
|
||||
Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices = {},
|
||||
AffineMap indexRemap = AffineMap(), ArrayRef<Value> extraOperands = {},
|
||||
ArrayRef<Value> symbolOperands = {}, Operation *domOpFilter = nullptr,
|
||||
Operation *postDomOpFilter = nullptr, bool allowNonDereferencingOps = false,
|
||||
bool replaceInDeallocOp = false);
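As a minimal sketch, the simplest call replaces one memref with a shape-compatible one and leaves every optional argument at its default; the wrapper name is illustrative.

```c++
#include "mlir/Dialect/Affine/Utils.h"

// Replace all dereferencing uses of `oldMemRef` with `newMemRef`, with no
// index remapping and no dominance filtering. Fails if a non-dereferencing
// use (other than dealloc) is encountered, since allowNonDereferencingOps
// defaults to false.
static mlir::LogicalResult swapBuffers(mlir::Value oldMemRef,
                                       mlir::Value newMemRef) {
  return mlir::replaceAllMemRefUsesWith(oldMemRef, newMemRef);
}
```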
|
||||
|
||||
/// Performs the same replacement as the other version above but only for the
|
||||
/// dereferencing uses of `oldMemRef` in `op`, except in cases where
|
||||
/// 'allowNonDereferencingOps' is set to true where we replace the
|
||||
/// non-dereferencing uses as well.
|
||||
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
|
||||
Operation *op,
|
||||
ArrayRef<Value> extraIndices = {},
|
||||
AffineMap indexRemap = AffineMap(),
|
||||
ArrayRef<Value> extraOperands = {},
|
||||
ArrayRef<Value> symbolOperands = {},
|
||||
bool allowNonDereferencingOps = false);
|
||||
|
||||
/// Rewrites the memref defined by this alloc op to have an identity layout map
|
||||
/// and updates all its indexing uses. Returns failure if any of its uses
|
||||
/// escape (while leaving the IR in a valid state).
|
||||
LogicalResult normalizeMemRef(memref::AllocOp *op);
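A small illustrative wrapper (names are hypothetical) showing the expected calling convention, i.e. passing the alloc op by address:

```c++
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"

// Rewrite `allocOp`'s memref to an identity layout and update its users;
// returns failure if any use escapes.
static mlir::LogicalResult normalizeAlloc(mlir::memref::AllocOp allocOp) {
  return mlir::normalizeMemRef(&allocOp);
}
```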
|
||||
|
||||
/// Uses the old memref type map layout and computes the new memref type to have
|
||||
/// a new shape and a layout map, where the old layout map has been normalized
|
||||
/// to an identity layout map. It returns the old memref in case no
|
||||
/// normalization was needed or a failure occurs while transforming the old map
|
||||
/// layout to an identity layout map.
|
||||
MemRefType normalizeMemRefType(MemRefType memrefType, OpBuilder builder,
|
||||
unsigned numSymbolicOperands);
|
||||
|
||||
/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
|
||||
/// its results equal to the number of operands, as a composition
|
||||
/// of all other AffineApplyOps reachable from input parameter 'operands'. If
|
||||
/// different operands were drawing results from multiple affine apply ops,
|
||||
/// these will also be collected into a single (multi-result) affine apply op.
|
||||
/// The final results of the composed AffineApplyOp are returned in output
|
||||
/// parameter 'results'. Returns the affine apply op created.
|
||||
Operation *createComposedAffineApplyOp(OpBuilder &builder, Location loc,
|
||||
ArrayRef<Value> operands,
|
||||
ArrayRef<Operation *> affineApplyOps,
|
||||
SmallVectorImpl<Value> *results);
|
||||
|
||||
/// Given an operation, inserts one or more single result affine apply
|
||||
/// operations, results of which are exclusively used by this operation.
|
||||
/// The operands of these newly created affine apply ops are
|
||||
/// guaranteed to be loop iterators or terminal symbols of a function.
|
||||
///
|
||||
/// Before
|
||||
///
|
||||
/// affine.for %i = 0 to #map(%N)
|
||||
/// %idx = affine.apply (d0) -> (d0 mod 2) (%i)
|
||||
/// send %A[%idx], ...
|
||||
/// %v = "compute"(%idx, ...)
|
||||
///
|
||||
/// After
|
||||
///
|
||||
/// affine.for %i = 0 to #map(%N)
|
||||
/// %idx = affine.apply (d0) -> (d0 mod 2) (%i)
|
||||
/// send %A[%idx], ...
|
||||
/// %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
|
||||
/// %v = "compute"(%idx_, ...)
|
||||
|
||||
/// This allows the application of different transformations on send and
|
||||
/// compute (for eg. different shifts/delays)
|
||||
///
|
||||
/// Fills `sliceOps` with the list of affine.apply operations.
|
||||
/// In the following cases, `sliceOps` remains empty:
|
||||
/// 1. If none of opInst's operands were the result of an affine.apply
|
||||
/// (i.e., there was no affine computation slice to create).
|
||||
/// 2. If all the affine.apply op's supplying operands to this opInst did not
|
||||
/// have any uses other than those in this opInst.
|
||||
void createAffineComputationSlice(Operation *opInst,
|
||||
SmallVectorImpl<AffineApplyOp> *sliceOps);
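An illustrative call (with an assumed helper name) that matches the Before/After example above:

```c++
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "llvm/ADT/SmallVector.h"

// Give `op` its own affine.apply slice so it can be transformed independently
// of the other users of those affine.apply results.
static void sliceComputationFor(mlir::Operation *op) {
  llvm::SmallVector<mlir::AffineApplyOp, 4> sliceOps;
  mlir::createAffineComputationSlice(op, &sliceOps);
  // `sliceOps` stays empty if no operand of `op` came from an affine.apply, or
  // if those affine.apply ops had no uses outside of `op`.
}
```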
|
||||
|
||||
} // namespace mlir
|
||||
|
||||
#endif // MLIR_DIALECT_AFFINE_UTILS_H
|
||||
|
||||
@@ -32,6 +32,10 @@ std::unique_ptr<Pass> createForLoopPeelingPass();
|
||||
/// inside of scf.for loops with known lower and upper bounds.
|
||||
std::unique_ptr<Pass> createSCFForLoopCanonicalizationPass();
|
||||
|
||||
/// Creates a pass that transforms a single ParallelLoop over N induction
|
||||
/// variables into another ParallelLoop over less than N induction variables.
|
||||
std::unique_ptr<Pass> createParallelLoopCollapsingPass();
|
||||
|
||||
/// Creates a loop fusion pass which fuses parallel loops.
|
||||
std::unique_ptr<Pass> createParallelLoopFusionPass();
|
||||
|
||||
|
||||
@@ -52,6 +52,22 @@ def SCFParallelLoopFusion : Pass<"parallel-loop-fusion"> {
|
||||
let constructor = "mlir::createParallelLoopFusionPass()";
|
||||
}
|
||||
|
||||
def SCFParallelLoopCollapsing : Pass<"parallel-loop-collapsing"> {
|
||||
let summary = "Collapse parallel loops to use less induction variables";
|
||||
let constructor = "mlir::createParallelLoopCollapsingPass()";
|
||||
let options = [
|
||||
ListOption<"clCollapsedIndices0", "collapsed-indices-0", "unsigned",
|
||||
"Which loop indices to combine 0th loop index",
|
||||
"llvm::cl::MiscFlags::CommaSeparated">,
|
||||
ListOption<"clCollapsedIndices1", "collapsed-indices-1", "unsigned",
|
||||
"Which loop indices to combine into the position 1 loop index",
|
||||
"llvm::cl::MiscFlags::CommaSeparated">,
|
||||
ListOption<"clCollapsedIndices2", "collapsed-indices-2", "unsigned",
|
||||
"Which loop indices to combine into the position 2 loop index",
|
||||
"llvm::cl::MiscFlags::CommaSeparated">,
|
||||
];
|
||||
}
|
||||
|
||||
def SCFParallelLoopSpecialization
|
||||
: Pass<"parallel-loop-specialization", "FuncOp"> {
|
||||
let summary = "Specialize parallel loops for vectorization";
|
||||
|
||||
@@ -98,5 +98,65 @@ getSCFMinMaxExpr(Value value, SmallVectorImpl<Value> &dims,
|
||||
SmallVectorImpl<Value> &symbols,
|
||||
llvm::function_ref<bool(Operation *)> loopFilter = nullptr);
|
||||
|
||||
/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
|
||||
/// `loops` contains a list of perfectly nested loops with bounds and steps
|
||||
/// independent of any loop induction variable involved in the nest.
|
||||
void coalesceLoops(MutableArrayRef<scf::ForOp> loops);
|
||||
|
||||
/// Take the ParallelLoop and for each set of dimension indices, combine them
|
||||
/// into a single dimension. combinedDimensions must contain each index into
|
||||
/// loops exactly once.
|
||||
void collapseParallelLoops(scf::ParallelOp loops,
|
||||
ArrayRef<std::vector<unsigned>> combinedDimensions);
|
||||
|
||||
/// Promotes the loop body of a scf::ForOp to its containing block if the loop
|
||||
/// was known to have a single iteration.
|
||||
LogicalResult promoteIfSingleIteration(scf::ForOp forOp);
|
||||
|
||||
/// Unrolls this for operation by the specified unroll factor. Returns failure
|
||||
/// if the loop cannot be unrolled either due to restrictions or due to invalid
|
||||
/// unroll factors. Requires positive loop bounds and step. If specified,
|
||||
/// annotates the Ops in each unrolled iteration by applying `annotateFn`.
|
||||
LogicalResult loopUnrollByFactor(
|
||||
scf::ForOp forOp, uint64_t unrollFactor,
|
||||
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr);
|
||||
|
||||
/// Tile a nest of standard for loops rooted at `rootForOp` by finding such
|
||||
/// parametric tile sizes that the outer loops have a fixed number of iterations
|
||||
/// as defined in `sizes`.
|
||||
using Loops = SmallVector<scf::ForOp, 8>;
|
||||
using TileLoops = std::pair<Loops, Loops>;
|
||||
TileLoops extractFixedOuterLoops(scf::ForOp rootFOrOp, ArrayRef<int64_t> sizes);
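A brief sketch of the fixed-outer-trip-count tiling above; the target iteration counts are arbitrary illustrative values.

```c++
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"

// Compute parametric tile sizes so that the two outermost loops of the nest
// rooted at `rootForOp` execute 8 and 4 iterations respectively, and return
// the resulting (outer band, inner band) pair.
static mlir::TileLoops tileToFixedOuterTrips(mlir::scf::ForOp rootForOp) {
  return mlir::extractFixedOuterLoops(rootForOp, /*sizes=*/{8, 4});
}
```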
|
||||
|
||||
/// Performs tiling of imperfectly nested loops (with interchange) by
|
||||
/// strip-mining the `forOps` by `sizes` and sinking them, in their order of
|
||||
/// occurrence in `forOps`, under each of the `targets`.
|
||||
/// Returns the new AffineForOps, one per each of (`forOps`, `targets`) pair,
|
||||
/// nested immediately under each of `targets`.
|
||||
SmallVector<Loops, 8> tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
|
||||
ArrayRef<scf::ForOp> targets);
|
||||
|
||||
/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
|
||||
/// and sinking them, in their order of occurrence in `forOps`, under `target`.
|
||||
/// Returns the new AffineForOps, one per `forOps`, nested immediately under
|
||||
/// `target`.
|
||||
Loops tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
|
||||
scf::ForOp target);
|
||||
|
||||
/// Tile a nest of scf::ForOp loops rooted at `rootForOp` with the given
|
||||
/// (parametric) sizes. Sizes are expected to be strictly positive values at
|
||||
/// runtime. If more sizes than loops are provided, discard the trailing values
|
||||
/// in sizes. Assumes the loop nest is permutable.
|
||||
/// Returns the newly created intra-tile loops.
|
||||
Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes);
|
||||
|
||||
/// Get perfectly nested sequence of loops starting at root of loop nest
|
||||
/// (the first op being another AffineFor, and the second op - a terminator).
|
||||
/// A loop is perfectly nested iff: the first op in the loop's body is another
|
||||
/// AffineForOp, and the second op is a terminator).
|
||||
void getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
|
||||
scf::ForOp root);
|
||||
|
||||
} // namespace mlir
|
||||
|
||||
#endif // MLIR_DIALECT_SCF_UTILS_H_
|
||||
|
||||
@@ -15,7 +15,20 @@
|
||||
|
||||
#include "mlir/IR/OpDefinition.h"
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// LoopLike Interfaces
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
/// Include the generated interface declarations.
|
||||
#include "mlir/Interfaces/LoopLikeInterface.h.inc"
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// LoopLike Utilities
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
namespace mlir {
|
||||
/// Move loop invariant code out of a `looplike` operation.
|
||||
LogicalResult moveLoopInvariantCode(LoopLikeOpInterface looplike);
|
||||
} // namespace mlir
|
||||
|
||||
#endif // MLIR_INTERFACES_LOOPLIKEINTERFACE_H_
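Because the utility is now declared next to LoopLikeOpInterface, it can be used with nothing more than the interface header. A minimal sketch (the helper name is hypothetical):

```c++
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/LoopLikeInterface.h"

// Hoist loop-invariant code out of every loop-like op nested under `root`,
// regardless of which dialect the loops belong to.
static void hoistAllLoopInvariants(mlir::Operation *root) {
  root->walk([](mlir::LoopLikeOpInterface loopLike) {
    (void)mlir::moveLoopInvariantCode(loopLike);
  });
}
```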
|
||||
|
||||
mlir/include/mlir/Transforms/ControlFlowSinkUtils.h (new file, 70 lines)
@@ -0,0 +1,70 @@
|
||||
//===- ControlFlowSinkUtils.h - ControlFlow Sink Utils ----------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef MLIR_TRANSFORMS_CONTROLFLOWSINKUTILS_H
|
||||
#define MLIR_TRANSFORMS_CONTROLFLOWSINKUTILS_H
|
||||
|
||||
#include "mlir/Support/LLVM.h"
|
||||
|
||||
namespace mlir {
|
||||
|
||||
class DominanceInfo;
|
||||
class Operation;
|
||||
class Region;
|
||||
class RegionBranchOpInterface;
|
||||
|
||||
/// Given a list of regions, perform control flow sinking on them. For each
|
||||
/// region, control-flow sinking moves operations that dominate the region but
|
||||
/// whose only users are in the region into the regions so that they aren't
|
||||
/// executed on paths where their results are not needed.
|
||||
///
|
||||
/// TODO: For the moment, this is a *simple* control-flow sink, i.e., no
|
||||
/// duplicating of ops. It should be made to accept a cost model to determine
|
||||
/// whether duplicating a particular op is profitable.
|
||||
///
|
||||
/// Example:
|
||||
///
|
||||
/// ```mlir
|
||||
/// %0 = arith.addi %arg0, %arg1
|
||||
/// scf.if %cond {
|
||||
/// scf.yield %0
|
||||
/// } else {
|
||||
/// scf.yield %arg2
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
/// After control-flow sink:
|
||||
///
|
||||
/// ```mlir
|
||||
/// scf.if %cond {
|
||||
/// %0 = arith.addi %arg0, %arg1
|
||||
/// scf.yield %0
|
||||
/// } else {
|
||||
/// scf.yield %arg2
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
/// Users must supply a callback `shouldMoveIntoRegion` that determines whether
|
||||
/// the given operation that only has users in the given operation should be
|
||||
/// moved into that region.
|
||||
///
|
||||
/// Returns the number of operations sunk.
|
||||
size_t
|
||||
controlFlowSink(ArrayRef<Region *> regions, DominanceInfo &domInfo,
|
||||
function_ref<bool(Operation *, Region *)> shouldMoveIntoRegion);
|
||||
|
||||
/// Populates `regions` with regions of the provided region branch op that are
|
||||
/// executed at most once and that are reachable given the current operands of
|
||||
/// the op. These regions can be passed to `controlFlowSink` to perform sinking
|
||||
/// on the regions of the operation.
|
||||
void getSinglyExecutedRegionsToSink(RegionBranchOpInterface branch,
|
||||
SmallVectorImpl<Region *> ®ions);
|
||||
|
||||
} // namespace mlir
|
||||
|
||||
#endif // MLIR_TRANSFORMS_CONTROLFLOWSINKUTILS_H
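A sketch combining the two new utilities; the sinking policy shown (only constant-like ops) is deliberately conservative and purely illustrative.

```c++
#include "mlir/IR/Dominance.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Transforms/ControlFlowSinkUtils.h"
#include "llvm/ADT/SmallVector.h"

// Sink constant-like ops into the regions of `branch` that are executed at
// most once; returns the number of ops moved.
static size_t sinkConstantsInto(mlir::RegionBranchOpInterface branch,
                                mlir::DominanceInfo &domInfo) {
  llvm::SmallVector<mlir::Region *> regions;
  mlir::getSinglyExecutedRegionsToSink(branch, regions);
  return mlir::controlFlowSink(
      regions, domInfo, [](mlir::Operation *op, mlir::Region *) {
        return op->hasTrait<mlir::OpTrait::ConstantLike>();
      });
}
```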
|
||||
@@ -22,13 +22,8 @@
|
||||
|
||||
namespace mlir {
|
||||
|
||||
class AffineForOp;
|
||||
class GreedyRewriteConfig;
|
||||
|
||||
/// Fusion mode to attempt. The default mode `Greedy` does both
|
||||
/// producer-consumer and sibling fusion.
|
||||
enum FusionMode { Greedy, ProducerConsumer, Sibling };
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Passes
|
||||
//===----------------------------------------------------------------------===//
|
||||
@@ -56,31 +51,10 @@ std::unique_ptr<Pass> createControlFlowSinkPass();
|
||||
/// Creates a pass to perform common sub expression elimination.
|
||||
std::unique_ptr<Pass> createCSEPass();
|
||||
|
||||
/// Creates a loop fusion pass which fuses loops according to type of fusion
|
||||
/// specified in `fusionMode`. Buffers of size less than or equal to
|
||||
/// `localBufSizeThreshold` are promoted to memory space `fastMemorySpace`.
|
||||
std::unique_ptr<OperationPass<FuncOp>>
|
||||
createLoopFusionPass(unsigned fastMemorySpace = 0,
|
||||
uint64_t localBufSizeThreshold = 0,
|
||||
bool maximalFusion = false,
|
||||
enum FusionMode fusionMode = FusionMode::Greedy);
|
||||
|
||||
/// Creates a loop invariant code motion pass that hoists loop invariant
|
||||
/// instructions out of the loop.
|
||||
std::unique_ptr<Pass> createLoopInvariantCodeMotionPass();
|
||||
|
||||
/// Creates a pass to pipeline explicit movement of data across levels of the
|
||||
/// memory hierarchy.
|
||||
std::unique_ptr<OperationPass<FuncOp>> createPipelineDataTransferPass();
|
||||
|
||||
/// Creates a pass that transforms perfectly nested loops with independent
|
||||
/// bounds into a single loop.
|
||||
std::unique_ptr<OperationPass<FuncOp>> createLoopCoalescingPass();
|
||||
|
||||
/// Creates a pass that transforms a single ParallelLoop over N induction
|
||||
/// variables into another ParallelLoop over less than N induction variables.
|
||||
std::unique_ptr<Pass> createParallelLoopCollapsingPass();
|
||||
|
||||
/// Creates a pass to strip debug information from a function.
|
||||
std::unique_ptr<Pass> createStripDebugInfoPass();
|
||||
|
||||
|
||||
@@ -16,207 +16,6 @@
|
||||
include "mlir/Pass/PassBase.td"
|
||||
include "mlir/Rewrite/PassUtil.td"
|
||||
|
||||
def AffineLoopFusion : Pass<"affine-loop-fusion", "FuncOp"> {
|
||||
let summary = "Fuse affine loop nests";
|
||||
let description = [{
|
||||
This pass performs fusion of loop nests using a slicing-based approach. It
|
||||
combines two fusion strategies: producer-consumer fusion and sibling fusion.
|
||||
Producer-consumer fusion is aimed at fusing pairs of loops where the first
|
||||
one writes to a memref that the second reads. Sibling fusion targets pairs
|
||||
of loops that share no dependences between them but that load from the same
|
||||
memref. The fused loop nests, when possible, are rewritten to access
|
||||
significantly smaller local buffers instead of the original memref's, and
|
||||
the latter are often either completely optimized away or contracted. This
|
||||
transformation leads to enhanced locality and lower memory footprint through
|
||||
the elimination or contraction of temporaries/intermediate memref's. These
|
||||
benefits are sometimes achieved at the expense of redundant computation
|
||||
through a cost model that evaluates available choices such as the depth at
|
||||
which a source slice should be materialized in the destination slice.
|
||||
|
||||
Example 1: Producer-consumer fusion.
|
||||
Input:
|
||||
```mlir
|
||||
func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
|
||||
%0 = memref.alloc() : memref<10xf32>
|
||||
%1 = memref.alloc() : memref<10xf32>
|
||||
%cst = arith.constant 0.000000e+00 : f32
|
||||
affine.for %arg2 = 0 to 10 {
|
||||
affine.store %cst, %0[%arg2] : memref<10xf32>
|
||||
affine.store %cst, %1[%arg2] : memref<10xf32>
|
||||
}
|
||||
affine.for %arg2 = 0 to 10 {
|
||||
%2 = affine.load %0[%arg2] : memref<10xf32>
|
||||
%3 = arith.addf %2, %2 : f32
|
||||
affine.store %3, %arg0[%arg2] : memref<10xf32>
|
||||
}
|
||||
affine.for %arg2 = 0 to 10 {
|
||||
%2 = affine.load %1[%arg2] : memref<10xf32>
|
||||
%3 = arith.mulf %2, %2 : f32
|
||||
affine.store %3, %arg1[%arg2] : memref<10xf32>
|
||||
}
|
||||
return
|
||||
}
|
||||
```
|
||||
Output:
|
||||
```mlir
|
||||
func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
|
||||
%0 = memref.alloc() : memref<1xf32>
|
||||
%1 = memref.alloc() : memref<1xf32>
|
||||
%cst = arith.constant 0.000000e+00 : f32
|
||||
affine.for %arg2 = 0 to 10 {
|
||||
affine.store %cst, %0[0] : memref<1xf32>
|
||||
affine.store %cst, %1[0] : memref<1xf32>
|
||||
%2 = affine.load %1[0] : memref<1xf32>
|
||||
%3 = arith.mulf %2, %2 : f32
|
||||
affine.store %3, %arg1[%arg2] : memref<10xf32>
|
||||
%4 = affine.load %0[0] : memref<1xf32>
|
||||
%5 = arith.addf %4, %4 : f32
|
||||
affine.store %5, %arg0[%arg2] : memref<10xf32>
|
||||
}
|
||||
return
|
||||
}
|
||||
```
|
||||
|
||||
Example 2: Sibling fusion.
|
||||
Input:
|
||||
```mlir
|
||||
func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
|
||||
%arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
|
||||
%arg4: memref<10x10xf32>) {
|
||||
affine.for %arg5 = 0 to 3 {
|
||||
affine.for %arg6 = 0 to 3 {
|
||||
%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
|
||||
%1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
|
||||
%2 = arith.mulf %0, %1 : f32
|
||||
affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
|
||||
}
|
||||
}
|
||||
affine.for %arg5 = 0 to 3 {
|
||||
affine.for %arg6 = 0 to 3 {
|
||||
%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
|
||||
%1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
|
||||
%2 = arith.addf %0, %1 : f32
|
||||
affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32>
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
```
|
||||
Output:
|
||||
```mlir
|
||||
func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
|
||||
%arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
|
||||
%arg4: memref<10x10xf32>) {
|
||||
affine.for %arg5 = 0 to 3 {
|
||||
affine.for %arg6 = 0 to 3 {
|
||||
%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
|
||||
%1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
|
||||
%2 = arith.mulf %0, %1 : f32
|
||||
affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
|
||||
%3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
|
||||
%4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
|
||||
%5 = arith.addf %3, %4 : f32
|
||||
affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32>
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
```
|
||||
}];
|
||||
let constructor = "mlir::createLoopFusionPass()";
|
||||
let options = [
|
||||
Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",
|
||||
/*default=*/"0.30f", "Fractional increase in additional computation "
|
||||
"tolerated while fusing">,
|
||||
Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned",
|
||||
/*default=*/"0",
|
||||
"Faster memory space number to promote fusion buffers to">,
|
||||
Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t",
|
||||
/*default=*/"0", "Threshold size (KiB) for promoting local buffers "
|
||||
"to fast memory space">,
|
||||
Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
|
||||
"Enables maximal loop fusion">,
|
||||
Option<"affineFusionMode", "mode", "enum FusionMode",
|
||||
"mlir::FusionMode::Greedy", "fusion mode to attempt",
|
||||
"llvm::cl::values(clEnumValN(mlir::FusionMode::Greedy,"
|
||||
" \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), "
|
||||
"clEnumValN( mlir::FusionMode::ProducerConsumer, "
|
||||
"\"producer\", \"Perform only producer-consumer fusion\"), "
|
||||
"clEnumValN( mlir::FusionMode::Sibling, "
|
||||
"\"sibling\", \"Perform only sibling fusion\"))">,
|
||||
];
|
||||
let dependentDialects = ["memref::MemRefDialect"];
|
||||
}
|
||||
|
||||
def AffinePipelineDataTransfer
|
||||
: Pass<"affine-pipeline-data-transfer", "FuncOp"> {
|
||||
let summary = "Pipeline non-blocking data transfers between explicitly "
|
||||
"managed levels of the memory hierarchy";
|
||||
let description = [{
|
||||
This pass performs a transformation to overlap non-blocking DMA operations
|
||||
in a loop with computations through double buffering. This is achieved by
|
||||
advancing dma_start operations with respect to other operations.
|
||||
|
||||
Input
|
||||
|
||||
```mlir
|
||||
func @pipelinedatatransfer() {
|
||||
%0 = memref.alloc() : memref<256xf32>
|
||||
%1 = memref.alloc() : memref<32xf32, 1>
|
||||
%2 = memref.alloc() : memref<1xf32>
|
||||
%c0 = arith.constant 0 : index
|
||||
%c128 = arith.constant 128 : index
|
||||
affine.for %i0 = 0 to 8 {
|
||||
affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32>
|
||||
affine.dma_wait %2[%c0], %c128 : memref<1xf32>
|
||||
%3 = affine.load %1[%i0] : memref<32xf32, 1>
|
||||
%4 = "compute"(%3) : (f32) -> f32
|
||||
affine.store %4, %1[%i0] : memref<32xf32, 1>
|
||||
}
|
||||
return
|
||||
}
|
||||
```
|
||||
|
||||
Output
|
||||
|
||||
```mlir
|
||||
module {
|
||||
func @pipelinedatatransfer() {
|
||||
%c8 = arith.constant 8 : index
|
||||
%c0 = arith.constant 0 : index
|
||||
%0 = memref.alloc() : memref<256xf32>
|
||||
%c0_0 = arith.constant 0 : index
|
||||
%c128 = arith.constant 128 : index
|
||||
%1 = memref.alloc() : memref<2x32xf32, 1>
|
||||
%2 = memref.alloc() : memref<2x1xf32>
|
||||
affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
|
||||
affine.for %arg0 = 1 to 8 {
|
||||
affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
|
||||
%8 = affine.apply #map3(%arg0)
|
||||
%9 = affine.apply #map4(%8)
|
||||
%10 = affine.apply #map4(%8)
|
||||
affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
|
||||
%11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1>
|
||||
%12 = "compute"(%11) : (f32) -> f32
|
||||
affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1>
|
||||
}
|
||||
%3 = affine.apply #map3(%c8)
|
||||
%4 = affine.apply #map4(%3)
|
||||
%5 = affine.apply #map4(%3)
|
||||
affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
|
||||
%6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1>
|
||||
%7 = "compute"(%6) : (f32) -> f32
|
||||
affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1>
|
||||
memref.dealloc %2 : memref<2x1xf32>
|
||||
memref.dealloc %1 : memref<2x32xf32, 1>
|
||||
return
|
||||
}
|
||||
}
|
||||
```
|
||||
}];
|
||||
let constructor = "mlir::createPipelineDataTransferPass()";
|
||||
}
|
||||
|
||||
def Canonicalizer : Pass<"canonicalize"> {
|
||||
let summary = "Canonicalize operations";
|
||||
let description = [{
|
||||
@@ -339,34 +138,11 @@ def LocationSnapshot : Pass<"snapshot-op-locations"> {
|
||||
];
|
||||
}
|
||||
|
||||
def LoopCoalescing : Pass<"loop-coalescing", "FuncOp"> {
|
||||
let summary = "Coalesce nested loops with independent bounds into a single "
|
||||
"loop";
|
||||
let constructor = "mlir::createLoopCoalescingPass()";
|
||||
let dependentDialects = ["arith::ArithmeticDialect"];
|
||||
}
|
||||
|
||||
def LoopInvariantCodeMotion : Pass<"loop-invariant-code-motion"> {
|
||||
let summary = "Hoist loop invariant instructions outside of the loop";
|
||||
let constructor = "mlir::createLoopInvariantCodeMotionPass()";
|
||||
}
|
||||
|
||||
def ParallelLoopCollapsing : Pass<"parallel-loop-collapsing"> {
|
||||
let summary = "Collapse parallel loops to use less induction variables";
|
||||
let constructor = "mlir::createParallelLoopCollapsingPass()";
|
||||
let options = [
|
||||
ListOption<"clCollapsedIndices0", "collapsed-indices-0", "unsigned",
|
||||
"Which loop indices to combine 0th loop index",
|
||||
"llvm::cl::MiscFlags::CommaSeparated">,
|
||||
ListOption<"clCollapsedIndices1", "collapsed-indices-1", "unsigned",
|
||||
"Which loop indices to combine into the position 1 loop index",
|
||||
"llvm::cl::MiscFlags::CommaSeparated">,
|
||||
ListOption<"clCollapsedIndices2", "collapsed-indices-2", "unsigned",
|
||||
"Which loop indices to combine into the position 2 loop index",
|
||||
"llvm::cl::MiscFlags::CommaSeparated">,
|
||||
];
|
||||
}
|
||||
|
||||
def PrintOpStats : Pass<"print-op-stats"> {
|
||||
let summary = "Print statistics of operations";
|
||||
let constructor = "mlir::createPrintOpStatsPass()";
|
||||
|
||||
@@ -1,200 +0,0 @@
|
||||
//===- Utils.h - General transformation utilities ---------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This header file defines prototypes for various transformation utilities for
|
||||
// memref's and non-loop IR structures. These are not passes by themselves but
|
||||
// are used either by passes, optimization sequences, or in turn by other
|
||||
// transformation utilities.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef MLIR_TRANSFORMS_UTILS_H
|
||||
#define MLIR_TRANSFORMS_UTILS_H
|
||||
|
||||
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
||||
#include "mlir/IR/AffineMap.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
|
||||
namespace mlir {
|
||||
|
||||
class AffineApplyOp;
|
||||
class AffineForOp;
|
||||
class DominanceInfo;
|
||||
class Location;
|
||||
class OpBuilder;
|
||||
|
||||
namespace memref {
|
||||
class AllocOp;
|
||||
} // namespace memref
|
||||
|
||||
/// Replaces all "dereferencing" uses of `oldMemRef` with `newMemRef` while
|
||||
/// optionally remapping the old memref's indices using the supplied affine map,
|
||||
/// `indexRemap`. The new memref could be of a different shape or rank.
|
||||
/// `extraIndices` provides any additional access indices to be added to the
|
||||
/// start.
|
||||
///
|
||||
/// `indexRemap` remaps indices of the old memref access to a new set of indices
|
||||
/// that are used to index the memref. Additional input operands to indexRemap
|
||||
/// can be optionally provided in `extraOperands`, and they occupy the start
|
||||
/// of its input list. `indexRemap`'s dimensional inputs are expected to
|
||||
/// correspond to memref's indices, and its symbolic inputs if any should be
|
||||
/// provided in `symbolOperands`.
|
||||
///
|
||||
/// `domOpFilter`, if non-null, restricts the replacement to only those
|
||||
/// operations that are dominated by the former; similarly, `postDomOpFilter`
|
||||
/// restricts replacement to only those operations that are postdominated by it.
|
||||
///
|
||||
/// 'allowNonDereferencingOps', if set, allows replacement of non-dereferencing
|
||||
/// uses of a memref without any requirement for access index rewrites as long
|
||||
/// as the user operation has the MemRefsNormalizable trait. The default value
|
||||
/// of this flag is false.
|
||||
///
|
||||
/// 'replaceInDeallocOp', if set, lets DeallocOp, a non-dereferencing user, to
|
||||
/// also be a candidate for replacement. The default value of this flag is
|
||||
/// false.
|
||||
///
|
||||
/// Returns true on success and false if the replacement is not possible,
|
||||
/// whenever a memref is used as an operand in a non-dereferencing context and
|
||||
/// 'allowNonDereferencingOps' is false, except for dealloc's on the memref
|
||||
/// which are left untouched. See comments at function definition for an
|
||||
/// example.
|
||||
//
|
||||
// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
|
||||
// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
|
||||
// index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap = (d0,
|
||||
// d1, d2) -> (d0 - d1, d2), and %ii will be the extra operand. Without any
|
||||
// extra operands, note that 'indexRemap' would just be applied to existing
|
||||
// indices (%i, %j).
|
||||
// TODO: allow extraIndices to be added at any position.
|
||||
LogicalResult replaceAllMemRefUsesWith(
|
||||
Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices = {},
|
||||
AffineMap indexRemap = AffineMap(), ArrayRef<Value> extraOperands = {},
|
||||
ArrayRef<Value> symbolOperands = {}, Operation *domOpFilter = nullptr,
|
||||
Operation *postDomOpFilter = nullptr, bool allowNonDereferencingOps = false,
|
||||
bool replaceInDeallocOp = false);
|
||||
|
||||
/// Performs the same replacement as the other version above but only for the
|
||||
/// dereferencing uses of `oldMemRef` in `op`, except in cases where
|
||||
/// 'allowNonDereferencingOps' is set to true where we replace the
|
||||
/// non-dereferencing uses as well.
|
||||
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
|
||||
Operation *op,
|
||||
ArrayRef<Value> extraIndices = {},
|
||||
AffineMap indexRemap = AffineMap(),
|
||||
ArrayRef<Value> extraOperands = {},
|
||||
ArrayRef<Value> symbolOperands = {},
|
||||
bool allowNonDereferencingOps = false);
|
||||
|
||||
/// Rewrites the memref defined by this alloc op to have an identity layout map
|
||||
/// and updates all its indexing uses. Returns failure if any of its uses
|
||||
/// escape (while leaving the IR in a valid state).
|
||||
LogicalResult normalizeMemRef(memref::AllocOp *op);
|
||||
|
||||
/// Uses the old memref type map layout and computes the new memref type to have
|
||||
/// a new shape and a layout map, where the old layout map has been normalized
|
||||
/// to an identity layout map. It returns the old memref in case no
|
||||
/// normalization was needed or a failure occurs while transforming the old map
|
||||
/// layout to an identity layout map.
|
||||
MemRefType normalizeMemRefType(MemRefType memrefType, OpBuilder builder,
|
||||
unsigned numSymbolicOperands);
|
||||

/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
/// its results equal to the number of operands, as a composition
/// of all other AffineApplyOps reachable from input parameter 'operands'. If
/// different operands were drawing results from multiple affine apply ops,
/// these will also be collected into a single (multi-result) affine apply op.
/// The final results of the composed AffineApplyOp are returned in output
/// parameter 'results'. Returns the affine apply op created.
Operation *createComposedAffineApplyOp(OpBuilder &builder, Location loc,
ArrayRef<Value> operands,
ArrayRef<Operation *> affineApplyOps,
SmallVectorImpl<Value> *results);

/// Given an operation, inserts one or more single result affine apply
/// operations, results of which are exclusively used by this operation.
/// The operands of these newly created affine apply ops are
/// guaranteed to be loop iterators or terminal symbols of a function.
///
/// Before
///
/// affine.for %i = 0 to #map(%N)
///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
///   send %A[%idx], ...
///   %v = "compute"(%idx, ...)
///
/// After
///
/// affine.for %i = 0 to #map(%N)
///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
///   send %A[%idx], ...
///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
///   %v = "compute"(%idx_, ...)

/// This allows the application of different transformations on send and
/// compute (e.g., different shifts/delays).
///
/// Fills `sliceOps` with the list of affine.apply operations.
/// In the following cases, `sliceOps` remains empty:
/// 1. If none of opInst's operands were the result of an affine.apply
/// (i.e., there was no affine computation slice to create).
/// 2. If all the affine.apply op's supplying operands to this opInst did not
/// have any uses other than those in this opInst.
void createAffineComputationSlice(Operation *opInst,
SmallVectorImpl<AffineApplyOp> *sliceOps);
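A hedged sketch of driving the slicing utility; the helper name is hypothetical and `opInst` is assumed to index a memref through affine.apply results:

```cpp
// Create a private affine.apply slice for `opInst` and report whether any
// affine.apply ops were actually produced.
static bool sliceAffineComputation(Operation *opInst) {
  SmallVector<AffineApplyOp, 4> sliceOps;
  createAffineComputationSlice(opInst, &sliceOps);
  // `sliceOps` stays empty in the two cases documented above.
  return !sliceOps.empty();
}
```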

/// Given a list of regions, perform control flow sinking on them. For each
/// region, control-flow sinking moves operations that dominate the region but
/// whose only users are in the region into the regions so that they aren't
/// executed on paths where their results are not needed.
///
/// TODO: For the moment, this is a *simple* control-flow sink, i.e., no
/// duplicating of ops. It should be made to accept a cost model to determine
/// whether duplicating a particular op is profitable.
///
/// Example:
///
/// ```mlir
/// %0 = arith.addi %arg0, %arg1
/// scf.if %cond {
///   scf.yield %0
/// } else {
///   scf.yield %arg2
/// }
/// ```
///
/// After control-flow sink:
///
/// ```mlir
/// scf.if %cond {
///   %0 = arith.addi %arg0, %arg1
///   scf.yield %0
/// } else {
///   scf.yield %arg2
/// }
/// ```
///
/// Users must supply a callback `shouldMoveIntoRegion` that determines whether
/// the given operation that only has users in the given region should be
/// moved into that region.
///
/// Returns the number of operations sunk.
size_t
controlFlowSink(ArrayRef<Region *> regions, DominanceInfo &domInfo,
function_ref<bool(Operation *, Region *)> shouldMoveIntoRegion);
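A hedged sketch of combining this entry point with `getSinglyExecutedRegionsToSink`, declared just below; `branch` and `domInfo` are assumed to be supplied by the enclosing pass, and the callback shown sinks any side-effect-free op where a real client might plug in a cost model:

```cpp
// Sink side-effect-free ops into the singly-executed regions of `branch`.
static size_t sinkIntoSinglyExecutedRegions(RegionBranchOpInterface branch,
                                            DominanceInfo &domInfo) {
  SmallVector<Region *> regionsToSink;
  getSinglyExecutedRegionsToSink(branch, regionsToSink);
  return controlFlowSink(regionsToSink, domInfo,
                         [](Operation *op, Region *) {
                           return MemoryEffectOpInterface::hasNoEffect(op);
                         });
}
```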

/// Populates `regions` with regions of the provided region branch op that are
/// executed at most once and that are reachable given the current operands of
/// the op. These regions can be passed to `controlFlowSink` to perform sinking
/// on the regions of the operation.
void getSinglyExecutedRegionsToSink(RegionBranchOpInterface branch,
SmallVectorImpl<Region *> &regions);

} // namespace mlir

#endif // MLIR_TRANSFORMS_UTILS_H
|
||||
@@ -27,7 +27,6 @@
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "mlir/Transforms/DialectConversion.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
#include "mlir/Transforms/RegionUtils.h"
|
||||
#include "llvm/ADT/Sequence.h"
|
||||
|
||||
@@ -23,7 +23,6 @@
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/Transforms/DialectConversion.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
#include "mlir/Transforms/Utils.h"
|
||||
|
||||
using namespace mlir;
|
||||
using namespace mlir::scf;
|
||||
|
||||
@@ -33,7 +33,6 @@
|
||||
#include "mlir/Support/MathExtras.h"
|
||||
#include "mlir/Transforms/DialectConversion.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
#include "mlir/Transforms/Utils.h"
|
||||
#include "llvm/ADT/TypeSwitch.h"
|
||||
#include "llvm/IR/DerivedTypes.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
|
||||
#include "../PassDetail.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/Dialect/SCF/SCF.h"
|
||||
|
||||
@@ -22,12 +22,12 @@
|
||||
#include "PassDetail.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/Affine/Passes.h"
|
||||
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
||||
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "llvm/ADT/MapVector.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
@@ -17,13 +17,13 @@
|
||||
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/Affine/Passes.h"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
|
||||
#include "mlir/IR/AffineExpr.h"
|
||||
#include "mlir/IR/AffineMap.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "mlir/Transforms/Utils.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/DenseSet.h"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
|
||||
@@ -18,10 +18,10 @@
|
||||
#include "mlir/Dialect/Affine/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/Affine/Passes.h"
|
||||
#include "mlir/Dialect/Affine/Passes.h.inc"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include <deque>
|
||||
|
||||
|
||||
@@ -4,9 +4,12 @@ add_mlir_dialect_library(MLIRAffineTransforms
|
||||
AffineLoopNormalize.cpp
|
||||
AffineParallelize.cpp
|
||||
AffineScalarReplacement.cpp
|
||||
LoopCoalescing.cpp
|
||||
LoopFusion.cpp
|
||||
LoopTiling.cpp
|
||||
LoopUnroll.cpp
|
||||
LoopUnrollAndJam.cpp
|
||||
PipelineDataTransfer.cpp
|
||||
SuperVectorize.cpp
|
||||
SimplifyAffineStructures.cpp
|
||||
|
||||
|
||||
@@ -8,9 +8,10 @@
|
||||
|
||||
#include "PassDetail.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
|
||||
#include "mlir/Dialect/SCF/SCF.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "mlir/Dialect/SCF/Utils.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
#include "mlir/Transforms/RegionUtils.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
@@ -16,14 +16,14 @@
|
||||
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/LoopFusionUtils.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/IR/AffineExpr.h"
|
||||
#include "mlir/IR/AffineMap.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/Transforms/LoopFusionUtils.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
#include "mlir/Transforms/Utils.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/DenseSet.h"
|
||||
#include "llvm/ADT/SetVector.h"
|
||||
@@ -17,11 +17,11 @@
|
||||
#include "mlir/Dialect/Affine/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/Affine/Passes.h"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/IR/BlockAndValueMapping.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "mlir/Transforms/Utils.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
using namespace mlir;
|
||||
|
||||
@@ -12,11 +12,11 @@
|
||||
#include "PassDetail.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/Affine/Passes.h"
|
||||
#include "mlir/IR/AffineExpr.h"
|
||||
#include "mlir/IR/AffineMap.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
@@ -37,12 +37,12 @@
|
||||
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/Affine/Passes.h"
|
||||
#include "mlir/IR/AffineExpr.h"
|
||||
#include "mlir/IR/AffineMap.h"
|
||||
#include "mlir/IR/BlockAndValueMapping.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#ifndef DIALECT_AFFINE_TRANSFORMS_PASSDETAIL_H_
|
||||
#define DIALECT_AFFINE_TRANSFORMS_PASSDETAIL_H_
|
||||
|
||||
#include "mlir/Dialect/Affine/Passes.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
|
||||
namespace mlir {
|
||||
@@ -16,6 +17,10 @@ namespace mlir {
|
||||
template <typename ConcreteDialect>
|
||||
void registerDialect(DialectRegistry &registry);
|
||||
|
||||
namespace arith {
|
||||
class ArithmeticDialect;
|
||||
} // namespace arith
|
||||
|
||||
namespace linalg {
|
||||
class LinalgDialect;
|
||||
} // namespace linalg
|
||||
|
||||
@@ -11,17 +11,16 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "PassDetail.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
|
||||
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/Dialect/StandardOps/Utils/Utils.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "mlir/Transforms/Utils.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
@@ -14,9 +14,9 @@
|
||||
#include "mlir/Dialect/Affine/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/Passes.h"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/IR/IntegerSet.h"
|
||||
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||||
#include "mlir/Transforms/Utils.h"
|
||||
|
||||
#define DEBUG_TYPE "simplify-affine-structure"
|
||||
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
add_mlir_dialect_library(MLIRAffineUtils
|
||||
LoopFusionUtils.cpp
|
||||
LoopUtils.cpp
|
||||
Utils.cpp
|
||||
|
||||
ADDITIONAL_HEADER_DIRS
|
||||
@@ -7,5 +9,6 @@ add_mlir_dialect_library(MLIRAffineUtils
|
||||
LINK_LIBS PUBLIC
|
||||
MLIRAffine
|
||||
MLIRAnalysis
|
||||
MLIRMemRef
|
||||
MLIRTransformUtils
|
||||
)
|
||||
|
||||
@@ -10,21 +10,20 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Transforms/LoopFusionUtils.h"
|
||||
|
||||
#include "mlir/Dialect/Affine/LoopFusionUtils.h"
|
||||
#include "mlir/Analysis/SliceAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/IR/AffineExpr.h"
|
||||
#include "mlir/IR/AffineMap.h"
|
||||
#include "mlir/IR/BlockAndValueMapping.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/IR/BuiltinOps.h"
|
||||
#include "mlir/IR/Operation.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
@@ -10,14 +10,14 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Analysis/SliceAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/Dialect/SCF/SCF.h"
|
||||
#include "mlir/IR/BlockAndValueMapping.h"
|
||||
@@ -25,7 +25,6 @@
|
||||
#include "mlir/Support/MathExtras.h"
|
||||
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||||
#include "mlir/Transforms/RegionUtils.h"
|
||||
#include "mlir/Transforms/Utils.h"
|
||||
#include "llvm/ADT/MapVector.h"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
@@ -108,42 +107,9 @@ getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
|
||||
lb.erase();
|
||||
}
|
||||
|
||||
// Build the IR that performs ceil division of a positive value by a constant:
//    ceildiv(a, B) = divis(a + (B-1), B)
// where divis is rounding-to-zero division.
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
int64_t divisor) {
assert(divisor > 0 && "expected positive divisor");
assert(dividend.getType().isIndex() && "expected index-typed value");

Value divisorMinusOneCst =
builder.create<arith::ConstantIndexOp>(loc, divisor - 1);
Value divisorCst = builder.create<arith::ConstantIndexOp>(loc, divisor);
Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
return builder.create<arith::DivSIOp>(loc, sum, divisorCst);
}

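A standalone check (plain C++, not MLIR) of the identity used above for positive operands; the helper name is hypothetical and only mirrors the arith ops built by `ceilDivPositive`:

```cpp
#include <cstdint>

// Rounding-to-zero division of a + (B - 1) by B equals ceildiv(a, B).
static constexpr std::int64_t ceilDivRef(std::int64_t a, std::int64_t b) {
  return (a + b - 1) / b;
}
static_assert(ceilDivRef(10, 4) == 3, "ceildiv(10, 4) == 3");
static_assert(ceilDivRef(8, 4) == 2, "exact multiples are unchanged");
```
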
// Build the IR that performs ceil division of a positive value by another
|
||||
// positive value:
|
||||
// ceildiv(a, b) = divis(a + (b - 1), b)
|
||||
// where divis is rounding-to-zero division.
|
||||
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
|
||||
Value divisor) {
|
||||
assert(dividend.getType().isIndex() && "expected index-typed value");
|
||||
|
||||
Value cstOne = builder.create<arith::ConstantIndexOp>(loc, 1);
|
||||
Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne);
|
||||
Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne);
|
||||
return builder.create<arith::DivSIOp>(loc, sum, divisor);
|
||||
}
|
||||
|
||||
/// Helper to replace uses of loop carried values (iter_args) and loop
|
||||
/// yield values while promoting single iteration affine.for and scf.for ops.
|
||||
template <typename AffineOrSCFForOp>
|
||||
static void replaceIterArgsAndYieldResults(AffineOrSCFForOp forOp) {
|
||||
static_assert(
|
||||
llvm::is_one_of<AffineOrSCFForOp, AffineForOp, scf::ForOp>::value,
|
||||
"only for affine.for and scf.for ops");
|
||||
/// yield values while promoting single iteration affine.for ops.
|
||||
static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
|
||||
// Replace uses of iter arguments with iter operands (initial values).
|
||||
auto iterOperands = forOp.getIterOperands();
|
||||
auto iterArgs = forOp.getRegionIterArgs();
|
||||
@@ -203,46 +169,6 @@ LogicalResult mlir::promoteIfSingleIteration(AffineForOp forOp) {
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Promotes the loop body of a forOp to its containing block if it can be
/// determined that the loop has a single iteration.
|
||||
LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) {
|
||||
auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
|
||||
auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
|
||||
auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
|
||||
if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 ||
|
||||
ubCstOp.value() < 0 || stepCstOp.value() < 0)
|
||||
return failure();
|
||||
int64_t tripCount =
|
||||
mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value());
|
||||
if (tripCount != 1)
|
||||
return failure();
|
||||
auto iv = forOp.getInductionVar();
|
||||
iv.replaceAllUsesWith(lbCstOp);
|
||||
|
||||
replaceIterArgsAndYieldResults(forOp);
|
||||
|
||||
// Move the loop body operations, except for its terminator, to the loop's
|
||||
// containing block.
|
||||
auto *parentBlock = forOp->getBlock();
|
||||
forOp.getBody()->getTerminator()->erase();
|
||||
parentBlock->getOperations().splice(Block::iterator(forOp),
|
||||
forOp.getBody()->getOperations());
|
||||
forOp.erase();
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Promotes all single iteration 'for' ops in `f`, i.e., moves
|
||||
/// their body into the containing Block.
|
||||
void mlir::promoteSingleIterationLoops(FuncOp f) {
|
||||
// Gathers all innermost loops through a post order pruned walk.
|
||||
f.walk([](Operation *op) {
|
||||
if (auto forOp = dyn_cast<AffineForOp>(op))
|
||||
(void)promoteIfSingleIteration(forOp);
|
||||
else if (auto forOp = dyn_cast<scf::ForOp>(op))
|
||||
(void)promoteIfSingleIteration(forOp);
|
||||
});
|
||||
}
|
||||
|
||||
/// Generates an affine.for op with the specified lower and upper bounds
|
||||
/// while generating the right IV remappings to realize shifts for operations in
|
||||
/// its body. The operations that go into the loop body are specified in
|
||||
@@ -1011,38 +937,22 @@ mlir::tilePerfectlyNestedParametric(MutableArrayRef<AffineForOp> input,
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Collect perfectly nested loops starting from `rootForOps`. Loops are
|
||||
/// perfectly nested if each loop is the first and only non-terminator operation
|
||||
/// in the parent loop. Collect at most `maxLoops` loops and append them to
|
||||
/// `forOps`.
|
||||
template <typename T>
|
||||
static void getPerfectlyNestedLoopsImpl(
|
||||
SmallVectorImpl<T> &forOps, T rootForOp,
|
||||
unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
|
||||
for (unsigned i = 0; i < maxLoops; ++i) {
|
||||
forOps.push_back(rootForOp);
|
||||
Block &body = rootForOp.getRegion().front();
|
||||
if (body.begin() != std::prev(body.end(), 2))
|
||||
return;
|
||||
|
||||
rootForOp = dyn_cast<T>(&body.front());
|
||||
if (!rootForOp)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/// Get perfectly nested sequence of loops starting at root of loop nest
|
||||
/// (the first op being another AffineFor, and the second op - a terminator).
|
||||
/// A loop is perfectly nested iff: the first op in the loop's body is another
|
||||
/// AffineForOp, and the second op is a terminator).
|
||||
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
|
||||
AffineForOp root) {
|
||||
getPerfectlyNestedLoopsImpl(nestedLoops, root);
|
||||
}
|
||||
for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
|
||||
nestedLoops.push_back(root);
|
||||
Block &body = root.getRegion().front();
|
||||
if (body.begin() != std::prev(body.end(), 2))
|
||||
return;
|
||||
|
||||
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
|
||||
scf::ForOp root) {
|
||||
getPerfectlyNestedLoopsImpl(nestedLoops, root);
|
||||
root = dyn_cast<AffineForOp>(&body.front());
|
||||
if (!root)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/// Identify valid and profitable bands of loops to tile. This is currently just
|
||||
@@ -1084,10 +994,10 @@ LogicalResult mlir::loopUnrollUpToFactor(AffineForOp forOp,
|
||||
return loopUnrollByFactor(forOp, unrollFactor);
|
||||
}
|
||||
|
||||
/// Generates unrolled copies of AffineForOp or scf::ForOp 'loopBodyBlock', with
|
||||
/// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap
|
||||
/// 'forOpIV' for each unrolled body. If specified, annotates the Ops in each
|
||||
/// unrolled iteration using annotateFn.
|
||||
/// Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated
|
||||
/// 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 'forOpIV' for each
|
||||
/// unrolled body. If specified, annotates the Ops in each unrolled iteration
|
||||
/// using annotateFn.
|
||||
static void generateUnrolledLoop(
|
||||
Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
|
||||
function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
|
||||
@@ -1237,127 +1147,6 @@ LogicalResult mlir::loopUnrollByFactor(
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled.
|
||||
LogicalResult mlir::loopUnrollByFactor(
|
||||
scf::ForOp forOp, uint64_t unrollFactor,
|
||||
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
|
||||
assert(unrollFactor > 0 && "expected positive unroll factor");
|
||||
|
||||
// Return if the loop body is empty.
|
||||
if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
|
||||
return success();
|
||||
|
||||
// Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
|
||||
// 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
|
||||
OpBuilder boundsBuilder(forOp);
|
||||
auto loc = forOp.getLoc();
|
||||
auto step = forOp.getStep();
|
||||
Value upperBoundUnrolled;
|
||||
Value stepUnrolled;
|
||||
bool generateEpilogueLoop = true;
|
||||
|
||||
auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
|
||||
auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
|
||||
auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
|
||||
if (lbCstOp && ubCstOp && stepCstOp) {
|
||||
// Constant loop bounds computation.
|
||||
int64_t lbCst = lbCstOp.value();
|
||||
int64_t ubCst = ubCstOp.value();
|
||||
int64_t stepCst = stepCstOp.value();
|
||||
assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 &&
|
||||
"expected positive loop bounds and step");
|
||||
int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst);
|
||||
|
||||
if (unrollFactor == 1) {
|
||||
if (tripCount == 1 && failed(promoteIfSingleIteration(forOp)))
|
||||
return failure();
|
||||
return success();
|
||||
}
|
||||
|
||||
int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor);
|
||||
int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
|
||||
assert(upperBoundUnrolledCst <= ubCst);
|
||||
int64_t stepUnrolledCst = stepCst * unrollFactor;
|
||||
|
||||
// Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
|
||||
generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
|
||||
if (generateEpilogueLoop)
|
||||
upperBoundUnrolled = boundsBuilder.create<arith::ConstantIndexOp>(
|
||||
loc, upperBoundUnrolledCst);
|
||||
else
|
||||
upperBoundUnrolled = ubCstOp;
|
||||
|
||||
// Create constant for 'stepUnrolled'.
|
||||
stepUnrolled = stepCst == stepUnrolledCst
|
||||
? step
|
||||
: boundsBuilder.create<arith::ConstantIndexOp>(
|
||||
loc, stepUnrolledCst);
|
||||
} else {
|
||||
// Dynamic loop bounds computation.
|
||||
// TODO: Add dynamic asserts for negative lb/ub/step, or
|
||||
// consider using ceilDiv from AffineApplyExpander.
|
||||
auto lowerBound = forOp.getLowerBound();
|
||||
auto upperBound = forOp.getUpperBound();
|
||||
Value diff =
|
||||
boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
|
||||
Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
|
||||
Value unrollFactorCst =
|
||||
boundsBuilder.create<arith::ConstantIndexOp>(loc, unrollFactor);
|
||||
Value tripCountRem =
|
||||
boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
|
||||
// Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
|
||||
Value tripCountEvenMultiple =
|
||||
boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem);
|
||||
// Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
|
||||
upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>(
|
||||
loc, lowerBound,
|
||||
boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
|
||||
// Scale 'step' by 'unrollFactor'.
|
||||
stepUnrolled =
|
||||
boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst);
|
||||
}
|
||||
|
||||
// Create epilogue clean up loop starting at 'upperBoundUnrolled'.
|
||||
if (generateEpilogueLoop) {
|
||||
OpBuilder epilogueBuilder(forOp->getContext());
|
||||
epilogueBuilder.setInsertionPoint(forOp->getBlock(),
|
||||
std::next(Block::iterator(forOp)));
|
||||
auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
|
||||
epilogueForOp.setLowerBound(upperBoundUnrolled);
|
||||
|
||||
// Update uses of loop results.
|
||||
auto results = forOp.getResults();
|
||||
auto epilogueResults = epilogueForOp.getResults();
|
||||
auto epilogueIterOperands = epilogueForOp.getIterOperands();
|
||||
|
||||
for (auto e : llvm::zip(results, epilogueResults, epilogueIterOperands)) {
|
||||
std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
|
||||
epilogueForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
|
||||
}
|
||||
(void)promoteIfSingleIteration(epilogueForOp);
|
||||
}
|
||||
|
||||
// Create unrolled loop.
|
||||
forOp.setUpperBound(upperBoundUnrolled);
|
||||
forOp.setStep(stepUnrolled);
|
||||
|
||||
auto iterArgs = ValueRange(forOp.getRegionIterArgs());
|
||||
auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
|
||||
|
||||
generateUnrolledLoop(
|
||||
forOp.getBody(), forOp.getInductionVar(), unrollFactor,
|
||||
[&](unsigned i, Value iv, OpBuilder b) {
|
||||
// iv' = iv + step * i;
|
||||
auto stride = b.create<arith::MulIOp>(
|
||||
loc, step, b.create<arith::ConstantIndexOp>(loc, i));
|
||||
return b.create<arith::AddIOp>(loc, iv, stride);
|
||||
},
|
||||
annotateFn, iterArgs, yieldedValues);
|
||||
// Promote the loop body up if this has turned into a single iteration loop.
|
||||
(void)promoteIfSingleIteration(forOp);
|
||||
return success();
|
||||
}
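A hedged usage sketch of this scf variant (which, with this change, lives with the SCF utilities rather than in Transforms); `unrollByFour` is a hypothetical helper:

```cpp
// Unroll an scf.for by a fixed factor and propagate failure. The annotation
// callback is left at its nullptr default.
static LogicalResult unrollByFour(scf::ForOp forOp) {
  return loopUnrollByFactor(forOp, /*unrollFactor=*/4);
}
```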
|
||||
|
||||
LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp,
|
||||
uint64_t unrollJamFactor) {
|
||||
Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
|
||||
@@ -1888,61 +1677,25 @@ stripmineSink(AffineForOp forOp, uint64_t factor,
|
||||
return innerLoops;
|
||||
}
|
||||
|
||||
static Loops stripmineSink(scf::ForOp forOp, Value factor,
|
||||
ArrayRef<scf::ForOp> targets) {
|
||||
auto originalStep = forOp.getStep();
|
||||
auto iv = forOp.getInductionVar();
|
||||
|
||||
OpBuilder b(forOp);
|
||||
forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));
|
||||
|
||||
Loops innerLoops;
|
||||
for (auto t : targets) {
|
||||
// Save information for splicing ops out of t when done
|
||||
auto begin = t.getBody()->begin();
|
||||
auto nOps = t.getBody()->getOperations().size();
|
||||
|
||||
// Insert newForOp before the terminator of `t`.
|
||||
auto b = OpBuilder::atBlockTerminator((t.getBody()));
|
||||
Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
|
||||
Value less = b.create<arith::CmpIOp>(t.getLoc(), arith::CmpIPredicate::slt,
|
||||
forOp.getUpperBound(), stepped);
|
||||
Value ub =
|
||||
b.create<SelectOp>(t.getLoc(), less, forOp.getUpperBound(), stepped);
|
||||
|
||||
// Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
|
||||
auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
|
||||
newForOp.getBody()->getOperations().splice(
|
||||
newForOp.getBody()->getOperations().begin(),
|
||||
t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
|
||||
replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
|
||||
newForOp.getRegion());
|
||||
|
||||
innerLoops.push_back(newForOp);
|
||||
}
|
||||
|
||||
return innerLoops;
|
||||
}
|
||||
|
||||
// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
|
||||
// Returns the new AffineForOps, nested immediately under `target`.
|
||||
template <typename ForType, typename SizeType>
|
||||
static ForType stripmineSink(ForType forOp, SizeType factor, ForType target) {
|
||||
template <typename SizeType>
|
||||
static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
|
||||
AffineForOp target) {
|
||||
// TODO: Use cheap structural assertions that targets are nested under
|
||||
// forOp and that targets are not nested under each other when DominanceInfo
|
||||
// exposes the capability. It seems overkill to construct a whole function
|
||||
// dominance tree at this point.
|
||||
auto res = stripmineSink(forOp, factor, ArrayRef<ForType>{target});
|
||||
auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
|
||||
assert(res.size() == 1 && "Expected 1 inner forOp");
|
||||
return res[0];
|
||||
}
|
||||
|
||||
template <typename ForType, typename SizeType>
|
||||
static SmallVector<SmallVector<ForType, 8>, 8>
|
||||
tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes,
|
||||
ArrayRef<ForType> targets) {
|
||||
SmallVector<SmallVector<ForType, 8>, 8> res;
|
||||
SmallVector<ForType, 8> currentTargets(targets.begin(), targets.end());
|
||||
SmallVector<SmallVector<AffineForOp, 8>, 8>
|
||||
mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
|
||||
ArrayRef<AffineForOp> targets) {
|
||||
SmallVector<SmallVector<AffineForOp, 8>, 8> res;
|
||||
SmallVector<AffineForOp, 8> currentTargets(targets.begin(), targets.end());
|
||||
for (auto it : llvm::zip(forOps, sizes)) {
|
||||
auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
|
||||
res.push_back(step);
|
||||
@@ -1951,288 +1704,17 @@ tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes,
|
||||
return res;
|
||||
}
|
||||
|
||||
SmallVector<SmallVector<AffineForOp, 8>, 8>
|
||||
mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
|
||||
ArrayRef<AffineForOp> targets) {
|
||||
return tileImpl(forOps, sizes, targets);
|
||||
}
|
||||
|
||||
SmallVector<Loops, 8> mlir::tile(ArrayRef<scf::ForOp> forOps,
|
||||
ArrayRef<Value> sizes,
|
||||
ArrayRef<scf::ForOp> targets) {
|
||||
return tileImpl(forOps, sizes, targets);
|
||||
}
|
||||
|
||||
template <typename ForType, typename SizeType>
|
||||
static SmallVector<ForType, 8>
|
||||
tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes, ForType target) {
|
||||
SmallVector<ForType, 8> res;
|
||||
for (auto loops : tile(forOps, sizes, ArrayRef<ForType>{target})) {
|
||||
SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
|
||||
ArrayRef<uint64_t> sizes,
|
||||
AffineForOp target) {
|
||||
SmallVector<AffineForOp, 8> res;
|
||||
for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target))) {
|
||||
assert(loops.size() == 1);
|
||||
res.push_back(loops[0]);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
|
||||
ArrayRef<uint64_t> sizes,
|
||||
AffineForOp target) {
|
||||
return tileImpl(forOps, sizes, target);
|
||||
}
|
||||
|
||||
Loops mlir::tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
|
||||
scf::ForOp target) {
|
||||
return tileImpl(forOps, sizes, target);
|
||||
}
|
||||
|
||||
Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
|
||||
// Collect perfectly nested loops. If more size values provided than nested
|
||||
// loops available, truncate `sizes`.
|
||||
SmallVector<scf::ForOp, 4> forOps;
|
||||
forOps.reserve(sizes.size());
|
||||
getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
|
||||
if (forOps.size() < sizes.size())
|
||||
sizes = sizes.take_front(forOps.size());
|
||||
|
||||
return ::tile(forOps, sizes, forOps.back());
|
||||
}
|
||||
|
||||
// Hoist the ops within `outer` that appear before `inner`.
|
||||
// Such ops include the ops that have been introduced by parametric tiling.
|
||||
// Ops that come from triangular loops (i.e. that belong to the program slice
|
||||
// rooted at `outer`) and ops that have side effects cannot be hoisted.
|
||||
// Return failure when any op fails to hoist.
|
||||
static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
|
||||
SetVector<Operation *> forwardSlice;
|
||||
getForwardSlice(
|
||||
outer.getInductionVar(), &forwardSlice,
|
||||
[&inner](Operation *op) { return op != inner.getOperation(); });
|
||||
LogicalResult status = success();
|
||||
SmallVector<Operation *, 8> toHoist;
|
||||
for (auto &op : outer.getBody()->without_terminator()) {
|
||||
// Stop when encountering the inner loop.
|
||||
if (&op == inner.getOperation())
|
||||
break;
|
||||
// Skip over non-hoistable ops.
|
||||
if (forwardSlice.count(&op) > 0) {
|
||||
status = failure();
|
||||
continue;
|
||||
}
|
||||
// Skip intermediate scf::ForOp, these are not considered a failure.
|
||||
if (isa<scf::ForOp>(op))
|
||||
continue;
|
||||
// Skip other ops with regions.
|
||||
if (op.getNumRegions() > 0) {
|
||||
status = failure();
|
||||
continue;
|
||||
}
|
||||
// Skip if op has side effects.
|
||||
// TODO: loads to immutable memory regions are ok.
|
||||
if (!MemoryEffectOpInterface::hasNoEffect(&op)) {
|
||||
status = failure();
|
||||
continue;
|
||||
}
|
||||
toHoist.push_back(&op);
|
||||
}
|
||||
auto *outerForOp = outer.getOperation();
|
||||
for (auto *op : toHoist)
|
||||
op->moveBefore(outerForOp);
|
||||
return status;
|
||||
}
|
||||
|
||||
// Traverse the interTile and intraTile loops and try to hoist ops such that
|
||||
// bands of perfectly nested loops are isolated.
|
||||
// Return failure if either perfect interTile or perfect intraTile bands cannot
|
||||
// be formed.
|
||||
static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
|
||||
LogicalResult status = success();
|
||||
const Loops &interTile = tileLoops.first;
|
||||
const Loops &intraTile = tileLoops.second;
|
||||
auto size = interTile.size();
|
||||
assert(size == intraTile.size());
|
||||
if (size <= 1)
|
||||
return success();
|
||||
for (unsigned s = 1; s < size; ++s)
|
||||
status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
|
||||
: failure();
|
||||
for (unsigned s = 1; s < size; ++s)
|
||||
status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
|
||||
: failure();
|
||||
return status;
|
||||
}
|
||||
|
||||
TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp,
|
||||
ArrayRef<int64_t> sizes) {
|
||||
// Collect perfectly nested loops. If more size values provided than nested
|
||||
// loops available, truncate `sizes`.
|
||||
SmallVector<scf::ForOp, 4> forOps;
|
||||
forOps.reserve(sizes.size());
|
||||
getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
|
||||
if (forOps.size() < sizes.size())
|
||||
sizes = sizes.take_front(forOps.size());
|
||||
|
||||
// Compute the tile sizes such that i-th outer loop executes size[i]
|
||||
// iterations. Given that the loop currently executes
|
||||
// numIterations = ceildiv((upperBound - lowerBound), step)
|
||||
// iterations, we need to tile with size ceildiv(numIterations, size[i]).
|
||||
SmallVector<Value, 4> tileSizes;
|
||||
tileSizes.reserve(sizes.size());
|
||||
for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
|
||||
assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
|
||||
|
||||
auto forOp = forOps[i];
|
||||
OpBuilder builder(forOp);
|
||||
auto loc = forOp.getLoc();
|
||||
Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(),
|
||||
forOp.getLowerBound());
|
||||
Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
|
||||
Value iterationsPerBlock =
|
||||
ceilDivPositive(builder, loc, numIterations, sizes[i]);
|
||||
tileSizes.push_back(iterationsPerBlock);
|
||||
}
|
||||
|
||||
// Call parametric tiling with the given sizes.
|
||||
auto intraTile = tile(forOps, tileSizes, forOps.back());
|
||||
TileLoops tileLoops = std::make_pair(forOps, intraTile);
|
||||
|
||||
// TODO: for now we just ignore the result of band isolation.
|
||||
// In the future, mapping decisions may be impacted by the ability to
|
||||
// isolate perfectly nested bands.
|
||||
(void)tryIsolateBands(tileLoops);
|
||||
|
||||
return tileLoops;
|
||||
}
|
||||
|
||||
/// Return the new lower bound, upper bound, and step in that order. Insert any
|
||||
/// additional bounds calculations before the given builder and any additional
|
||||
/// conversion back to the original loop induction value inside the given Block.
|
||||
static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
|
||||
OpBuilder &insideLoopBuilder, Location loc,
|
||||
Value lowerBound, Value upperBound, Value step,
|
||||
Value inductionVar) {
|
||||
// Check if the loop is already known to have a constant zero lower bound or
|
||||
// a constant one step.
|
||||
bool isZeroBased = false;
|
||||
if (auto ubCst = lowerBound.getDefiningOp<arith::ConstantIndexOp>())
|
||||
isZeroBased = ubCst.value() == 0;
|
||||
|
||||
bool isStepOne = false;
|
||||
if (auto stepCst = step.getDefiningOp<arith::ConstantIndexOp>())
|
||||
isStepOne = stepCst.value() == 1;
|
||||
|
||||
// Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
|
||||
// assuming the step is strictly positive. Update the bounds and the step
|
||||
// of the loop to go from 0 to the number of iterations, if necessary.
|
||||
// TODO: introduce support for negative steps or emit dynamic asserts
|
||||
// on step positivity, whatever gets implemented first.
|
||||
if (isZeroBased && isStepOne)
|
||||
return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
|
||||
/*step=*/step};
|
||||
|
||||
Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
|
||||
Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
|
||||
|
||||
Value newLowerBound =
|
||||
isZeroBased ? lowerBound
|
||||
: boundsBuilder.create<arith::ConstantIndexOp>(loc, 0);
|
||||
Value newStep =
|
||||
isStepOne ? step : boundsBuilder.create<arith::ConstantIndexOp>(loc, 1);
|
||||
|
||||
// Insert code computing the value of the original loop induction variable
|
||||
// from the "normalized" one.
|
||||
Value scaled =
|
||||
isStepOne
|
||||
? inductionVar
|
||||
: insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step);
|
||||
Value shifted =
|
||||
isZeroBased
|
||||
? scaled
|
||||
: insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound);
|
||||
|
||||
SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
|
||||
shifted.getDefiningOp()};
|
||||
inductionVar.replaceAllUsesExcept(shifted, preserve);
|
||||
return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
|
||||
/*step=*/newStep};
|
||||
}
|
||||
|
||||
/// Transform a loop with a strictly positive step
|
||||
/// for %i = %lb to %ub step %s
|
||||
/// into a 0-based loop with step 1
|
||||
/// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
|
||||
/// %i = %ii * %s + %lb
|
||||
/// Insert the induction variable remapping in the body of `inner`, which is
|
||||
/// expected to be either `loop` or another loop perfectly nested under `loop`.
|
||||
/// Insert the definition of new bounds immediate before `outer`, which is
|
||||
/// expected to be either `loop` or its parent in the loop nest.
|
||||
static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
|
||||
OpBuilder builder(outer);
|
||||
OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
|
||||
auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
|
||||
loop.getLowerBound(), loop.getUpperBound(),
|
||||
loop.getStep(), loop.getInductionVar());
|
||||
|
||||
loop.setLowerBound(loopPieces.lowerBound);
|
||||
loop.setUpperBound(loopPieces.upperBound);
|
||||
loop.setStep(loopPieces.step);
|
||||
}
|
||||
|
||||
void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
|
||||
if (loops.size() < 2)
|
||||
return;
|
||||
|
||||
scf::ForOp innermost = loops.back();
|
||||
scf::ForOp outermost = loops.front();
|
||||
|
||||
// 1. Make sure all loops iterate from 0 to upperBound with step 1. This
|
||||
// allows the following code to assume upperBound is the number of iterations.
|
||||
for (auto loop : loops)
|
||||
normalizeLoop(loop, outermost, innermost);
|
||||
|
||||
// 2. Emit code computing the upper bound of the coalesced loop as product
|
||||
// of the number of iterations of all loops.
|
||||
OpBuilder builder(outermost);
|
||||
Location loc = outermost.getLoc();
|
||||
Value upperBound = outermost.getUpperBound();
|
||||
for (auto loop : loops.drop_front())
|
||||
upperBound =
|
||||
builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
|
||||
outermost.setUpperBound(upperBound);
|
||||
|
||||
builder.setInsertionPointToStart(outermost.getBody());
|
||||
|
||||
// 3. Remap induction variables. For each original loop, the value of the
|
||||
// induction variable can be obtained by dividing the induction variable of
|
||||
// the linearized loop by the total number of iterations of the loops nested
|
||||
// in it modulo the number of iterations in this loop (remove the values
|
||||
// related to the outer loops):
|
||||
// iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
|
||||
// Compute these iteratively from the innermost loop by creating a "running
|
||||
// quotient" of division by the range.
|
||||
Value previous = outermost.getInductionVar();
|
||||
for (unsigned i = 0, e = loops.size(); i < e; ++i) {
|
||||
unsigned idx = loops.size() - i - 1;
|
||||
if (i != 0)
|
||||
previous = builder.create<arith::DivSIOp>(loc, previous,
|
||||
loops[idx + 1].getUpperBound());
|
||||
|
||||
Value iv = (i == e - 1) ? previous
|
||||
: builder.create<arith::RemSIOp>(
|
||||
loc, previous, loops[idx].getUpperBound());
|
||||
replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
|
||||
loops.back().getRegion());
|
||||
}
|
||||
|
||||
// 4. Move the operations from the innermost just above the second-outermost
|
||||
// loop, delete the extra terminator and the second-outermost loop.
|
||||
scf::ForOp second = loops[1];
|
||||
innermost.getBody()->back().erase();
|
||||
outermost.getBody()->getOperations().splice(
|
||||
Block::iterator(second.getOperation()),
|
||||
innermost.getBody()->getOperations());
|
||||
second.erase();
|
||||
}
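A hedged sketch of driving the coalescing utility on an scf.for nest rooted at `root`, using the scf overload of `getPerfectlyNestedLoops` that this change moves alongside the other SCF utilities; `coalescePerfectNest` is a hypothetical helper:

```cpp
// Collect the perfectly nested scf.for band and coalesce it into one loop.
static void coalescePerfectNest(scf::ForOp root) {
  SmallVector<scf::ForOp, 4> band;
  getPerfectlyNestedLoops(band, root);
  if (band.size() > 1)
    coalesceLoops(band);
}
```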
|
||||
|
||||
LogicalResult mlir::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
|
||||
if (loops.size() < 2)
|
||||
return success();
|
||||
@@ -2347,89 +1829,6 @@ LogicalResult mlir::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
|
||||
return success();
|
||||
}
|
||||
|
||||
void mlir::collapseParallelLoops(
|
||||
scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
|
||||
OpBuilder outsideBuilder(loops);
|
||||
Location loc = loops.getLoc();
|
||||
|
||||
// Presort combined dimensions.
|
||||
auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
|
||||
for (auto &dims : sortedDimensions)
|
||||
std::sort(dims.begin(), dims.end());
|
||||
|
||||
// Normalize ParallelOp's iteration pattern.
|
||||
SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
|
||||
normalizedUpperBounds;
|
||||
for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
|
||||
OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
|
||||
auto resultBounds =
|
||||
normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
|
||||
loops.getLowerBound()[i], loops.getUpperBound()[i],
|
||||
loops.getStep()[i], loops.getBody()->getArgument(i));
|
||||
|
||||
normalizedLowerBounds.push_back(resultBounds.lowerBound);
|
||||
normalizedUpperBounds.push_back(resultBounds.upperBound);
|
||||
normalizedSteps.push_back(resultBounds.step);
|
||||
}
|
||||
|
||||
// Combine iteration spaces.
|
||||
SmallVector<Value, 3> lowerBounds, upperBounds, steps;
|
||||
auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
|
||||
auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
|
||||
for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
|
||||
Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
|
||||
for (auto idx : sortedDimensions[i]) {
|
||||
newUpperBound = outsideBuilder.create<arith::MulIOp>(
|
||||
loc, newUpperBound, normalizedUpperBounds[idx]);
|
||||
}
|
||||
lowerBounds.push_back(cst0);
|
||||
steps.push_back(cst1);
|
||||
upperBounds.push_back(newUpperBound);
|
||||
}
|
||||
|
||||
// Create new ParallelLoop with conversions to the original induction values.
|
||||
// The loop below uses divisions to get the relevant range of values in the
|
||||
// new induction value that represent each range of the original induction
|
||||
// value. The remainders then determine based on that range, which iteration
|
||||
// of the original induction value this represents. This is a normalized value
|
||||
// that is un-normalized already by the previous logic.
|
||||
auto newPloop = outsideBuilder.create<scf::ParallelOp>(
|
||||
loc, lowerBounds, upperBounds, steps,
|
||||
[&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
|
||||
for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
|
||||
Value previous = ploopIVs[i];
|
||||
unsigned numberCombinedDimensions = combinedDimensions[i].size();
|
||||
// Iterate over all except the last induction value.
|
||||
for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
|
||||
unsigned idx = combinedDimensions[i][j];
|
||||
|
||||
// Determine the current induction value's current loop iteration
|
||||
Value iv = insideBuilder.create<arith::RemSIOp>(
|
||||
loc, previous, normalizedUpperBounds[idx]);
|
||||
replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
|
||||
loops.getRegion());
|
||||
|
||||
// Remove the effect of the current induction value to prepare for
|
||||
// the next value.
|
||||
previous = insideBuilder.create<arith::DivSIOp>(
|
||||
loc, previous, normalizedUpperBounds[idx]);
|
||||
}
|
||||
|
||||
// The final induction value is just the remaining value.
|
||||
unsigned idx = combinedDimensions[i][0];
|
||||
replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
|
||||
previous, loops.getRegion());
|
||||
}
|
||||
});
|
||||
|
||||
// Replace the old loop with the new loop.
|
||||
loops.getBody()->back().erase();
|
||||
newPloop.getBody()->getOperations().splice(
|
||||
Block::iterator(newPloop.getBody()->back()),
|
||||
loops.getBody()->getOperations());
|
||||
loops.erase();
|
||||
}
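A hedged sketch of merging the two innermost dimensions of a 3-D scf.parallel, similar to what the (now SCF-grouped) ParallelLoopCollapsing pass does via its pass options; `collapseInnerTwo` is a hypothetical helper and `op` is assumed to have exactly three induction variables:

```cpp
// Collapse dimensions 1 and 2 into one, keeping dimension 0 separate.
static void collapseInnerTwo(scf::ParallelOp op) {
  std::vector<unsigned> outer = {0};
  std::vector<unsigned> inner = {1, 2};
  collapseParallelLoops(op, {outer, inner});
}
```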
|
||||
|
||||
void mlir::mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorId,
|
||||
ArrayRef<Value> numProcessors) {
|
||||
assert(processorId.size() == numProcessors.size());
|
||||
@@ -16,12 +16,14 @@
|
||||
#include "mlir/Dialect/Affine/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/IR/BlockAndValueMapping.h"
|
||||
#include "mlir/IR/Dominance.h"
|
||||
#include "mlir/IR/IntegerSet.h"
|
||||
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
|
||||
#define DEBUG_TYPE "affine-utils"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
@@ -856,3 +858,740 @@ void mlir::affineScalarReplace(FuncOp f, DominanceInfo &domInfo,
|
||||
defOp->erase();
|
||||
}
|
||||
}
|
||||
|
||||
// Perform the replacement in `op`.
|
||||
LogicalResult mlir::replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
|
||||
Operation *op,
|
||||
ArrayRef<Value> extraIndices,
|
||||
AffineMap indexRemap,
|
||||
ArrayRef<Value> extraOperands,
|
||||
ArrayRef<Value> symbolOperands,
|
||||
bool allowNonDereferencingOps) {
|
||||
unsigned newMemRefRank = newMemRef.getType().cast<MemRefType>().getRank();
|
||||
(void)newMemRefRank; // unused in opt mode
|
||||
unsigned oldMemRefRank = oldMemRef.getType().cast<MemRefType>().getRank();
|
||||
(void)oldMemRefRank; // unused in opt mode
|
||||
if (indexRemap) {
|
||||
assert(indexRemap.getNumSymbols() == symbolOperands.size() &&
|
||||
"symbolic operand count mismatch");
|
||||
assert(indexRemap.getNumInputs() ==
|
||||
extraOperands.size() + oldMemRefRank + symbolOperands.size());
|
||||
assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank);
|
||||
} else {
|
||||
assert(oldMemRefRank + extraIndices.size() == newMemRefRank);
|
||||
}
|
||||
|
||||
// Assert same elemental type.
|
||||
assert(oldMemRef.getType().cast<MemRefType>().getElementType() ==
|
||||
newMemRef.getType().cast<MemRefType>().getElementType());
|
||||
|
||||
SmallVector<unsigned, 2> usePositions;
|
||||
for (const auto &opEntry : llvm::enumerate(op->getOperands())) {
|
||||
if (opEntry.value() == oldMemRef)
|
||||
usePositions.push_back(opEntry.index());
|
||||
}
|
||||
|
||||
// If memref doesn't appear, nothing to do.
|
||||
if (usePositions.empty())
|
||||
return success();
|
||||
|
||||
if (usePositions.size() > 1) {
|
||||
// TODO: extend it for this case when needed (rare).
|
||||
assert(false && "multiple dereferencing uses in a single op not supported");
|
||||
return failure();
|
||||
}
|
||||
|
||||
unsigned memRefOperandPos = usePositions.front();
|
||||
|
||||
OpBuilder builder(op);
|
||||
// The following checks if op is dereferencing memref and performs the access
|
||||
// index rewrites.
|
||||
auto affMapAccInterface = dyn_cast<AffineMapAccessInterface>(op);
|
||||
if (!affMapAccInterface) {
|
||||
if (!allowNonDereferencingOps) {
|
||||
// Failure: memref used in a non-dereferencing context (potentially
|
||||
// escapes); no replacement in these cases unless allowNonDereferencingOps
|
||||
// is set.
|
||||
return failure();
|
||||
}
|
||||
op->setOperand(memRefOperandPos, newMemRef);
|
||||
return success();
|
||||
}
|
||||
// Perform index rewrites for the dereferencing op and then replace the op
|
||||
NamedAttribute oldMapAttrPair =
|
||||
affMapAccInterface.getAffineMapAttrForMemRef(oldMemRef);
|
||||
AffineMap oldMap = oldMapAttrPair.getValue().cast<AffineMapAttr>().getValue();
|
||||
unsigned oldMapNumInputs = oldMap.getNumInputs();
|
||||
SmallVector<Value, 4> oldMapOperands(
|
||||
op->operand_begin() + memRefOperandPos + 1,
|
||||
op->operand_begin() + memRefOperandPos + 1 + oldMapNumInputs);
|
||||
|
||||
// Apply 'oldMemRefOperands = oldMap(oldMapOperands)'.
|
||||
SmallVector<Value, 4> oldMemRefOperands;
|
||||
SmallVector<Value, 4> affineApplyOps;
|
||||
oldMemRefOperands.reserve(oldMemRefRank);
|
||||
if (oldMap != builder.getMultiDimIdentityMap(oldMap.getNumDims())) {
|
||||
for (auto resultExpr : oldMap.getResults()) {
|
||||
auto singleResMap = AffineMap::get(oldMap.getNumDims(),
|
||||
oldMap.getNumSymbols(), resultExpr);
|
||||
auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
|
||||
oldMapOperands);
|
||||
oldMemRefOperands.push_back(afOp);
|
||||
affineApplyOps.push_back(afOp);
|
||||
}
|
||||
} else {
|
||||
oldMemRefOperands.assign(oldMapOperands.begin(), oldMapOperands.end());
|
||||
}
|
||||
|
||||
// Construct new indices as a remap of the old ones if a remapping has been
|
||||
// provided. The indices of a memref come right after it, i.e.,
|
||||
// at position memRefOperandPos + 1.
|
||||
SmallVector<Value, 4> remapOperands;
|
||||
remapOperands.reserve(extraOperands.size() + oldMemRefRank +
|
||||
symbolOperands.size());
|
||||
remapOperands.append(extraOperands.begin(), extraOperands.end());
|
||||
remapOperands.append(oldMemRefOperands.begin(), oldMemRefOperands.end());
|
||||
remapOperands.append(symbolOperands.begin(), symbolOperands.end());
|
||||
|
||||
SmallVector<Value, 4> remapOutputs;
|
||||
remapOutputs.reserve(oldMemRefRank);
|
||||
|
||||
if (indexRemap &&
|
||||
indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) {
|
||||
// Remapped indices.
|
||||
for (auto resultExpr : indexRemap.getResults()) {
|
||||
auto singleResMap = AffineMap::get(
|
||||
indexRemap.getNumDims(), indexRemap.getNumSymbols(), resultExpr);
|
||||
auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
|
||||
remapOperands);
|
||||
remapOutputs.push_back(afOp);
|
||||
affineApplyOps.push_back(afOp);
|
||||
}
|
||||
} else {
|
||||
// No remapping specified.
|
||||
remapOutputs.assign(remapOperands.begin(), remapOperands.end());
|
||||
}
|
||||
|
||||
SmallVector<Value, 4> newMapOperands;
|
||||
newMapOperands.reserve(newMemRefRank);
|
||||
|
||||
// Prepend 'extraIndices' in 'newMapOperands'.
|
||||
for (Value extraIndex : extraIndices) {
|
||||
assert(extraIndex.getDefiningOp()->getNumResults() == 1 &&
|
||||
"single result op's expected to generate these indices");
|
||||
assert((isValidDim(extraIndex) || isValidSymbol(extraIndex)) &&
|
||||
"invalid memory op index");
|
||||
newMapOperands.push_back(extraIndex);
|
||||
}
|
||||
|
||||
// Append 'remapOutputs' to 'newMapOperands'.
|
||||
newMapOperands.append(remapOutputs.begin(), remapOutputs.end());
|
||||
|
||||
// Create new fully composed AffineMap for new op to be created.
|
||||
assert(newMapOperands.size() == newMemRefRank);
|
||||
auto newMap = builder.getMultiDimIdentityMap(newMemRefRank);
|
||||
// TODO: Avoid creating/deleting temporary AffineApplyOps here.
|
||||
fullyComposeAffineMapAndOperands(&newMap, &newMapOperands);
|
||||
newMap = simplifyAffineMap(newMap);
|
||||
canonicalizeMapAndOperands(&newMap, &newMapOperands);
|
||||
// Remove any affine.apply's that became dead as a result of composition.
|
||||
for (Value value : affineApplyOps)
|
||||
if (value.use_empty())
|
||||
value.getDefiningOp()->erase();
|
||||
|
||||
OperationState state(op->getLoc(), op->getName());
|
||||
// Construct the new operation using this memref.
|
||||
state.operands.reserve(op->getNumOperands() + extraIndices.size());
|
||||
// Insert the non-memref operands.
|
||||
state.operands.append(op->operand_begin(),
|
||||
op->operand_begin() + memRefOperandPos);
|
||||
// Insert the new memref value.
|
||||
state.operands.push_back(newMemRef);
|
||||
|
||||
// Insert the new memref map operands.
|
||||
state.operands.append(newMapOperands.begin(), newMapOperands.end());
|
||||
|
||||
// Insert the remaining operands unmodified.
|
||||
state.operands.append(op->operand_begin() + memRefOperandPos + 1 +
|
||||
oldMapNumInputs,
|
||||
op->operand_end());
|
||||
|
||||
// Result types don't change. Both memref's are of the same elemental type.
|
||||
state.types.reserve(op->getNumResults());
|
||||
for (auto result : op->getResults())
|
||||
state.types.push_back(result.getType());
|
||||
|
||||
// Add attribute for 'newMap', other Attributes do not change.
|
||||
auto newMapAttr = AffineMapAttr::get(newMap);
|
||||
for (auto namedAttr : op->getAttrs()) {
|
||||
if (namedAttr.getName() == oldMapAttrPair.getName())
|
||||
state.attributes.push_back({namedAttr.getName(), newMapAttr});
|
||||
else
|
||||
state.attributes.push_back(namedAttr);
|
||||
}
|
||||
|
||||
// Create the new operation.
|
||||
auto *repOp = builder.createOperation(state);
|
||||
op->replaceAllUsesWith(repOp);
|
||||
op->erase();
|
||||
|
||||
return success();
|
||||
}
|
||||
|
||||
LogicalResult mlir::replaceAllMemRefUsesWith(
|
||||
Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices,
|
||||
AffineMap indexRemap, ArrayRef<Value> extraOperands,
|
||||
ArrayRef<Value> symbolOperands, Operation *domOpFilter,
|
||||
Operation *postDomOpFilter, bool allowNonDereferencingOps,
|
||||
bool replaceInDeallocOp) {
|
||||
unsigned newMemRefRank = newMemRef.getType().cast<MemRefType>().getRank();
|
||||
(void)newMemRefRank; // unused in opt mode
|
||||
unsigned oldMemRefRank = oldMemRef.getType().cast<MemRefType>().getRank();
|
||||
(void)oldMemRefRank;
|
||||
if (indexRemap) {
|
||||
assert(indexRemap.getNumSymbols() == symbolOperands.size() &&
|
||||
"symbol operand count mismatch");
|
||||
assert(indexRemap.getNumInputs() ==
|
||||
extraOperands.size() + oldMemRefRank + symbolOperands.size());
|
||||
assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank);
|
||||
} else {
|
||||
assert(oldMemRefRank + extraIndices.size() == newMemRefRank);
|
||||
}
|
||||
|
||||
// Assert same elemental type.
|
||||
assert(oldMemRef.getType().cast<MemRefType>().getElementType() ==
|
||||
newMemRef.getType().cast<MemRefType>().getElementType());
|
||||
|
||||
std::unique_ptr<DominanceInfo> domInfo;
|
||||
std::unique_ptr<PostDominanceInfo> postDomInfo;
|
||||
if (domOpFilter)
|
||||
domInfo =
|
||||
std::make_unique<DominanceInfo>(domOpFilter->getParentOfType<FuncOp>());
|
||||
|
||||
if (postDomOpFilter)
|
||||
postDomInfo = std::make_unique<PostDominanceInfo>(
|
||||
postDomOpFilter->getParentOfType<FuncOp>());
|
||||
|
||||
// Walk all uses of old memref; collect ops to perform replacement. We use a
|
||||
// DenseSet since an operation could potentially have multiple uses of a
|
||||
// memref (although rare), and the replacement later is going to erase ops.
|
||||
DenseSet<Operation *> opsToReplace;
|
||||
for (auto *op : oldMemRef.getUsers()) {
|
||||
// Skip this use if it's not dominated by domOpFilter.
|
||||
if (domOpFilter && !domInfo->dominates(domOpFilter, op))
|
||||
continue;
|
||||
|
||||
// Skip this use if it's not post-dominated by postDomOpFilter.
|
||||
if (postDomOpFilter && !postDomInfo->postDominates(postDomOpFilter, op))
|
||||
continue;
|
||||
|
||||
// Skip dealloc's - no replacement is necessary, and a memref replacement
|
||||
// at other uses doesn't hurt these dealloc's.
|
||||
if (isa<memref::DeallocOp>(op) && !replaceInDeallocOp)
|
||||
continue;
|
||||
|
||||
// Check if the memref was used in a non-dereferencing context. It is fine
|
||||
// for the memref to be used in a non-dereferencing way outside of the
|
||||
// region where this replacement is happening.
|
||||
if (!isa<AffineMapAccessInterface>(*op)) {
|
||||
if (!allowNonDereferencingOps) {
|
||||
LLVM_DEBUG(llvm::dbgs()
|
||||
<< "Memref replacement failed: non-deferencing memref op: \n"
|
||||
<< *op << '\n');
|
||||
return failure();
|
||||
}
|
||||
// Non-dereferencing ops with the MemRefsNormalizable trait are
|
||||
// supported for replacement.
|
||||
if (!op->hasTrait<OpTrait::MemRefsNormalizable>()) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Memref replacement failed: use without a "
|
||||
"memrefs normalizable trait: \n"
|
||||
<< *op << '\n');
|
||||
return failure();
|
||||
}
|
||||
}
|
||||
|
||||
// We'll first collect and then replace --- since replacement erases the op
|
||||
// that has the use, and that op could be postDomFilter or domFilter itself!
|
||||
opsToReplace.insert(op);
|
||||
}
|
||||
|
||||
for (auto *op : opsToReplace) {
|
||||
if (failed(replaceAllMemRefUsesWith(
|
||||
oldMemRef, newMemRef, op, extraIndices, indexRemap, extraOperands,
|
||||
symbolOperands, allowNonDereferencingOps)))
|
||||
llvm_unreachable("memref replacement guaranteed to succeed here");
|
||||
}
|
||||
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Given an operation, inserts one or more single-result affine apply
/// operations, results of which are exclusively used by this operation. The
/// operands of these newly created affine apply ops are guaranteed to be loop
/// iterators or terminal symbols of a function.
///
/// Before
///
/// affine.for %i = 0 to #map(%N)
///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
///   "send"(%idx, %A, ...)
///   "compute"(%idx)
///
/// After
///
/// affine.for %i = 0 to #map(%N)
///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
///   "send"(%idx, %A, ...)
///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
///   "compute"(%idx_)
///
/// This allows applying different transformations on send and compute (e.g.
/// different shifts/delays).
///
/// Leaves `sliceOps` empty either if none of opInst's operands were the result
/// of an affine.apply (and thus there was no affine computation slice to
/// create), or if all the affine.apply ops supplying operands to opInst had no
/// uses besides opInst; otherwise returns the list of affine.apply operations
/// created in the output argument `sliceOps`.
void mlir::createAffineComputationSlice(
|
||||
Operation *opInst, SmallVectorImpl<AffineApplyOp> *sliceOps) {
|
||||
// Collect all operands that are results of affine apply ops.
|
||||
SmallVector<Value, 4> subOperands;
|
||||
subOperands.reserve(opInst->getNumOperands());
|
||||
for (auto operand : opInst->getOperands())
|
||||
if (isa_and_nonnull<AffineApplyOp>(operand.getDefiningOp()))
|
||||
subOperands.push_back(operand);
|
||||
|
||||
// Gather sequence of AffineApplyOps reachable from 'subOperands'.
|
||||
SmallVector<Operation *, 4> affineApplyOps;
|
||||
getReachableAffineApplyOps(subOperands, affineApplyOps);
|
||||
// Skip transforming if there are no affine maps to compose.
|
||||
if (affineApplyOps.empty())
|
||||
return;
|
||||
|
||||
// Check if all uses of the affine apply ops lie only in this op, in
|
||||
// which case there would be nothing to do.
|
||||
bool localized = true;
|
||||
for (auto *op : affineApplyOps) {
|
||||
for (auto result : op->getResults()) {
|
||||
for (auto *user : result.getUsers()) {
|
||||
if (user != opInst) {
|
||||
localized = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (localized)
|
||||
return;
|
||||
|
||||
OpBuilder builder(opInst);
|
||||
SmallVector<Value, 4> composedOpOperands(subOperands);
|
||||
auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
|
||||
fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);
|
||||
|
||||
// Create an affine.apply for each of the map results.
|
||||
sliceOps->reserve(composedMap.getNumResults());
|
||||
for (auto resultExpr : composedMap.getResults()) {
|
||||
auto singleResMap = AffineMap::get(composedMap.getNumDims(),
|
||||
composedMap.getNumSymbols(), resultExpr);
|
||||
sliceOps->push_back(builder.create<AffineApplyOp>(
|
||||
opInst->getLoc(), singleResMap, composedOpOperands));
|
||||
}
|
||||
|
||||
// Construct the new operands that include the results from the composed
|
||||
// affine apply op above instead of existing ones (subOperands). So, they
|
||||
// differ from opInst's operands only for those operands in 'subOperands', for
|
||||
// which they will be replaced by the corresponding one from 'sliceOps'.
|
||||
SmallVector<Value, 4> newOperands(opInst->getOperands());
|
||||
for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
|
||||
// Replace the subOperands from among the new operands.
|
||||
unsigned j, f;
|
||||
for (j = 0, f = subOperands.size(); j < f; j++) {
|
||||
if (newOperands[i] == subOperands[j])
|
||||
break;
|
||||
}
|
||||
if (j < subOperands.size()) {
|
||||
newOperands[i] = (*sliceOps)[j];
|
||||
}
|
||||
}
|
||||
for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) {
|
||||
opInst->setOperand(idx, newOperands[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
/// Enum to set patterns of affine expr in tiled-layout map.
|
||||
/// TileFloorDiv: <dim expr> div <tile size>
|
||||
/// TileMod: <dim expr> mod <tile size>
|
||||
/// TileNone: None of the above
|
||||
/// Example:
|
||||
/// #tiled_2d_128x256 = affine_map<(d0, d1)
|
||||
/// -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
|
||||
/// "d0 div 128" and "d1 div 256" ==> TileFloorDiv
|
||||
/// "d0 mod 128" and "d1 mod 256" ==> TileMod
|
||||
enum TileExprPattern { TileFloorDiv, TileMod, TileNone };
|
||||
|
||||
/// Check if `map` is a tiled layout. In a tiled layout, specific k dimensions
/// being floordiv'ed by respective tile sizes appear in a mod with the same
/// tile sizes, and no other expression involves those k dimensions. This
/// function stores a vector of tuples (`tileSizePos`) including the AffineExpr
/// for the tile size and the positions of the corresponding `floordiv` and
/// `mod`. If it is not a tiled layout, an empty vector is returned.
static LogicalResult getTileSizePos(
|
||||
AffineMap map,
|
||||
SmallVectorImpl<std::tuple<AffineExpr, unsigned, unsigned>> &tileSizePos) {
|
||||
// Create `floordivExprs` which is a vector of tuples including LHS and RHS of
|
||||
// `floordiv` and its position in `map` output.
|
||||
// Example: #tiled_2d_128x256 = affine_map<(d0, d1)
|
||||
// -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
|
||||
// In this example, `floordivExprs` includes {d0, 128, 0} and {d1, 256, 1}.
|
||||
SmallVector<std::tuple<AffineExpr, AffineExpr, unsigned>, 4> floordivExprs;
|
||||
unsigned pos = 0;
|
||||
for (AffineExpr expr : map.getResults()) {
|
||||
if (expr.getKind() == AffineExprKind::FloorDiv) {
|
||||
AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
|
||||
if (binaryExpr.getRHS().isa<AffineConstantExpr>())
|
||||
floordivExprs.emplace_back(
|
||||
std::make_tuple(binaryExpr.getLHS(), binaryExpr.getRHS(), pos));
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
// Not tiled layout if `floordivExprs` is empty.
|
||||
if (floordivExprs.empty()) {
|
||||
tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
|
||||
return success();
|
||||
}
|
||||
|
||||
// Check if LHS of `floordiv` is used in LHS of `mod`. If not used, `map` is
|
||||
// not tiled layout.
|
||||
for (std::tuple<AffineExpr, AffineExpr, unsigned> fexpr : floordivExprs) {
|
||||
AffineExpr floordivExprLHS = std::get<0>(fexpr);
|
||||
AffineExpr floordivExprRHS = std::get<1>(fexpr);
|
||||
unsigned floordivPos = std::get<2>(fexpr);
|
||||
|
||||
// Walk the affine exprs of `map` outputs except `fexpr`, and check if LHS and
// RHS of `fexpr` are used in LHS and RHS of a `mod`. If LHS of `fexpr` is used
// in any other expr, the map is not a tiled layout. Examples of non-tiled
// layouts:
|
||||
// affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 floordiv 256)>
|
||||
// affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 128)>
|
||||
// affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 256, d2 mod
|
||||
// 256)>
|
||||
bool found = false;
|
||||
pos = 0;
|
||||
for (AffineExpr expr : map.getResults()) {
|
||||
bool notTiled = false;
|
||||
if (pos != floordivPos) {
|
||||
expr.walk([&](AffineExpr e) {
|
||||
if (e == floordivExprLHS) {
|
||||
if (expr.getKind() == AffineExprKind::Mod) {
|
||||
AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
|
||||
// If LHS and RHS of `mod` are the same as those of the floordiv.
|
||||
if (floordivExprLHS == binaryExpr.getLHS() &&
|
||||
floordivExprRHS == binaryExpr.getRHS()) {
|
||||
// Save tile size (RHS of `mod`), and position of `floordiv` and
|
||||
// `mod` if same expr with `mod` is not found yet.
|
||||
if (!found) {
|
||||
tileSizePos.emplace_back(
|
||||
std::make_tuple(binaryExpr.getRHS(), floordivPos, pos));
|
||||
found = true;
|
||||
} else {
|
||||
// Non tiled layout: has multiple `mod` with the same LHS.
|
||||
// eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
|
||||
// mod 256, d2 mod 256)>
|
||||
notTiled = true;
|
||||
}
|
||||
} else {
|
||||
// Non tiled layout: RHS of `mod` is different from `floordiv`.
|
||||
// eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
|
||||
// mod 128)>
|
||||
notTiled = true;
|
||||
}
|
||||
} else {
|
||||
// Non tiled layout: LHS is the same, but not `mod`.
|
||||
// eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
|
||||
// floordiv 256)>
|
||||
notTiled = true;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
if (notTiled) {
|
||||
tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
|
||||
return success();
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
return success();
|
||||
}
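// Illustrative note (assumed example): for #tiled_2d_128x256 above,
// `tileSizePos` ends up holding {128, /*floordiv pos=*/0, /*mod pos=*/2} and
// {256, /*floordiv pos=*/1, /*mod pos=*/3}, i.e. the tile-size expression
// together with the result positions of the matching floordiv and mod.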
/// Check if `dim` dimension of memrefType with `layoutMap` becomes dynamic
|
||||
/// after normalization. Dimensions that include dynamic dimensions in the map
|
||||
/// output will become dynamic dimensions. Return true if `dim` is dynamic
|
||||
/// dimension.
|
||||
///
|
||||
/// Example:
|
||||
/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
|
||||
///
|
||||
/// If d1 is dynamic dimension, 2nd and 3rd dimension of map output are dynamic.
|
||||
/// memref<4x?xf32, #map0> ==> memref<4x?x?xf32>
|
||||
static bool
|
||||
isNormalizedMemRefDynamicDim(unsigned dim, AffineMap layoutMap,
|
||||
SmallVectorImpl<unsigned> &inMemrefTypeDynDims,
|
||||
MLIRContext *context) {
|
||||
bool isDynamicDim = false;
|
||||
AffineExpr expr = layoutMap.getResults()[dim];
|
||||
// Check if affine expr of the dimension includes dynamic dimension of input
|
||||
// memrefType.
|
||||
expr.walk([&inMemrefTypeDynDims, &isDynamicDim, &context](AffineExpr e) {
|
||||
if (e.isa<AffineDimExpr>()) {
|
||||
for (unsigned dm : inMemrefTypeDynDims) {
|
||||
if (e == getAffineDimExpr(dm, context)) {
|
||||
isDynamicDim = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
return isDynamicDim;
|
||||
}
|
||||
|
||||
/// Create affine expr to calculate dimension size for a tiled-layout map.
|
||||
static AffineExpr createDimSizeExprForTiledLayout(AffineExpr oldMapOutput,
|
||||
TileExprPattern pat) {
|
||||
// Create map output for the patterns.
|
||||
// "floordiv <tile size>" ==> "ceildiv <tile size>"
|
||||
// "mod <tile size>" ==> "<tile size>"
|
||||
AffineExpr newMapOutput;
|
||||
AffineBinaryOpExpr binaryExpr = nullptr;
|
||||
switch (pat) {
|
||||
case TileExprPattern::TileMod:
|
||||
binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
|
||||
newMapOutput = binaryExpr.getRHS();
|
||||
break;
|
||||
case TileExprPattern::TileFloorDiv:
|
||||
binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
|
||||
newMapOutput = getAffineBinaryOpExpr(
|
||||
AffineExprKind::CeilDiv, binaryExpr.getLHS(), binaryExpr.getRHS());
|
||||
break;
|
||||
default:
|
||||
newMapOutput = oldMapOutput;
|
||||
}
|
||||
return newMapOutput;
|
||||
}
|
||||
|
||||
/// Create new maps to calculate each dimension size of `newMemRefType`, and
|
||||
/// create `newDynamicSizes` from them by using AffineApplyOp.
|
||||
///
|
||||
/// Steps for normalizing dynamic memrefs for a tiled layout map
|
||||
/// Example:
|
||||
/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
|
||||
/// %0 = dim %arg0, %c1 :memref<4x?xf32>
|
||||
/// %1 = alloc(%0) : memref<4x?xf32, #map0>
|
||||
///
|
||||
/// (Before this function)
|
||||
/// 1. Check if `map`(#map0) is a tiled layout using `getTileSizePos()`. Only
|
||||
/// single layout map is supported.
|
||||
///
|
||||
/// 2. Create normalized memrefType using `isNormalizedMemRefDynamicDim()`. It
|
||||
/// is memref<4x?x?xf32> in the above example.
|
||||
///
|
||||
/// (In this function)
|
||||
/// 3. Create new maps to calculate each dimension of the normalized memrefType
|
||||
/// using `createDimSizeExprForTiledLayout()`. In the tiled layout, the
|
||||
/// dimension size can be calculated by replacing "floordiv <tile size>" with
|
||||
/// "ceildiv <tile size>" and "mod <tile size>" with "<tile size>".
|
||||
/// - New map in the above example
|
||||
/// #map0 = affine_map<(d0, d1) -> (d0)>
|
||||
/// #map1 = affine_map<(d0, d1) -> (d1 ceildiv 32)>
|
||||
/// #map2 = affine_map<(d0, d1) -> (32)>
|
||||
///
|
||||
/// 4. Create AffineApplyOp to apply the new maps. The output of AffineApplyOp
|
||||
/// is used in dynamicSizes of new AllocOp.
|
||||
/// %0 = dim %arg0, %c1 : memref<4x?xf32>
|
||||
/// %c4 = arith.constant 4 : index
|
||||
/// %1 = affine.apply #map1(%c4, %0)
|
||||
/// %2 = affine.apply #map2(%c4, %0)
|
||||
static void createNewDynamicSizes(MemRefType oldMemRefType,
|
||||
MemRefType newMemRefType, AffineMap map,
|
||||
memref::AllocOp *allocOp, OpBuilder b,
|
||||
SmallVectorImpl<Value> &newDynamicSizes) {
|
||||
// Create new input for AffineApplyOp.
|
||||
SmallVector<Value, 4> inAffineApply;
|
||||
ArrayRef<int64_t> oldMemRefShape = oldMemRefType.getShape();
|
||||
unsigned dynIdx = 0;
|
||||
for (unsigned d = 0; d < oldMemRefType.getRank(); ++d) {
|
||||
if (oldMemRefShape[d] < 0) {
|
||||
// Use dynamicSizes of allocOp for dynamic dimension.
|
||||
inAffineApply.emplace_back(allocOp->dynamicSizes()[dynIdx]);
|
||||
dynIdx++;
|
||||
} else {
|
||||
// Create ConstantOp for static dimension.
|
||||
Attribute constantAttr =
|
||||
b.getIntegerAttr(b.getIndexType(), oldMemRefShape[d]);
|
||||
inAffineApply.emplace_back(
|
||||
b.create<arith::ConstantOp>(allocOp->getLoc(), constantAttr));
|
||||
}
|
||||
}
|
||||
|
||||
// Create new map to calculate each dimension size of new memref for each
|
||||
// original map output. Only for dynamic dimensions of `newMemRefType`.
|
||||
unsigned newDimIdx = 0;
|
||||
ArrayRef<int64_t> newMemRefShape = newMemRefType.getShape();
|
||||
SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
|
||||
(void)getTileSizePos(map, tileSizePos);
|
||||
for (AffineExpr expr : map.getResults()) {
|
||||
if (newMemRefShape[newDimIdx] < 0) {
|
||||
// Create new maps to calculate each dimension size of new memref.
|
||||
enum TileExprPattern pat = TileExprPattern::TileNone;
|
||||
for (auto pos : tileSizePos) {
|
||||
if (newDimIdx == std::get<1>(pos))
|
||||
pat = TileExprPattern::TileFloorDiv;
|
||||
else if (newDimIdx == std::get<2>(pos))
|
||||
pat = TileExprPattern::TileMod;
|
||||
}
|
||||
AffineExpr newMapOutput = createDimSizeExprForTiledLayout(expr, pat);
|
||||
AffineMap newMap =
|
||||
AffineMap::get(map.getNumInputs(), map.getNumSymbols(), newMapOutput);
|
||||
Value affineApp =
|
||||
b.create<AffineApplyOp>(allocOp->getLoc(), newMap, inAffineApply);
|
||||
newDynamicSizes.emplace_back(affineApp);
|
||||
}
|
||||
newDimIdx++;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Currently works for static memrefs with a single layout map.
|
||||
LogicalResult mlir::normalizeMemRef(memref::AllocOp *allocOp) {
|
||||
MemRefType memrefType = allocOp->getType();
|
||||
OpBuilder b(*allocOp);
|
||||
|
||||
// Fetch a new memref type after normalizing the old memref to have an
|
||||
// identity map layout.
|
||||
MemRefType newMemRefType =
|
||||
normalizeMemRefType(memrefType, b, allocOp->symbolOperands().size());
|
||||
if (newMemRefType == memrefType)
|
||||
// Either memrefType already had an identity map or the map couldn't be
|
||||
// transformed to an identity map.
|
||||
return failure();
|
||||
|
||||
Value oldMemRef = allocOp->getResult();
|
||||
|
||||
SmallVector<Value, 4> symbolOperands(allocOp->symbolOperands());
|
||||
AffineMap layoutMap = memrefType.getLayout().getAffineMap();
|
||||
memref::AllocOp newAlloc;
|
||||
// Check if `layoutMap` is a tiled layout. Only single layout map is
|
||||
// supported for normalizing dynamic memrefs.
|
||||
SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
|
||||
(void)getTileSizePos(layoutMap, tileSizePos);
|
||||
if (newMemRefType.getNumDynamicDims() > 0 && !tileSizePos.empty()) {
|
||||
MemRefType oldMemRefType = oldMemRef.getType().cast<MemRefType>();
|
||||
SmallVector<Value, 4> newDynamicSizes;
|
||||
createNewDynamicSizes(oldMemRefType, newMemRefType, layoutMap, allocOp, b,
|
||||
newDynamicSizes);
|
||||
// Add the new dynamic sizes in new AllocOp.
|
||||
newAlloc =
|
||||
b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
|
||||
newDynamicSizes, allocOp->alignmentAttr());
|
||||
} else {
|
||||
newAlloc = b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
|
||||
allocOp->alignmentAttr());
|
||||
}
|
||||
// Replace all uses of the old memref.
|
||||
if (failed(replaceAllMemRefUsesWith(oldMemRef, /*newMemRef=*/newAlloc,
|
||||
/*extraIndices=*/{},
|
||||
/*indexRemap=*/layoutMap,
|
||||
/*extraOperands=*/{},
|
||||
/*symbolOperands=*/symbolOperands,
|
||||
/*domOpFilter=*/nullptr,
|
||||
/*postDomOpFilter=*/nullptr,
|
||||
/*allowNonDereferencingOps=*/true))) {
|
||||
// If it failed (due to escapes for example), bail out.
|
||||
newAlloc.erase();
|
||||
return failure();
|
||||
}
|
||||
// Replace any uses of the original alloc op and erase it. All remaining uses
|
||||
// have to be dealloc's; RAMUW above would've failed otherwise.
|
||||
assert(llvm::all_of(oldMemRef.getUsers(), [](Operation *op) {
|
||||
return isa<memref::DeallocOp>(op);
|
||||
}));
|
||||
oldMemRef.replaceAllUsesWith(newAlloc);
|
||||
allocOp->erase();
|
||||
return success();
|
||||
}
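// A minimal sketch of the overall effect (assumed IR, not from this revision):
//   #tile = affine_map<(d0) -> (d0 floordiv 4, d0 mod 4)>
//   %a = memref.alloc() : memref<16xf32, #tile>
// becomes, when normalizeMemRef succeeds,
//   %a = memref.alloc() : memref<4x4xf32>
// and every dereferencing use of %a is rewritten through the old layout map by
// replaceAllMemRefUsesWith.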
MemRefType mlir::normalizeMemRefType(MemRefType memrefType, OpBuilder b,
|
||||
unsigned numSymbolicOperands) {
|
||||
unsigned rank = memrefType.getRank();
|
||||
if (rank == 0)
|
||||
return memrefType;
|
||||
|
||||
if (memrefType.getLayout().isIdentity()) {
|
||||
// Either no map is associated with this memref or this memref has
|
||||
// a trivial (identity) map.
|
||||
return memrefType;
|
||||
}
|
||||
AffineMap layoutMap = memrefType.getLayout().getAffineMap();
|
||||
|
||||
// We don't do any checks for one-to-one'ness; we assume that it is
|
||||
// one-to-one.
|
||||
|
||||
// Normalize only static memrefs and dynamic memrefs with a tiled-layout map
|
||||
// for now.
|
||||
// TODO: Normalize the other types of dynamic memrefs.
|
||||
SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
|
||||
(void)getTileSizePos(layoutMap, tileSizePos);
|
||||
if (memrefType.getNumDynamicDims() > 0 && tileSizePos.empty())
|
||||
return memrefType;
|
||||
|
||||
// We have a single map that is not an identity map. Create a new memref
|
||||
// with the right shape and an identity layout map.
|
||||
ArrayRef<int64_t> shape = memrefType.getShape();
|
||||
// FlatAffineConstraint may later on use symbolicOperands.
|
||||
FlatAffineConstraints fac(rank, numSymbolicOperands);
|
||||
SmallVector<unsigned, 4> memrefTypeDynDims;
|
||||
for (unsigned d = 0; d < rank; ++d) {
|
||||
// Use constraint system only in static dimensions.
|
||||
if (shape[d] > 0) {
|
||||
fac.addBound(FlatAffineConstraints::LB, d, 0);
|
||||
fac.addBound(FlatAffineConstraints::UB, d, shape[d] - 1);
|
||||
} else {
|
||||
memrefTypeDynDims.emplace_back(d);
|
||||
}
|
||||
}
|
||||
// We compose this map with the original index (logical) space to derive
|
||||
// the upper bounds for the new index space.
|
||||
unsigned newRank = layoutMap.getNumResults();
|
||||
if (failed(fac.composeMatchingMap(layoutMap)))
|
||||
return memrefType;
|
||||
// TODO: Handle semi-affine maps.
|
||||
// Project out the old data dimensions.
|
||||
fac.projectOut(newRank, fac.getNumIds() - newRank - fac.getNumLocalIds());
|
||||
SmallVector<int64_t, 4> newShape(newRank);
|
||||
for (unsigned d = 0; d < newRank; ++d) {
|
||||
// Check if each dimension of normalized memrefType is dynamic.
|
||||
bool isDynDim = isNormalizedMemRefDynamicDim(
|
||||
d, layoutMap, memrefTypeDynDims, b.getContext());
|
||||
if (isDynDim) {
|
||||
newShape[d] = -1;
|
||||
} else {
|
||||
// The lower bound for the shape is always zero.
|
||||
auto ubConst = fac.getConstantBound(FlatAffineConstraints::UB, d);
|
||||
// For a static memref and an affine map with no symbols, this is
|
||||
// always bounded.
|
||||
assert(ubConst.hasValue() && "should always have an upper bound");
|
||||
if (ubConst.getValue() < 0)
|
||||
// This is due to an invalid map that maps to a negative space.
|
||||
return memrefType;
|
||||
// If dimension of new memrefType is dynamic, the value is -1.
|
||||
newShape[d] = ubConst.getValue() + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Create the new memref type after trivializing the old layout map.
|
||||
MemRefType newMemRefType =
|
||||
MemRefType::Builder(memrefType)
|
||||
.setShape(newShape)
|
||||
.setLayout(AffineMapAttr::get(b.getMultiDimIdentityMap(newRank)));
|
||||
|
||||
return newMemRefType;
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Dialect/GPU/MemoryPromotion.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
|
||||
#include "mlir/Dialect/GPU/GPUDialect.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
@@ -19,7 +20,6 @@
|
||||
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
||||
#include "mlir/IR/ImplicitLocOpBuilder.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
|
||||
using namespace mlir;
|
||||
using namespace mlir::gpu;
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
|
||||
|
||||
#include "mlir/Dialect/Linalg/Passes.h"
|
||||
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
|
||||
#include "mlir/Dialect/SCF/Transforms.h"
|
||||
@@ -20,7 +19,6 @@
|
||||
#include "mlir/Dialect/Vector/VectorTransforms.h"
|
||||
#include "mlir/Pass/PassManager.h"
|
||||
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
|
||||
#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
|
||||
#include "mlir/Analysis/SliceAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/Dialect/Linalg/IR/Linalg.h"
|
||||
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
|
||||
#include "mlir/Dialect/SCF/SCF.h"
|
||||
@@ -24,7 +23,6 @@
|
||||
#include "mlir/IR/AsmState.h"
|
||||
#include "mlir/IR/BuiltinOps.h"
|
||||
#include "mlir/IR/Dominance.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
#include "mlir/Analysis/SliceAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/Dialect/Linalg/IR/Linalg.h"
|
||||
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
|
||||
#include "mlir/Dialect/SCF/SCF.h"
|
||||
@@ -27,7 +26,6 @@
|
||||
#include "mlir/IR/BuiltinOps.h"
|
||||
#include "mlir/IR/Dominance.h"
|
||||
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
#include "PassDetail.h"
|
||||
#include "mlir/Analysis/SliceAnalysis.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/Dialect/Linalg/IR/Linalg.h"
|
||||
#include "mlir/Dialect/Linalg/Passes.h"
|
||||
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
|
||||
@@ -29,9 +31,7 @@
|
||||
#include "mlir/Pass/PassManager.h"
|
||||
#include "mlir/Support/LLVM.h"
|
||||
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
#include "mlir/Transforms/Utils.h"
|
||||
|
||||
using namespace mlir;
|
||||
using namespace mlir::vector;
|
||||
@@ -348,7 +348,13 @@ struct LinalgStrategyEnablePass
|
||||
return signalPassFailure();
|
||||
}
|
||||
|
||||
promoteSingleIterationLoops(funcOp);
|
||||
// Gathers all innermost loops through a post order pruned walk.
|
||||
funcOp.walk([](Operation *op) {
|
||||
if (auto forOp = dyn_cast<AffineForOp>(op))
|
||||
(void)promoteIfSingleIteration(forOp);
|
||||
else if (auto forOp = dyn_cast<scf::ForOp>(op))
|
||||
(void)promoteIfSingleIteration(forOp);
|
||||
});
|
||||
if (options.hoistRedundantVectorTransfers)
|
||||
hoistRedundantVectorTransfers(funcOp);
|
||||
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
|
||||
#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
|
||||
#include "mlir/Dialect/Linalg/IR/Linalg.h"
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
|
||||
#include "mlir/Dialect/Affine/LoopUtils.h"
|
||||
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
|
||||
#include "mlir/Dialect/Linalg/IR/Linalg.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
@@ -31,7 +32,6 @@
|
||||
#include "mlir/IR/Matchers.h"
|
||||
#include "mlir/IR/OpImplementation.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "llvm/ADT/TypeSwitch.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
|
||||
@@ -13,9 +13,9 @@
|
||||
|
||||
#include "PassDetail.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
|
||||
#include "mlir/Transforms/Utils.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ add_mlir_dialect_library(MLIRSCFTransforms
|
||||
LoopPipelining.cpp
|
||||
LoopRangeFolding.cpp
|
||||
LoopSpecialization.cpp
|
||||
ParallelLoopCollapsing.cpp
|
||||
ParallelLoopFusion.cpp
|
||||
ParallelLoopTiling.cpp
|
||||
StructuralTypeConversions.cpp
|
||||
|
||||
@@ -7,9 +7,9 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "PassDetail.h"
|
||||
#include "mlir/Dialect/SCF/Passes.h"
|
||||
#include "mlir/Dialect/SCF/SCF.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
#include "mlir/Dialect/SCF/Utils.h"
|
||||
#include "mlir/Transforms/RegionUtils.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
@@ -20,7 +20,7 @@ using namespace mlir;
|
||||
|
||||
namespace {
|
||||
struct ParallelLoopCollapsing
|
||||
: public ParallelLoopCollapsingBase<ParallelLoopCollapsing> {
|
||||
: public SCFParallelLoopCollapsingBase<ParallelLoopCollapsing> {
|
||||
void runOnOperation() override {
|
||||
Operation *module = getOperation();
|
||||
|
||||
@@ -11,20 +11,31 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Dialect/SCF/Utils.h"
|
||||
|
||||
#include "mlir/Analysis/SliceAnalysis.h"
|
||||
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
|
||||
#include "mlir/Dialect/SCF/SCF.h"
|
||||
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
||||
#include "mlir/IR/BlockAndValueMapping.h"
|
||||
#include "mlir/IR/BuiltinOps.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/Support/MathExtras.h"
|
||||
#include "mlir/Transforms/RegionUtils.h"
|
||||
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/SetVector.h"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace {
|
||||
// This structure is to pass and return sets of loop parameters without
|
||||
// confusing the order.
|
||||
struct LoopParams {
|
||||
Value lowerBound;
|
||||
Value upperBound;
|
||||
Value step;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
scf::ForOp mlir::cloneWithNewYields(OpBuilder &b, scf::ForOp loop,
|
||||
ValueRange newIterOperands,
|
||||
ValueRange newYieldedValues,
|
||||
@@ -230,3 +241,682 @@ bool mlir::getInnermostParallelLoops(Operation *rootOp,
|
||||
}
|
||||
return rootEnclosesPloops;
|
||||
}
|
||||
|
||||
// Build the IR that performs ceil division of a positive value by a constant:
|
||||
// ceildiv(a, B) = divis(a + (B-1), B)
|
||||
// where divis is rounding-to-zero division.
|
||||
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
|
||||
int64_t divisor) {
|
||||
assert(divisor > 0 && "expected positive divisor");
|
||||
assert(dividend.getType().isIndex() && "expected index-typed value");
|
||||
|
||||
Value divisorMinusOneCst =
|
||||
builder.create<arith::ConstantIndexOp>(loc, divisor - 1);
|
||||
Value divisorCst = builder.create<arith::ConstantIndexOp>(loc, divisor);
|
||||
Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
|
||||
return builder.create<arith::DivSIOp>(loc, sum, divisorCst);
|
||||
}
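// Worked example (values assumed): for divisor = 4 this emits
//   %sum = arith.addi %dividend, %c3
//   %res = arith.divsi %sum, %c4
// so a dividend of 10 yields (10 + 3) / 4 = 3, i.e. ceildiv(10, 4).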
// Build the IR that performs ceil division of a positive value by another
|
||||
// positive value:
|
||||
// ceildiv(a, b) = divis(a + (b - 1), b)
|
||||
// where divis is rounding-to-zero division.
|
||||
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
|
||||
Value divisor) {
|
||||
assert(dividend.getType().isIndex() && "expected index-typed value");
|
||||
|
||||
Value cstOne = builder.create<arith::ConstantIndexOp>(loc, 1);
|
||||
Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne);
|
||||
Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne);
|
||||
return builder.create<arith::DivSIOp>(loc, sum, divisor);
|
||||
}
|
||||
|
||||
/// Helper to replace uses of loop carried values (iter_args) and loop
|
||||
/// yield values while promoting single iteration scf.for ops.
|
||||
static void replaceIterArgsAndYieldResults(scf::ForOp forOp) {
|
||||
// Replace uses of iter arguments with iter operands (initial values).
|
||||
auto iterOperands = forOp.getIterOperands();
|
||||
auto iterArgs = forOp.getRegionIterArgs();
|
||||
for (auto e : llvm::zip(iterOperands, iterArgs))
|
||||
std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
|
||||
|
||||
// Replace uses of loop results with the values yielded by the loop.
|
||||
auto outerResults = forOp.getResults();
|
||||
auto innerResults = forOp.getBody()->getTerminator()->getOperands();
|
||||
for (auto e : llvm::zip(outerResults, innerResults))
|
||||
std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
|
||||
}
|
||||
|
||||
/// Promotes the loop body of a forOp to its containing block if it can be
/// determined that the loop has a single iteration.
|
||||
LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) {
|
||||
auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
|
||||
auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
|
||||
auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
|
||||
if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 ||
|
||||
ubCstOp.value() < 0 || stepCstOp.value() < 0)
|
||||
return failure();
|
||||
int64_t tripCount =
|
||||
mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value());
|
||||
if (tripCount != 1)
|
||||
return failure();
|
||||
auto iv = forOp.getInductionVar();
|
||||
iv.replaceAllUsesWith(lbCstOp);
|
||||
|
||||
replaceIterArgsAndYieldResults(forOp);
|
||||
|
||||
// Move the loop body operations, except for its terminator, to the loop's
|
||||
// containing block.
|
||||
auto *parentBlock = forOp->getBlock();
|
||||
forOp.getBody()->getTerminator()->erase();
|
||||
parentBlock->getOperations().splice(Block::iterator(forOp),
|
||||
forOp.getBody()->getOperations());
|
||||
forOp.erase();
|
||||
return success();
|
||||
}
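// Illustrative effect (assumed IR): a single-iteration loop such as
//   %r = scf.for %i = %c0 to %c1 step %c1 iter_args(%acc = %init) -> (f32) {
//     ... scf.yield %v : f32
//   }
// is promoted by splicing its body into the parent block, replacing %i with
// %c0, %acc with %init, and %r with %v.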
/// Generates unrolled copies of the scf::ForOp body block 'loopBodyBlock',
/// with associated induction variable 'forOpIV', by 'unrollFactor', calling
/// 'ivRemapFn' to remap 'forOpIV' for each unrolled body. If specified,
/// annotates the ops in each unrolled iteration using 'annotateFn'.
|
||||
static void generateUnrolledLoop(
|
||||
Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
|
||||
function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
|
||||
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
|
||||
ValueRange iterArgs, ValueRange yieldedValues) {
|
||||
// Builder to insert unrolled bodies just before the terminator of the body of
|
||||
// 'forOp'.
|
||||
auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
|
||||
|
||||
if (!annotateFn)
|
||||
annotateFn = [](unsigned, Operation *, OpBuilder) {};
|
||||
|
||||
// Keep a pointer to the last non-terminator operation in the original block
|
||||
// so that we know what to clone (since we are doing this in-place).
|
||||
Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
|
||||
|
||||
// Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
|
||||
SmallVector<Value, 4> lastYielded(yieldedValues);
|
||||
|
||||
for (unsigned i = 1; i < unrollFactor; i++) {
|
||||
BlockAndValueMapping operandMap;
|
||||
|
||||
// Prepare operand map.
|
||||
operandMap.map(iterArgs, lastYielded);
|
||||
|
||||
// If the induction variable is used, create a remapping to the value for
|
||||
// this unrolled instance.
|
||||
if (!forOpIV.use_empty()) {
|
||||
Value ivUnroll = ivRemapFn(i, forOpIV, builder);
|
||||
operandMap.map(forOpIV, ivUnroll);
|
||||
}
|
||||
|
||||
// Clone the original body of 'forOp'.
|
||||
for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
|
||||
Operation *clonedOp = builder.clone(*it, operandMap);
|
||||
annotateFn(i, clonedOp, builder);
|
||||
}
|
||||
|
||||
// Update yielded values.
|
||||
for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
|
||||
lastYielded[i] = operandMap.lookup(yieldedValues[i]);
|
||||
}
|
||||
|
||||
// Make sure we annotate the Ops in the original body. We do this last so that
|
||||
// any annotations are not copied into the cloned Ops above.
|
||||
for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
|
||||
annotateFn(0, &*it, builder);
|
||||
|
||||
// Update operands of the yield statement.
|
||||
loopBodyBlock->getTerminator()->setOperands(lastYielded);
|
||||
}
|
||||
|
||||
/// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled.
|
||||
LogicalResult mlir::loopUnrollByFactor(
|
||||
scf::ForOp forOp, uint64_t unrollFactor,
|
||||
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
|
||||
assert(unrollFactor > 0 && "expected positive unroll factor");
|
||||
|
||||
// Return if the loop body is empty.
|
||||
if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
|
||||
return success();
|
||||
|
||||
// Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
|
||||
// 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
|
||||
OpBuilder boundsBuilder(forOp);
|
||||
auto loc = forOp.getLoc();
|
||||
auto step = forOp.getStep();
|
||||
Value upperBoundUnrolled;
|
||||
Value stepUnrolled;
|
||||
bool generateEpilogueLoop = true;
|
||||
|
||||
auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
|
||||
auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
|
||||
auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
|
||||
if (lbCstOp && ubCstOp && stepCstOp) {
|
||||
// Constant loop bounds computation.
|
||||
int64_t lbCst = lbCstOp.value();
|
||||
int64_t ubCst = ubCstOp.value();
|
||||
int64_t stepCst = stepCstOp.value();
|
||||
assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 &&
|
||||
"expected positive loop bounds and step");
|
||||
int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst);
|
||||
|
||||
if (unrollFactor == 1) {
|
||||
if (tripCount == 1 && failed(promoteIfSingleIteration(forOp)))
|
||||
return failure();
|
||||
return success();
|
||||
}
|
||||
|
||||
int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor);
|
||||
int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
|
||||
assert(upperBoundUnrolledCst <= ubCst);
|
||||
int64_t stepUnrolledCst = stepCst * unrollFactor;
|
||||
|
||||
// Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
|
||||
generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
|
||||
if (generateEpilogueLoop)
|
||||
upperBoundUnrolled = boundsBuilder.create<arith::ConstantIndexOp>(
|
||||
loc, upperBoundUnrolledCst);
|
||||
else
|
||||
upperBoundUnrolled = ubCstOp;
|
||||
|
||||
// Create constant for 'stepUnrolled'.
|
||||
stepUnrolled = stepCst == stepUnrolledCst
|
||||
? step
|
||||
: boundsBuilder.create<arith::ConstantIndexOp>(
|
||||
loc, stepUnrolledCst);
|
||||
} else {
|
||||
// Dynamic loop bounds computation.
|
||||
// TODO: Add dynamic asserts for negative lb/ub/step, or
|
||||
// consider using ceilDiv from AffineApplyExpander.
|
||||
auto lowerBound = forOp.getLowerBound();
|
||||
auto upperBound = forOp.getUpperBound();
|
||||
Value diff =
|
||||
boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
|
||||
Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
|
||||
Value unrollFactorCst =
|
||||
boundsBuilder.create<arith::ConstantIndexOp>(loc, unrollFactor);
|
||||
Value tripCountRem =
|
||||
boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
|
||||
// Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
|
||||
Value tripCountEvenMultiple =
|
||||
boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem);
|
||||
// Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
|
||||
upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>(
|
||||
loc, lowerBound,
|
||||
boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
|
||||
// Scale 'step' by 'unrollFactor'.
|
||||
stepUnrolled =
|
||||
boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst);
|
||||
}
|
||||
|
||||
// Create epilogue clean up loop starting at 'upperBoundUnrolled'.
|
||||
if (generateEpilogueLoop) {
|
||||
OpBuilder epilogueBuilder(forOp->getContext());
|
||||
epilogueBuilder.setInsertionPoint(forOp->getBlock(),
|
||||
std::next(Block::iterator(forOp)));
|
||||
auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
|
||||
epilogueForOp.setLowerBound(upperBoundUnrolled);
|
||||
|
||||
// Update uses of loop results.
|
||||
auto results = forOp.getResults();
|
||||
auto epilogueResults = epilogueForOp.getResults();
|
||||
auto epilogueIterOperands = epilogueForOp.getIterOperands();
|
||||
|
||||
for (auto e : llvm::zip(results, epilogueResults, epilogueIterOperands)) {
|
||||
std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
|
||||
epilogueForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
|
||||
}
|
||||
(void)promoteIfSingleIteration(epilogueForOp);
|
||||
}
|
||||
|
||||
// Create unrolled loop.
|
||||
forOp.setUpperBound(upperBoundUnrolled);
|
||||
forOp.setStep(stepUnrolled);
|
||||
|
||||
auto iterArgs = ValueRange(forOp.getRegionIterArgs());
|
||||
auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
|
||||
|
||||
generateUnrolledLoop(
|
||||
forOp.getBody(), forOp.getInductionVar(), unrollFactor,
|
||||
[&](unsigned i, Value iv, OpBuilder b) {
|
||||
// iv' = iv + step * i;
|
||||
auto stride = b.create<arith::MulIOp>(
|
||||
loc, step, b.create<arith::ConstantIndexOp>(loc, i));
|
||||
return b.create<arith::AddIOp>(loc, iv, stride);
|
||||
},
|
||||
annotateFn, iterArgs, yieldedValues);
|
||||
// Promote the loop body up if this has turned into a single iteration loop.
|
||||
(void)promoteIfSingleIteration(forOp);
|
||||
return success();
|
||||
}
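// Sketch of the effect for constant bounds (numbers assumed for illustration):
// unrolling  scf.for %i = %c0 to %c10 step %c1  by a factor of 4 yields
//   scf.for %i = %c0 to %c8 step %c4 { body(%i), body(%i+1), body(%i+2),
//                                      body(%i+3) }
//   scf.for %i = %c8 to %c10 step %c1 { body(%i) }   // epilogue remainder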
/// Return the new lower bound, upper bound, and step in that order. Insert any
|
||||
/// additional bounds calculations before the given builder and any additional
|
||||
/// conversion back to the original loop induction value inside the given Block.
|
||||
static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
|
||||
OpBuilder &insideLoopBuilder, Location loc,
|
||||
Value lowerBound, Value upperBound, Value step,
|
||||
Value inductionVar) {
|
||||
// Check if the loop is already known to have a constant zero lower bound or
|
||||
// a constant one step.
|
||||
bool isZeroBased = false;
|
||||
if (auto lbCst = lowerBound.getDefiningOp<arith::ConstantIndexOp>())
  isZeroBased = lbCst.value() == 0;
|
||||
|
||||
bool isStepOne = false;
|
||||
if (auto stepCst = step.getDefiningOp<arith::ConstantIndexOp>())
|
||||
isStepOne = stepCst.value() == 1;
|
||||
|
||||
// Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
|
||||
// assuming the step is strictly positive. Update the bounds and the step
|
||||
// of the loop to go from 0 to the number of iterations, if necessary.
|
||||
// TODO: introduce support for negative steps or emit dynamic asserts
|
||||
// on step positivity, whatever gets implemented first.
|
||||
if (isZeroBased && isStepOne)
|
||||
return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
|
||||
/*step=*/step};
|
||||
|
||||
Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
|
||||
Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
|
||||
|
||||
Value newLowerBound =
|
||||
isZeroBased ? lowerBound
|
||||
: boundsBuilder.create<arith::ConstantIndexOp>(loc, 0);
|
||||
Value newStep =
|
||||
isStepOne ? step : boundsBuilder.create<arith::ConstantIndexOp>(loc, 1);
|
||||
|
||||
// Insert code computing the value of the original loop induction variable
|
||||
// from the "normalized" one.
|
||||
Value scaled =
|
||||
isStepOne
|
||||
? inductionVar
|
||||
: insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step);
|
||||
Value shifted =
|
||||
isZeroBased
|
||||
? scaled
|
||||
: insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound);
|
||||
|
||||
SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
|
||||
shifted.getDefiningOp()};
|
||||
inductionVar.replaceAllUsesExcept(shifted, preserve);
|
||||
return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
|
||||
/*step=*/newStep};
|
||||
}
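// Worked example (bounds assumed): for %i = %c2 to %c12 step %c3 the helper
// returns lowerBound = 0, upperBound = ceildiv(12 - 2, 3) = 4, step = 1, and
// rewrites uses of the induction variable inside the loop as %i = %ii * 3 + 2.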
/// Transform a loop with a strictly positive step
|
||||
/// for %i = %lb to %ub step %s
|
||||
/// into a 0-based loop with step 1
|
||||
/// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
|
||||
/// %i = %ii * %s + %lb
|
||||
/// Insert the induction variable remapping in the body of `inner`, which is
|
||||
/// expected to be either `loop` or another loop perfectly nested under `loop`.
|
||||
/// Insert the definition of new bounds immediately before `outer`, which is
|
||||
/// expected to be either `loop` or its parent in the loop nest.
|
||||
static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
|
||||
OpBuilder builder(outer);
|
||||
OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
|
||||
auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
|
||||
loop.getLowerBound(), loop.getUpperBound(),
|
||||
loop.getStep(), loop.getInductionVar());
|
||||
|
||||
loop.setLowerBound(loopPieces.lowerBound);
|
||||
loop.setUpperBound(loopPieces.upperBound);
|
||||
loop.setStep(loopPieces.step);
|
||||
}
|
||||
|
||||
void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
|
||||
if (loops.size() < 2)
|
||||
return;
|
||||
|
||||
scf::ForOp innermost = loops.back();
|
||||
scf::ForOp outermost = loops.front();
|
||||
|
||||
// 1. Make sure all loops iterate from 0 to upperBound with step 1. This
|
||||
// allows the following code to assume upperBound is the number of iterations.
|
||||
for (auto loop : loops)
|
||||
normalizeLoop(loop, outermost, innermost);
|
||||
|
||||
// 2. Emit code computing the upper bound of the coalesced loop as product
|
||||
// of the number of iterations of all loops.
|
||||
OpBuilder builder(outermost);
|
||||
Location loc = outermost.getLoc();
|
||||
Value upperBound = outermost.getUpperBound();
|
||||
for (auto loop : loops.drop_front())
|
||||
upperBound =
|
||||
builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
|
||||
outermost.setUpperBound(upperBound);
|
||||
|
||||
builder.setInsertionPointToStart(outermost.getBody());
|
||||
|
||||
// 3. Remap induction variables. For each original loop, the value of the
|
||||
// induction variable can be obtained by dividing the induction variable of
|
||||
// the linearized loop by the total number of iterations of the loops nested
|
||||
// in it modulo the number of iterations in this loop (remove the values
|
||||
// related to the outer loops):
|
||||
// iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
|
||||
// Compute these iteratively from the innermost loop by creating a "running
|
||||
// quotient" of division by the range.
|
||||
Value previous = outermost.getInductionVar();
|
||||
for (unsigned i = 0, e = loops.size(); i < e; ++i) {
|
||||
unsigned idx = loops.size() - i - 1;
|
||||
if (i != 0)
|
||||
previous = builder.create<arith::DivSIOp>(loc, previous,
|
||||
loops[idx + 1].getUpperBound());
|
||||
|
||||
Value iv = (i == e - 1) ? previous
|
||||
: builder.create<arith::RemSIOp>(
|
||||
loc, previous, loops[idx].getUpperBound());
|
||||
replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
|
||||
loops.back().getRegion());
|
||||
}
|
||||
|
||||
// 4. Move the operations from the innermost just above the second-outermost
|
||||
// loop, delete the extra terminator and the second-outermost loop.
|
||||
scf::ForOp second = loops[1];
|
||||
innermost.getBody()->back().erase();
|
||||
outermost.getBody()->getOperations().splice(
|
||||
Block::iterator(second.getOperation()),
|
||||
innermost.getBody()->getOperations());
|
||||
second.erase();
|
||||
}
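// For two normalized loops with iteration counts N0 (outer) and N1 (inner),
// the coalesced loop runs N0 * N1 iterations and the original induction
// variables are recovered as iv1 = iv mod N1 and iv0 = iv floordiv N1; the
// code above generalizes this running-quotient scheme to arbitrary depth
// (the two-loop case here is only an illustration).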
void mlir::collapseParallelLoops(
|
||||
scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
|
||||
OpBuilder outsideBuilder(loops);
|
||||
Location loc = loops.getLoc();
|
||||
|
||||
// Presort combined dimensions.
|
||||
auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
|
||||
for (auto &dims : sortedDimensions)
|
||||
std::sort(dims.begin(), dims.end());
|
||||
|
||||
// Normalize ParallelOp's iteration pattern.
|
||||
SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
|
||||
normalizedUpperBounds;
|
||||
for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
|
||||
OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
|
||||
auto resultBounds =
|
||||
normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
|
||||
loops.getLowerBound()[i], loops.getUpperBound()[i],
|
||||
loops.getStep()[i], loops.getBody()->getArgument(i));
|
||||
|
||||
normalizedLowerBounds.push_back(resultBounds.lowerBound);
|
||||
normalizedUpperBounds.push_back(resultBounds.upperBound);
|
||||
normalizedSteps.push_back(resultBounds.step);
|
||||
}
|
||||
|
||||
// Combine iteration spaces.
|
||||
SmallVector<Value, 3> lowerBounds, upperBounds, steps;
|
||||
auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
|
||||
auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
|
||||
for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
|
||||
Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
|
||||
for (auto idx : sortedDimensions[i]) {
|
||||
newUpperBound = outsideBuilder.create<arith::MulIOp>(
|
||||
loc, newUpperBound, normalizedUpperBounds[idx]);
|
||||
}
|
||||
lowerBounds.push_back(cst0);
|
||||
steps.push_back(cst1);
|
||||
upperBounds.push_back(newUpperBound);
|
||||
}
|
||||
|
||||
// Create the new ParallelOp with conversions back to the original induction
// values. The loop below uses divisions to strip off, from the combined
// induction value, the contribution of the more deeply nested dimensions, and
// remainders to determine which iteration of each original induction value the
// combined value represents. The result is a normalized value that has already
// been un-normalized by the logic inserted above.
|
||||
auto newPloop = outsideBuilder.create<scf::ParallelOp>(
|
||||
loc, lowerBounds, upperBounds, steps,
|
||||
[&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
|
||||
for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
|
||||
Value previous = ploopIVs[i];
|
||||
unsigned numberCombinedDimensions = combinedDimensions[i].size();
|
||||
// Iterate over all except the last induction value.
|
||||
for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
|
||||
unsigned idx = combinedDimensions[i][j];
|
||||
|
||||
// Determine the current induction value's current loop iteration
|
||||
Value iv = insideBuilder.create<arith::RemSIOp>(
|
||||
loc, previous, normalizedUpperBounds[idx]);
|
||||
replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
|
||||
loops.getRegion());
|
||||
|
||||
// Remove the effect of the current induction value to prepare for
|
||||
// the next value.
|
||||
previous = insideBuilder.create<arith::DivSIOp>(
|
||||
loc, previous, normalizedUpperBounds[idx]);
|
||||
}
|
||||
|
||||
// The final induction value is just the remaining value.
|
||||
unsigned idx = combinedDimensions[i][0];
|
||||
replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
|
||||
previous, loops.getRegion());
|
||||
}
|
||||
});
|
||||
|
||||
// Replace the old loop with the new loop.
|
||||
loops.getBody()->back().erase();
|
||||
newPloop.getBody()->getOperations().splice(
|
||||
Block::iterator(newPloop.getBody()->back()),
|
||||
loops.getBody()->getOperations());
|
||||
loops.erase();
|
||||
}
|
||||
|
||||
// Hoist the ops within `outer` that appear before `inner`.
|
||||
// Such ops include the ops that have been introduced by parametric tiling.
|
||||
// Ops that come from triangular loops (i.e. that belong to the program slice
|
||||
// rooted at `outer`) and ops that have side effects cannot be hoisted.
|
||||
// Return failure when any op fails to hoist.
|
||||
static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
|
||||
SetVector<Operation *> forwardSlice;
|
||||
getForwardSlice(
|
||||
outer.getInductionVar(), &forwardSlice,
|
||||
[&inner](Operation *op) { return op != inner.getOperation(); });
|
||||
LogicalResult status = success();
|
||||
SmallVector<Operation *, 8> toHoist;
|
||||
for (auto &op : outer.getBody()->without_terminator()) {
|
||||
// Stop when encountering the inner loop.
|
||||
if (&op == inner.getOperation())
|
||||
break;
|
||||
// Skip over non-hoistable ops.
|
||||
if (forwardSlice.count(&op) > 0) {
|
||||
status = failure();
|
||||
continue;
|
||||
}
|
||||
// Skip intermediate scf::ForOp, these are not considered a failure.
|
||||
if (isa<scf::ForOp>(op))
|
||||
continue;
|
||||
// Skip other ops with regions.
|
||||
if (op.getNumRegions() > 0) {
|
||||
status = failure();
|
||||
continue;
|
||||
}
|
||||
// Skip if op has side effects.
|
||||
// TODO: loads to immutable memory regions are ok.
|
||||
if (!MemoryEffectOpInterface::hasNoEffect(&op)) {
|
||||
status = failure();
|
||||
continue;
|
||||
}
|
||||
toHoist.push_back(&op);
|
||||
}
|
||||
auto *outerForOp = outer.getOperation();
|
||||
for (auto *op : toHoist)
|
||||
op->moveBefore(outerForOp);
|
||||
return status;
|
||||
}
|
||||
|
||||
// Traverse the interTile and intraTile loops and try to hoist ops such that
|
||||
// bands of perfectly nested loops are isolated.
|
||||
// Return failure if either perfect interTile or perfect intraTile bands cannot
|
||||
// be formed.
|
||||
static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
|
||||
LogicalResult status = success();
|
||||
const Loops &interTile = tileLoops.first;
|
||||
const Loops &intraTile = tileLoops.second;
|
||||
auto size = interTile.size();
|
||||
assert(size == intraTile.size());
|
||||
if (size <= 1)
|
||||
return success();
|
||||
for (unsigned s = 1; s < size; ++s)
|
||||
status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
|
||||
: failure();
|
||||
for (unsigned s = 1; s < size; ++s)
|
||||
status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
|
||||
: failure();
|
||||
return status;
|
||||
}
|
||||
|
||||
/// Collect perfectly nested loops starting from `rootForOps`. Loops are
|
||||
/// perfectly nested if each loop is the first and only non-terminator operation
|
||||
/// in the parent loop. Collect at most `maxLoops` loops and append them to
|
||||
/// `forOps`.
|
||||
template <typename T>
|
||||
static void getPerfectlyNestedLoopsImpl(
|
||||
SmallVectorImpl<T> &forOps, T rootForOp,
|
||||
unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
|
||||
for (unsigned i = 0; i < maxLoops; ++i) {
|
||||
forOps.push_back(rootForOp);
|
||||
Block &body = rootForOp.getRegion().front();
|
||||
if (body.begin() != std::prev(body.end(), 2))
|
||||
return;
|
||||
|
||||
rootForOp = dyn_cast<T>(&body.front());
|
||||
if (!rootForOp)
|
||||
return;
|
||||
}
|
||||
}

static Loops stripmineSink(scf::ForOp forOp, Value factor,
                           ArrayRef<scf::ForOp> targets) {
  auto originalStep = forOp.getStep();
  auto iv = forOp.getInductionVar();

  OpBuilder b(forOp);
  forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));

  Loops innerLoops;
  for (auto t : targets) {
    // Save information for splicing ops out of t when done.
    auto begin = t.getBody()->begin();
    auto nOps = t.getBody()->getOperations().size();

    // Insert newForOp before the terminator of `t`.
    auto b = OpBuilder::atBlockTerminator((t.getBody()));
    Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
    Value less = b.create<arith::CmpIOp>(t.getLoc(), arith::CmpIPredicate::slt,
                                         forOp.getUpperBound(), stepped);
    Value ub =
        b.create<SelectOp>(t.getLoc(), less, forOp.getUpperBound(), stepped);

    // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
    auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
    newForOp.getBody()->getOperations().splice(
        newForOp.getBody()->getOperations().begin(),
        t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                               newForOp.getRegion());

    innerLoops.push_back(newForOp);
  }

  return innerLoops;
}
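
// Sketch of the rewrite performed above (illustrative only, with f denoting
// the strip-mining factor): the target loop
//   scf.for %i = %lb to %ub step %s { body }
// becomes
//   scf.for %i = %lb to %ub step (%s * f) {
//     scf.for %ii = %i to min(%ub, %i + %s * f) step %s { body }
//   }
// i.e. the outer step is scaled by the factor and the body is spliced into a
// new inner loop that covers one strip.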

// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
// Returns the new for operation, nested immediately under `target`.
template <typename SizeType>
static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor,
                                scf::ForOp target) {
  // TODO: Use cheap structural assertions that targets are nested under
  // forOp and that targets are not nested under each other when DominanceInfo
  // exposes the capability. It seems overkill to construct a whole function
  // dominance tree at this point.
  auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target));
  assert(res.size() == 1 && "Expected 1 inner forOp");
  return res[0];
}

SmallVector<Loops, 8> mlir::tile(ArrayRef<scf::ForOp> forOps,
                                 ArrayRef<Value> sizes,
                                 ArrayRef<scf::ForOp> targets) {
  SmallVector<SmallVector<scf::ForOp, 8>, 8> res;
  SmallVector<scf::ForOp, 8> currentTargets(targets.begin(), targets.end());
  for (auto it : llvm::zip(forOps, sizes)) {
    auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
    res.push_back(step);
    currentTargets = step;
  }
  return res;
}

Loops mlir::tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
                 scf::ForOp target) {
  SmallVector<scf::ForOp, 8> res;
  for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target))) {
    assert(loops.size() == 1);
    res.push_back(loops[0]);
  }
  return res;
}

Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
  // Collect perfectly nested loops. If more size values are provided than
  // nested loops available, truncate `sizes`.
  SmallVector<scf::ForOp, 4> forOps;
  forOps.reserve(sizes.size());
  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
  if (forOps.size() < sizes.size())
    sizes = sizes.take_front(forOps.size());

  return ::tile(forOps, sizes, forOps.back());
}
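
// Usage sketch (illustrative; assumes `root` is the outermost scf.for of a
// perfect loop nest and `ts0`/`ts1` are index-typed tile sizes already
// materialized as Values in the enclosing region):
//   SmallVector<Value, 2> tileSizes = {ts0, ts1};
//   Loops intraTileLoops = tilePerfectlyNested(root, tileSizes);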

void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
                                   scf::ForOp root) {
  getPerfectlyNestedLoopsImpl(nestedLoops, root);
}

TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp,
                                       ArrayRef<int64_t> sizes) {
  // Collect perfectly nested loops. If more size values are provided than
  // nested loops available, truncate `sizes`.
  SmallVector<scf::ForOp, 4> forOps;
  forOps.reserve(sizes.size());
  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
  if (forOps.size() < sizes.size())
    sizes = sizes.take_front(forOps.size());

  // Compute the tile sizes such that the i-th outer loop executes size[i]
  // iterations. Given that the loop currently executes
  //   numIterations = ceildiv((upperBound - lowerBound), step)
  // iterations, we need to tile with size ceildiv(numIterations, size[i]).
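  // For example (illustrative numbers, not from the code): with lowerBound 0,
  // upperBound 100 and step 1, numIterations is 100; for size[i] == 8 the tile
  // size is ceildiv(100, 8) == 13, and the resulting outer loop then executes
  // ceildiv(100, 13) == 8 iterations.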
  SmallVector<Value, 4> tileSizes;
  tileSizes.reserve(sizes.size());
  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
    assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");

    auto forOp = forOps[i];
    OpBuilder builder(forOp);
    auto loc = forOp.getLoc();
    Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(),
                                               forOp.getLowerBound());
    Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
    Value iterationsPerBlock =
        ceilDivPositive(builder, loc, numIterations, sizes[i]);
    tileSizes.push_back(iterationsPerBlock);
  }

  // Call parametric tiling with the given sizes.
  auto intraTile = tile(forOps, tileSizes, forOps.back());
  TileLoops tileLoops = std::make_pair(forOps, intraTile);

  // TODO: for now we just ignore the result of band isolation.
  // In the future, mapping decisions may be impacted by the ability to
  // isolate perfectly nested bands.
  (void)tryIsolateBands(tileLoops);

  return tileLoops;
}

@@ -14,7 +14,6 @@
#include <type_traits>

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"

@@ -13,7 +13,6 @@
#include <type_traits>

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"

@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Vector/VectorTransforms.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Interfaces/VectorInterfaces.h"

@@ -7,12 +7,95 @@
//===----------------------------------------------------------------------===//

#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"

using namespace mlir;

#define DEBUG_TYPE "loop-like"

//===----------------------------------------------------------------------===//
// LoopLike Interfaces
//===----------------------------------------------------------------------===//

/// Include the definitions of the loop-like interfaces.
#include "mlir/Interfaces/LoopLikeInterface.cpp.inc"

//===----------------------------------------------------------------------===//
// LoopLike Utilities
//===----------------------------------------------------------------------===//

// Checks whether the given op can be hoisted by checking that
// - the op and any of its contained operations do not depend on SSA values
//   defined inside of the loop (by means of calling definedOutside).
// - the op has no side effects. If sideEffecting is Never, side effects of
//   this op and its nested ops are ignored.
static bool canBeHoisted(Operation *op,
                         function_ref<bool(Value)> definedOutside) {
  // Check that dependencies are defined outside of loop.
  if (!llvm::all_of(op->getOperands(), definedOutside))
    return false;
  // Check whether this op is side-effect free. If we already know that there
  // can be no side-effects because the surrounding op has claimed so, we can
  // (and have to) skip this step.
  if (auto memInterface = dyn_cast<MemoryEffectOpInterface>(op)) {
    if (!memInterface.hasNoEffect())
      return false;
    // If the operation doesn't have side effects and it doesn't recursively
    // have side effects, it can always be hoisted.
    if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>())
      return true;

    // Otherwise, if the operation doesn't provide the memory effect interface
    // and it doesn't have recursive side effects we treat it conservatively as
    // side-effecting.
  } else if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>()) {
    return false;
  }

  // Recurse into the regions for this op and check whether the contained ops
  // can be hoisted.
  for (auto &region : op->getRegions()) {
    for (auto &block : region) {
      for (auto &innerOp : block)
        if (!canBeHoisted(&innerOp, definedOutside))
          return false;
    }
  }
  return true;
}

LogicalResult mlir::moveLoopInvariantCode(LoopLikeOpInterface looplike) {
  auto &loopBody = looplike.getLoopBody();

  // We use two collections here as we need to preserve the order for insertion
  // and this is easiest.
  SmallPtrSet<Operation *, 8> willBeMovedSet;
  SmallVector<Operation *, 8> opsToMove;

  // Helper to check whether an operation is loop invariant wrt. SSA properties.
  auto isDefinedOutsideOfBody = [&](Value value) {
    auto *definingOp = value.getDefiningOp();
    return (definingOp && !!willBeMovedSet.count(definingOp)) ||
           looplike.isDefinedOutsideOfLoop(value);
  };

  // Do not use walk here, as we do not want to go into nested regions and hoist
  // operations from there. These regions might have semantics unknown to this
  // rewriting. If the nested regions are loops, they will have been processed.
  for (auto &block : loopBody) {
    for (auto &op : block.without_terminator()) {
      if (canBeHoisted(&op, isDefinedOutsideOfBody)) {
        opsToMove.push_back(&op);
        willBeMovedSet.insert(&op);
      }
    }
  }

  // For all instructions that we found to be invariant, move outside of the
  // loop.
  LogicalResult result = looplike.moveOutOfLoop(opsToMove);
  LLVM_DEBUG(looplike.print(llvm::dbgs() << "\n\nModified loop:\n"));
  return result;
}
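
// Usage sketch (illustrative, not part of this change): a pass can apply the
// utility to every loop-like op it encounters; the post-order walk visits
// inner loops before their enclosing loops, so invariant ops bubble outwards:
//   getOperation()->walk([](LoopLikeOpInterface loopLike) {
//     (void)moveLoopInvariantCode(loopLike);
//   });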

@@ -6,12 +6,8 @@ add_mlir_library(MLIRTransforms
  CSE.cpp
  Inliner.cpp
  LocationSnapshot.cpp
  LoopCoalescing.cpp
  LoopFusion.cpp
  LoopInvariantCodeMotion.cpp
  OpStats.cpp
  ParallelLoopCollapsing.cpp
  PipelineDataTransfer.cpp
  SCCP.cpp
  StripDebugInfo.cpp
  SymbolDCE.cpp
@@ -21,18 +17,13 @@ add_mlir_library(MLIRTransforms
  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms

  DEPENDS
  MLIRStandardOpsIncGen
  MLIRTransformsPassIncGen

  LINK_LIBS PUBLIC
  MLIRAffine
  MLIRAnalysis
  MLIRCopyOpInterface
  MLIRLoopLikeInterface
  MLIRMemRef
  MLIRSCF
  MLIRPass
  MLIRSupport
  MLIRTransformUtils
  MLIRVector
  )

@@ -15,7 +15,6 @@
#include "mlir/IR/Dominance.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/ScopedHashTable.h"

@@ -16,8 +16,8 @@
#include "PassDetail.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Transforms/ControlFlowSinkUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"

using namespace mlir;

@@ -11,13 +11,10 @@
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "mlir/Transforms/Passes.h"

#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -34,80 +31,6 @@ struct LoopInvariantCodeMotion
};
} // namespace

// Checks whether the given op can be hoisted by checking that
// - the op and any of its contained operations do not depend on SSA values
//   defined inside of the loop (by means of calling definedOutside).
// - the op has no side effects. If sideEffecting is Never, side effects of
//   this op and its nested ops are ignored.
static bool canBeHoisted(Operation *op,
                         function_ref<bool(Value)> definedOutside) {
  // Check that dependencies are defined outside of loop.
  if (!llvm::all_of(op->getOperands(), definedOutside))
    return false;
  // Check whether this op is side-effect free. If we already know that there
  // can be no side-effects because the surrounding op has claimed so, we can
  // (and have to) skip this step.
  if (auto memInterface = dyn_cast<MemoryEffectOpInterface>(op)) {
    if (!memInterface.hasNoEffect())
      return false;
    // If the operation doesn't have side effects and it doesn't recursively
    // have side effects, it can always be hoisted.
    if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>())
      return true;

    // Otherwise, if the operation doesn't provide the memory effect interface
    // and it doesn't have recursive side effects we treat it conservatively as
    // side-effecting.
  } else if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>()) {
    return false;
  }

  // Recurse into the regions for this op and check whether the contained ops
  // can be hoisted.
  for (auto &region : op->getRegions()) {
    for (auto &block : region) {
      for (auto &innerOp : block)
        if (!canBeHoisted(&innerOp, definedOutside))
          return false;
    }
  }
  return true;
}

LogicalResult mlir::moveLoopInvariantCode(LoopLikeOpInterface looplike) {
  auto &loopBody = looplike.getLoopBody();

  // We use two collections here as we need to preserve the order for insertion
  // and this is easiest.
  SmallPtrSet<Operation *, 8> willBeMovedSet;
  SmallVector<Operation *, 8> opsToMove;

  // Helper to check whether an operation is loop invariant wrt. SSA properties.
  auto isDefinedOutsideOfBody = [&](Value value) {
    auto *definingOp = value.getDefiningOp();
    return (definingOp && !!willBeMovedSet.count(definingOp)) ||
           looplike.isDefinedOutsideOfLoop(value);
  };

  // Do not use walk here, as we do not want to go into nested regions and hoist
  // operations from there. These regions might have semantics unknown to this
  // rewriting. If the nested regions are loops, they will have been processed.
  for (auto &block : loopBody) {
    for (auto &op : block.without_terminator()) {
      if (canBeHoisted(&op, isDefinedOutsideOfBody)) {
        opsToMove.push_back(&op);
        willBeMovedSet.insert(&op);
      }
    }
  }

  // For all instructions that we found to be invariant, move outside of the
  // loop.
  auto result = looplike.moveOutOfLoop(opsToMove);
  LLVM_DEBUG(looplike.print(llvm::dbgs() << "\n\nModified loop:\n"));
  return result;
}

void LoopInvariantCodeMotion::runOnOperation() {
  // Walk through all loops in a function in innermost-loop-first order. This
  // way, we first LICM from the inner loop, and place the ops in

@@ -13,23 +13,8 @@
#include "mlir/Transforms/Passes.h"

namespace mlir {
class AffineDialect;

// Forward declaration from Dialect.h
template <typename ConcreteDialect>
void registerDialect(DialectRegistry &registry);

namespace arith {
class ArithmeticDialect;
} // namespace arith

namespace memref {
class MemRefDialect;
} // namespace memref

#define GEN_PASS_CLASSES
#include "mlir/Transforms/Passes.h.inc"

} // namespace mlir

#endif // TRANSFORMS_PASSDETAIL_H_

@@ -4,25 +4,12 @@ add_mlir_library(MLIRTransformUtils
  FoldUtils.cpp
  GreedyPatternRewriteDriver.cpp
  InliningUtils.cpp
  LoopFusionUtils.cpp
  LoopUtils.cpp
  RegionUtils.cpp
  Utils.cpp

  ADDITIONAL_HEADER_DIRS
  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms

  DEPENDS
  MLIRStandardOpsIncGen

  LINK_LIBS PUBLIC
  MLIRAffine
  MLIRArithmetic
  MLIRAnalysis
  MLIRAffineAnalysis
  MLIRMemRef
  MLIRSCF
  MLIRPass
  MLIRRewrite
  MLIRStandard
  )

@@ -18,10 +18,10 @@
//
//===----------------------------------------------------------------------===//

#include "mlir/Transforms/ControlFlowSinkUtils.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Transforms/Utils.h"
#include <vector>

#define DEBUG_TYPE "cf-sink"

@@ -13,7 +13,6 @@
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/FunctionInterfaces.h"
#include "mlir/Rewrite/PatternApplicator.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"

@@ -1,767 +0,0 @@
//===- Utils.cpp ---- Misc utilities for code and data transformation -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements miscellaneous transformation routines for non-loop IR
// structures.
//
//===----------------------------------------------------------------------===//

#include "mlir/Transforms/Utils.h"
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Support/MathExtras.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/TypeSwitch.h"

#define DEBUG_TYPE "transforms-utils"

using namespace mlir;

// Perform the replacement in `op`.
LogicalResult mlir::replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
                                             Operation *op,
                                             ArrayRef<Value> extraIndices,
                                             AffineMap indexRemap,
                                             ArrayRef<Value> extraOperands,
                                             ArrayRef<Value> symbolOperands,
                                             bool allowNonDereferencingOps) {
  unsigned newMemRefRank = newMemRef.getType().cast<MemRefType>().getRank();
  (void)newMemRefRank; // unused in opt mode
  unsigned oldMemRefRank = oldMemRef.getType().cast<MemRefType>().getRank();
  (void)oldMemRefRank; // unused in opt mode
  if (indexRemap) {
    assert(indexRemap.getNumSymbols() == symbolOperands.size() &&
           "symbolic operand count mismatch");
    assert(indexRemap.getNumInputs() ==
           extraOperands.size() + oldMemRefRank + symbolOperands.size());
    assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank);
  } else {
    assert(oldMemRefRank + extraIndices.size() == newMemRefRank);
  }

  // Assert same elemental type.
  assert(oldMemRef.getType().cast<MemRefType>().getElementType() ==
         newMemRef.getType().cast<MemRefType>().getElementType());

  SmallVector<unsigned, 2> usePositions;
  for (const auto &opEntry : llvm::enumerate(op->getOperands())) {
    if (opEntry.value() == oldMemRef)
      usePositions.push_back(opEntry.index());
  }

  // If memref doesn't appear, nothing to do.
  if (usePositions.empty())
    return success();

  if (usePositions.size() > 1) {
    // TODO: extend it for this case when needed (rare).
    assert(false && "multiple dereferencing uses in a single op not supported");
    return failure();
  }

  unsigned memRefOperandPos = usePositions.front();

  OpBuilder builder(op);
  // The following checks if op is dereferencing memref and performs the access
  // index rewrites.
  auto affMapAccInterface = dyn_cast<AffineMapAccessInterface>(op);
  if (!affMapAccInterface) {
    if (!allowNonDereferencingOps) {
      // Failure: memref used in a non-dereferencing context (potentially
      // escapes); no replacement in these cases unless allowNonDereferencingOps
      // is set.
      return failure();
    }
    op->setOperand(memRefOperandPos, newMemRef);
    return success();
  }
  // Perform index rewrites for the dereferencing op and then replace the op.
  NamedAttribute oldMapAttrPair =
      affMapAccInterface.getAffineMapAttrForMemRef(oldMemRef);
  AffineMap oldMap = oldMapAttrPair.getValue().cast<AffineMapAttr>().getValue();
  unsigned oldMapNumInputs = oldMap.getNumInputs();
  SmallVector<Value, 4> oldMapOperands(
      op->operand_begin() + memRefOperandPos + 1,
      op->operand_begin() + memRefOperandPos + 1 + oldMapNumInputs);

  // Apply 'oldMemRefOperands = oldMap(oldMapOperands)'.
  SmallVector<Value, 4> oldMemRefOperands;
  SmallVector<Value, 4> affineApplyOps;
  oldMemRefOperands.reserve(oldMemRefRank);
  if (oldMap != builder.getMultiDimIdentityMap(oldMap.getNumDims())) {
    for (auto resultExpr : oldMap.getResults()) {
      auto singleResMap = AffineMap::get(oldMap.getNumDims(),
                                         oldMap.getNumSymbols(), resultExpr);
      auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
                                                oldMapOperands);
      oldMemRefOperands.push_back(afOp);
      affineApplyOps.push_back(afOp);
    }
  } else {
    oldMemRefOperands.assign(oldMapOperands.begin(), oldMapOperands.end());
  }

  // Construct new indices as a remap of the old ones if a remapping has been
  // provided. The indices of a memref come right after it, i.e.,
  // at position memRefOperandPos + 1.
  SmallVector<Value, 4> remapOperands;
  remapOperands.reserve(extraOperands.size() + oldMemRefRank +
                        symbolOperands.size());
  remapOperands.append(extraOperands.begin(), extraOperands.end());
  remapOperands.append(oldMemRefOperands.begin(), oldMemRefOperands.end());
  remapOperands.append(symbolOperands.begin(), symbolOperands.end());

  SmallVector<Value, 4> remapOutputs;
  remapOutputs.reserve(oldMemRefRank);

  if (indexRemap &&
      indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) {
    // Remapped indices.
    for (auto resultExpr : indexRemap.getResults()) {
      auto singleResMap = AffineMap::get(
          indexRemap.getNumDims(), indexRemap.getNumSymbols(), resultExpr);
      auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
                                                remapOperands);
      remapOutputs.push_back(afOp);
      affineApplyOps.push_back(afOp);
    }
  } else {
    // No remapping specified.
    remapOutputs.assign(remapOperands.begin(), remapOperands.end());
  }

  SmallVector<Value, 4> newMapOperands;
  newMapOperands.reserve(newMemRefRank);

  // Prepend 'extraIndices' in 'newMapOperands'.
  for (Value extraIndex : extraIndices) {
    assert(extraIndex.getDefiningOp()->getNumResults() == 1 &&
           "single result op's expected to generate these indices");
    assert((isValidDim(extraIndex) || isValidSymbol(extraIndex)) &&
           "invalid memory op index");
    newMapOperands.push_back(extraIndex);
  }

  // Append 'remapOutputs' to 'newMapOperands'.
  newMapOperands.append(remapOutputs.begin(), remapOutputs.end());

  // Create new fully composed AffineMap for new op to be created.
  assert(newMapOperands.size() == newMemRefRank);
  auto newMap = builder.getMultiDimIdentityMap(newMemRefRank);
  // TODO: Avoid creating/deleting temporary AffineApplyOps here.
  fullyComposeAffineMapAndOperands(&newMap, &newMapOperands);
  newMap = simplifyAffineMap(newMap);
  canonicalizeMapAndOperands(&newMap, &newMapOperands);
  // Remove any affine.apply's that became dead as a result of composition.
  for (Value value : affineApplyOps)
    if (value.use_empty())
      value.getDefiningOp()->erase();

  OperationState state(op->getLoc(), op->getName());
  // Construct the new operation using this memref.
  state.operands.reserve(op->getNumOperands() + extraIndices.size());
  // Insert the non-memref operands.
  state.operands.append(op->operand_begin(),
                        op->operand_begin() + memRefOperandPos);
  // Insert the new memref value.
  state.operands.push_back(newMemRef);

  // Insert the new memref map operands.
  state.operands.append(newMapOperands.begin(), newMapOperands.end());

  // Insert the remaining operands unmodified.
  state.operands.append(op->operand_begin() + memRefOperandPos + 1 +
                            oldMapNumInputs,
                        op->operand_end());

  // Result types don't change. Both memref's are of the same elemental type.
  state.types.reserve(op->getNumResults());
  for (auto result : op->getResults())
    state.types.push_back(result.getType());

  // Add attribute for 'newMap', other Attributes do not change.
  auto newMapAttr = AffineMapAttr::get(newMap);
  for (auto namedAttr : op->getAttrs()) {
    if (namedAttr.getName() == oldMapAttrPair.getName())
      state.attributes.push_back({namedAttr.getName(), newMapAttr});
    else
      state.attributes.push_back(namedAttr);
  }

  // Create the new operation.
  auto *repOp = builder.createOperation(state);
  op->replaceAllUsesWith(repOp);
  op->erase();

  return success();
}
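
// Illustrative use of the index remapping above (hypothetical values, not
// taken from this file): replacing a memref<256xf32> %old with a tiled
// memref<16x16xf32> %new and
//   indexRemap = affine_map<(d0) -> (d0 floordiv 16, d0 mod 16)>
// rewrites an access affine.load %old[%i] into
//   affine.load %new[%i floordiv 16, %i mod 16].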

LogicalResult mlir::replaceAllMemRefUsesWith(
    Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices,
    AffineMap indexRemap, ArrayRef<Value> extraOperands,
    ArrayRef<Value> symbolOperands, Operation *domOpFilter,
    Operation *postDomOpFilter, bool allowNonDereferencingOps,
    bool replaceInDeallocOp) {
  unsigned newMemRefRank = newMemRef.getType().cast<MemRefType>().getRank();
  (void)newMemRefRank; // unused in opt mode
  unsigned oldMemRefRank = oldMemRef.getType().cast<MemRefType>().getRank();
  (void)oldMemRefRank;
  if (indexRemap) {
    assert(indexRemap.getNumSymbols() == symbolOperands.size() &&
           "symbol operand count mismatch");
    assert(indexRemap.getNumInputs() ==
           extraOperands.size() + oldMemRefRank + symbolOperands.size());
    assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank);
  } else {
    assert(oldMemRefRank + extraIndices.size() == newMemRefRank);
  }

  // Assert same elemental type.
  assert(oldMemRef.getType().cast<MemRefType>().getElementType() ==
         newMemRef.getType().cast<MemRefType>().getElementType());

  std::unique_ptr<DominanceInfo> domInfo;
  std::unique_ptr<PostDominanceInfo> postDomInfo;
  if (domOpFilter)
    domInfo =
        std::make_unique<DominanceInfo>(domOpFilter->getParentOfType<FuncOp>());

  if (postDomOpFilter)
    postDomInfo = std::make_unique<PostDominanceInfo>(
        postDomOpFilter->getParentOfType<FuncOp>());

  // Walk all uses of old memref; collect ops to perform replacement. We use a
  // DenseSet since an operation could potentially have multiple uses of a
  // memref (although rare), and the replacement later is going to erase ops.
  DenseSet<Operation *> opsToReplace;
  for (auto *op : oldMemRef.getUsers()) {
    // Skip this use if it's not dominated by domOpFilter.
    if (domOpFilter && !domInfo->dominates(domOpFilter, op))
      continue;

    // Skip this use if it's not post-dominated by postDomOpFilter.
    if (postDomOpFilter && !postDomInfo->postDominates(postDomOpFilter, op))
      continue;

    // Skip dealloc's - no replacement is necessary, and a memref replacement
    // at other uses doesn't hurt these dealloc's.
    if (isa<memref::DeallocOp>(op) && !replaceInDeallocOp)
      continue;

    // Check if the memref was used in a non-dereferencing context. It is fine
    // for the memref to be used in a non-dereferencing way outside of the
    // region where this replacement is happening.
    if (!isa<AffineMapAccessInterface>(*op)) {
      if (!allowNonDereferencingOps) {
        LLVM_DEBUG(llvm::dbgs()
                   << "Memref replacement failed: non-dereferencing memref op: \n"
                   << *op << '\n');
        return failure();
      }
      // Non-dereferencing ops with the MemRefsNormalizable trait are
      // supported for replacement.
      if (!op->hasTrait<OpTrait::MemRefsNormalizable>()) {
        LLVM_DEBUG(llvm::dbgs() << "Memref replacement failed: use without a "
                                   "memrefs normalizable trait: \n"
                                << *op << '\n');
        return failure();
      }
    }

    // We'll first collect and then replace --- since replacement erases the op
    // that has the use, and that op could be postDomFilter or domFilter itself!
    opsToReplace.insert(op);
  }

  for (auto *op : opsToReplace) {
    if (failed(replaceAllMemRefUsesWith(
            oldMemRef, newMemRef, op, extraIndices, indexRemap, extraOperands,
            symbolOperands, allowNonDereferencingOps)))
      llvm_unreachable("memref replacement guaranteed to succeed here");
  }

  return success();
}

/// Given an operation, inserts one or more single result affine
/// apply operations, results of which are exclusively used by this
/// operation. The operands of these newly created affine apply ops are
/// guaranteed to be loop iterators or terminal symbols of a function.
///
/// Before
///
/// affine.for %i = 0 to #map(%N)
///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
///   "send"(%idx, %A, ...)
///   "compute"(%idx)
///
/// After
///
/// affine.for %i = 0 to #map(%N)
///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
///   "send"(%idx, %A, ...)
///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
///   "compute"(%idx_)
///
/// This allows applying different transformations on send and compute (for eg.
/// different shifts/delays).
///
/// Returns nullptr either if none of opInst's operands were the result of an
/// affine.apply and thus there was no affine computation slice to create, or if
/// all the affine.apply ops supplying operands to this opInst did not have any
/// uses besides this opInst; otherwise returns the list of affine.apply
/// operations created in output argument `sliceOps`.
void mlir::createAffineComputationSlice(
    Operation *opInst, SmallVectorImpl<AffineApplyOp> *sliceOps) {
  // Collect all operands that are results of affine apply ops.
  SmallVector<Value, 4> subOperands;
  subOperands.reserve(opInst->getNumOperands());
  for (auto operand : opInst->getOperands())
    if (isa_and_nonnull<AffineApplyOp>(operand.getDefiningOp()))
      subOperands.push_back(operand);

  // Gather sequence of AffineApplyOps reachable from 'subOperands'.
  SmallVector<Operation *, 4> affineApplyOps;
  getReachableAffineApplyOps(subOperands, affineApplyOps);
  // Skip transforming if there are no affine maps to compose.
  if (affineApplyOps.empty())
    return;

  // Check if all uses of the affine apply ops lie only in this op, in
  // which case there would be nothing to do.
  bool localized = true;
  for (auto *op : affineApplyOps) {
    for (auto result : op->getResults()) {
      for (auto *user : result.getUsers()) {
        if (user != opInst) {
          localized = false;
          break;
        }
      }
    }
  }
  if (localized)
    return;

  OpBuilder builder(opInst);
  SmallVector<Value, 4> composedOpOperands(subOperands);
  auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
  fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);

  // Create an affine.apply for each of the map results.
  sliceOps->reserve(composedMap.getNumResults());
  for (auto resultExpr : composedMap.getResults()) {
    auto singleResMap = AffineMap::get(composedMap.getNumDims(),
                                       composedMap.getNumSymbols(), resultExpr);
    sliceOps->push_back(builder.create<AffineApplyOp>(
        opInst->getLoc(), singleResMap, composedOpOperands));
  }

  // Construct the new operands that include the results from the composed
  // affine apply op above instead of existing ones (subOperands). So, they
  // differ from opInst's operands only for those operands in 'subOperands', for
  // which they will be replaced by the corresponding one from 'sliceOps'.
  SmallVector<Value, 4> newOperands(opInst->getOperands());
  for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
    // Replace the subOperands from among the new operands.
    unsigned j, f;
    for (j = 0, f = subOperands.size(); j < f; j++) {
      if (newOperands[i] == subOperands[j])
        break;
    }
    if (j < subOperands.size()) {
      newOperands[i] = (*sliceOps)[j];
    }
  }
  for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) {
    opInst->setOperand(idx, newOperands[idx]);
  }
}

/// Enum to set patterns of affine expr in tiled-layout map.
/// TileFloorDiv: <dim expr> div <tile size>
/// TileMod: <dim expr> mod <tile size>
/// TileNone: None of the above
/// Example:
/// #tiled_2d_128x256 = affine_map<(d0, d1)
///                      -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
/// "d0 div 128" and "d1 div 256" ==> TileFloorDiv
/// "d0 mod 128" and "d1 mod 256" ==> TileMod
enum TileExprPattern { TileFloorDiv, TileMod, TileNone };

/// Check if `map` is a tiled layout. In the tiled layout, specific k dimensions
/// being floordiv'ed by respective tile sizes appear in a mod with the same
/// tile sizes, and no other expression involves those k dimensions. This
/// function stores a vector of tuples (`tileSizePos`) including AffineExpr for
/// tile size, positions of corresponding `floordiv` and `mod`. If it is not a
/// tiled layout, an empty vector is returned.
static LogicalResult getTileSizePos(
    AffineMap map,
    SmallVectorImpl<std::tuple<AffineExpr, unsigned, unsigned>> &tileSizePos) {
  // Create `floordivExprs` which is a vector of tuples including LHS and RHS of
  // `floordiv` and its position in `map` output.
  // Example: #tiled_2d_128x256 = affine_map<(d0, d1)
  //                -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
  // In this example, `floordivExprs` includes {d0, 128, 0} and {d1, 256, 1}.
  SmallVector<std::tuple<AffineExpr, AffineExpr, unsigned>, 4> floordivExprs;
  unsigned pos = 0;
  for (AffineExpr expr : map.getResults()) {
    if (expr.getKind() == AffineExprKind::FloorDiv) {
      AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
      if (binaryExpr.getRHS().isa<AffineConstantExpr>())
        floordivExprs.emplace_back(
            std::make_tuple(binaryExpr.getLHS(), binaryExpr.getRHS(), pos));
    }
    pos++;
  }
  // Not tiled layout if `floordivExprs` is empty.
  if (floordivExprs.empty()) {
    tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
    return success();
  }

  // Check if LHS of `floordiv` is used in LHS of `mod`. If not used, `map` is
  // not tiled layout.
  for (std::tuple<AffineExpr, AffineExpr, unsigned> fexpr : floordivExprs) {
    AffineExpr floordivExprLHS = std::get<0>(fexpr);
    AffineExpr floordivExprRHS = std::get<1>(fexpr);
    unsigned floordivPos = std::get<2>(fexpr);

    // Walk the affine exprs of the `map` output except `fexpr`, and check if
    // LHS and RHS of `fexpr` are used in LHS and RHS of `mod`. If LHS of
    // `fexpr` is used in any other expr, the map is not a tiled layout.
    // Examples of non tiled layouts:
    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 floordiv 256)>
    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 128)>
    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 256, d2 mod
    //   256)>
    bool found = false;
    pos = 0;
    for (AffineExpr expr : map.getResults()) {
      bool notTiled = false;
      if (pos != floordivPos) {
        expr.walk([&](AffineExpr e) {
          if (e == floordivExprLHS) {
            if (expr.getKind() == AffineExprKind::Mod) {
              AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
              // If LHS and RHS of `mod` are the same with those of floordiv.
              if (floordivExprLHS == binaryExpr.getLHS() &&
                  floordivExprRHS == binaryExpr.getRHS()) {
                // Save tile size (RHS of `mod`), and position of `floordiv` and
                // `mod` if same expr with `mod` is not found yet.
                if (!found) {
                  tileSizePos.emplace_back(
                      std::make_tuple(binaryExpr.getRHS(), floordivPos, pos));
                  found = true;
                } else {
                  // Non tiled layout: multiple `mod`s with the same LHS.
                  // eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
                  // mod 256, d2 mod 256)>
                  notTiled = true;
                }
              } else {
                // Non tiled layout: RHS of `mod` is different from `floordiv`.
                // eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
                // mod 128)>
                notTiled = true;
              }
            } else {
              // Non tiled layout: LHS is the same, but not `mod`.
              // eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
              // floordiv 256)>
              notTiled = true;
            }
          }
        });
      }
      if (notTiled) {
        tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
        return success();
      }
      pos++;
    }
  }
  return success();
}

/// Check if `dim` dimension of memrefType with `layoutMap` becomes dynamic
/// after normalization. Dimensions that include dynamic dimensions in the map
/// output will become dynamic dimensions. Return true if `dim` is a dynamic
/// dimension.
///
/// Example:
/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
///
/// If d1 is dynamic dimension, 2nd and 3rd dimension of map output are dynamic.
/// memref<4x?xf32, #map0>  ==>  memref<4x?x?xf32>
static bool
isNormalizedMemRefDynamicDim(unsigned dim, AffineMap layoutMap,
                             SmallVectorImpl<unsigned> &inMemrefTypeDynDims,
                             MLIRContext *context) {
  bool isDynamicDim = false;
  AffineExpr expr = layoutMap.getResults()[dim];
  // Check if affine expr of the dimension includes dynamic dimension of input
  // memrefType.
  expr.walk([&inMemrefTypeDynDims, &isDynamicDim, &context](AffineExpr e) {
    if (e.isa<AffineDimExpr>()) {
      for (unsigned dm : inMemrefTypeDynDims) {
        if (e == getAffineDimExpr(dm, context)) {
          isDynamicDim = true;
        }
      }
    }
  });
  return isDynamicDim;
}

/// Create affine expr to calculate dimension size for a tiled-layout map.
static AffineExpr createDimSizeExprForTiledLayout(AffineExpr oldMapOutput,
                                                  TileExprPattern pat) {
  // Create map output for the patterns.
  // "floordiv <tile size>" ==> "ceildiv <tile size>"
  // "mod <tile size>" ==> "<tile size>"
  AffineExpr newMapOutput;
  AffineBinaryOpExpr binaryExpr = nullptr;
  switch (pat) {
  case TileExprPattern::TileMod:
    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
    newMapOutput = binaryExpr.getRHS();
    break;
  case TileExprPattern::TileFloorDiv:
    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
    newMapOutput = getAffineBinaryOpExpr(
        AffineExprKind::CeilDiv, binaryExpr.getLHS(), binaryExpr.getRHS());
    break;
  default:
    newMapOutput = oldMapOutput;
  }
  return newMapOutput;
}
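
// For instance (illustrative only): for the layout
//   affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
// the dimension-size expressions of the normalized memref become
//   (d0, d1 ceildiv 32, 32).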

/// Create new maps to calculate each dimension size of `newMemRefType`, and
/// create `newDynamicSizes` from them by using AffineApplyOp.
///
/// Steps for normalizing dynamic memrefs for a tiled layout map
/// Example:
///   #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
///   %0 = dim %arg0, %c1 : memref<4x?xf32>
///   %1 = alloc(%0) : memref<4x?xf32, #map0>
///
/// (Before this function)
/// 1. Check if `map`(#map0) is a tiled layout using `getTileSizePos()`. Only a
///    single layout map is supported.
///
/// 2. Create normalized memrefType using `isNormalizedMemRefDynamicDim()`. It
///    is memref<4x?x?xf32> in the above example.
///
/// (In this function)
/// 3. Create new maps to calculate each dimension of the normalized memrefType
///    using `createDimSizeExprForTiledLayout()`. In the tiled layout, the
///    dimension size can be calculated by replacing "floordiv <tile size>" with
///    "ceildiv <tile size>" and "mod <tile size>" with "<tile size>".
///    - New map in the above example
///      #map0 = affine_map<(d0, d1) -> (d0)>
///      #map1 = affine_map<(d0, d1) -> (d1 ceildiv 32)>
///      #map2 = affine_map<(d0, d1) -> (32)>
///
/// 4. Create AffineApplyOp to apply the new maps. The output of AffineApplyOp
///    is used in dynamicSizes of new AllocOp.
///      %0 = dim %arg0, %c1 : memref<4x?xf32>
///      %c4 = arith.constant 4 : index
///      %1 = affine.apply #map1(%c4, %0)
///      %2 = affine.apply #map2(%c4, %0)
static void createNewDynamicSizes(MemRefType oldMemRefType,
                                  MemRefType newMemRefType, AffineMap map,
                                  memref::AllocOp *allocOp, OpBuilder b,
                                  SmallVectorImpl<Value> &newDynamicSizes) {
  // Create new input for AffineApplyOp.
  SmallVector<Value, 4> inAffineApply;
  ArrayRef<int64_t> oldMemRefShape = oldMemRefType.getShape();
  unsigned dynIdx = 0;
  for (unsigned d = 0; d < oldMemRefType.getRank(); ++d) {
    if (oldMemRefShape[d] < 0) {
      // Use dynamicSizes of allocOp for dynamic dimension.
      inAffineApply.emplace_back(allocOp->dynamicSizes()[dynIdx]);
      dynIdx++;
    } else {
      // Create ConstantOp for static dimension.
      Attribute constantAttr =
          b.getIntegerAttr(b.getIndexType(), oldMemRefShape[d]);
      inAffineApply.emplace_back(
          b.create<arith::ConstantOp>(allocOp->getLoc(), constantAttr));
    }
  }

  // Create new map to calculate each dimension size of new memref for each
  // original map output. Only for dynamic dimensions of `newMemRefType`.
  unsigned newDimIdx = 0;
  ArrayRef<int64_t> newMemRefShape = newMemRefType.getShape();
  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
  (void)getTileSizePos(map, tileSizePos);
  for (AffineExpr expr : map.getResults()) {
    if (newMemRefShape[newDimIdx] < 0) {
      // Create new maps to calculate each dimension size of new memref.
      enum TileExprPattern pat = TileExprPattern::TileNone;
      for (auto pos : tileSizePos) {
        if (newDimIdx == std::get<1>(pos))
          pat = TileExprPattern::TileFloorDiv;
        else if (newDimIdx == std::get<2>(pos))
          pat = TileExprPattern::TileMod;
      }
      AffineExpr newMapOutput = createDimSizeExprForTiledLayout(expr, pat);
      AffineMap newMap =
          AffineMap::get(map.getNumInputs(), map.getNumSymbols(), newMapOutput);
      Value affineApp =
          b.create<AffineApplyOp>(allocOp->getLoc(), newMap, inAffineApply);
      newDynamicSizes.emplace_back(affineApp);
    }
    newDimIdx++;
  }
}

// TODO: Currently works for static memrefs with a single layout map.
LogicalResult mlir::normalizeMemRef(memref::AllocOp *allocOp) {
  MemRefType memrefType = allocOp->getType();
  OpBuilder b(*allocOp);

  // Fetch a new memref type after normalizing the old memref to have an
  // identity map layout.
  MemRefType newMemRefType =
      normalizeMemRefType(memrefType, b, allocOp->symbolOperands().size());
  if (newMemRefType == memrefType)
    // Either memrefType already had an identity map or the map couldn't be
    // transformed to an identity map.
    return failure();

  Value oldMemRef = allocOp->getResult();

  SmallVector<Value, 4> symbolOperands(allocOp->symbolOperands());
  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
  memref::AllocOp newAlloc;
  // Check if `layoutMap` is a tiled layout. Only a single layout map is
  // supported for normalizing dynamic memrefs.
  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
  (void)getTileSizePos(layoutMap, tileSizePos);
  if (newMemRefType.getNumDynamicDims() > 0 && !tileSizePos.empty()) {
    MemRefType oldMemRefType = oldMemRef.getType().cast<MemRefType>();
    SmallVector<Value, 4> newDynamicSizes;
    createNewDynamicSizes(oldMemRefType, newMemRefType, layoutMap, allocOp, b,
                          newDynamicSizes);
    // Add the new dynamic sizes in new AllocOp.
    newAlloc =
        b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
                                  newDynamicSizes, allocOp->alignmentAttr());
  } else {
    newAlloc = b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
                                         allocOp->alignmentAttr());
  }
  // Replace all uses of the old memref.
  if (failed(replaceAllMemRefUsesWith(oldMemRef, /*newMemRef=*/newAlloc,
                                      /*extraIndices=*/{},
                                      /*indexRemap=*/layoutMap,
                                      /*extraOperands=*/{},
                                      /*symbolOperands=*/symbolOperands,
                                      /*domOpFilter=*/nullptr,
                                      /*postDomOpFilter=*/nullptr,
                                      /*allowNonDereferencingOps=*/true))) {
    // If it failed (due to escapes for example), bail out.
    newAlloc.erase();
    return failure();
  }
  // Replace any uses of the original alloc op and erase it. All remaining uses
  // have to be dealloc's; RAMUW above would've failed otherwise.
  assert(llvm::all_of(oldMemRef.getUsers(), [](Operation *op) {
    return isa<memref::DeallocOp>(op);
  }));
  oldMemRef.replaceAllUsesWith(newAlloc);
  allocOp->erase();
  return success();
}

MemRefType mlir::normalizeMemRefType(MemRefType memrefType, OpBuilder b,
                                     unsigned numSymbolicOperands) {
  unsigned rank = memrefType.getRank();
  if (rank == 0)
    return memrefType;

  if (memrefType.getLayout().isIdentity()) {
    // Either no map is associated with this memref or this memref has
    // a trivial (identity) map.
    return memrefType;
  }
  AffineMap layoutMap = memrefType.getLayout().getAffineMap();

  // We don't do any checks for one-to-one'ness; we assume that it is
  // one-to-one.

  // Normalize only static memrefs and dynamic memrefs with a tiled-layout map
  // for now.
  // TODO: Normalize the other types of dynamic memrefs.
  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
  (void)getTileSizePos(layoutMap, tileSizePos);
  if (memrefType.getNumDynamicDims() > 0 && tileSizePos.empty())
    return memrefType;

  // We have a single map that is not an identity map. Create a new memref
  // with the right shape and an identity layout map.
  ArrayRef<int64_t> shape = memrefType.getShape();
  // FlatAffineConstraint may later on use symbolicOperands.
  FlatAffineConstraints fac(rank, numSymbolicOperands);
  SmallVector<unsigned, 4> memrefTypeDynDims;
  for (unsigned d = 0; d < rank; ++d) {
    // Use constraint system only in static dimensions.
    if (shape[d] > 0) {
      fac.addBound(FlatAffineConstraints::LB, d, 0);
      fac.addBound(FlatAffineConstraints::UB, d, shape[d] - 1);
    } else {
      memrefTypeDynDims.emplace_back(d);
    }
  }
  // We compose this map with the original index (logical) space to derive
  // the upper bounds for the new index space.
  unsigned newRank = layoutMap.getNumResults();
  if (failed(fac.composeMatchingMap(layoutMap)))
    return memrefType;
  // TODO: Handle semi-affine maps.
  // Project out the old data dimensions.
  fac.projectOut(newRank, fac.getNumIds() - newRank - fac.getNumLocalIds());
  SmallVector<int64_t, 4> newShape(newRank);
  for (unsigned d = 0; d < newRank; ++d) {
    // Check if each dimension of normalized memrefType is dynamic.
    bool isDynDim = isNormalizedMemRefDynamicDim(
        d, layoutMap, memrefTypeDynDims, b.getContext());
    if (isDynDim) {
      newShape[d] = -1;
    } else {
      // The lower bound for the shape is always zero.
      auto ubConst = fac.getConstantBound(FlatAffineConstraints::UB, d);
      // For a static memref and an affine map with no symbols, this is
      // always bounded.
      assert(ubConst.hasValue() && "should always have an upper bound");
      if (ubConst.getValue() < 0)
        // This is due to an invalid map that maps to a negative space.
        return memrefType;
      // If dimension of new memrefType is dynamic, the value is -1.
      newShape[d] = ubConst.getValue() + 1;
    }
  }

  // Create the new memref type after trivializing the old layout map.
  MemRefType newMemRefType =
      MemRefType::Builder(memrefType)
          .setShape(newShape)
          .setLayout(AffineMapAttr::get(b.getMultiDimIdentityMap(newRank)));

  return newMemRefType;
}
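
// Worked example (illustrative, not from this file): with
//   #tiled = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
// normalizeMemRefType turns memref<64x128xf32, #tiled> into
// memref<64x4x32xf32> with the identity layout, since d1 in [0, 127] yields
// d1 floordiv 32 in [0, 3] and d1 mod 32 in [0, 31].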

@@ -3,6 +3,8 @@ add_mlir_library(MLIRAffineTransformsTestPasses
  TestAffineDataCopy.cpp
  TestAffineLoopUnswitching.cpp
  TestAffineLoopParametricTiling.cpp
  TestLoopFusion.cpp
  TestLoopMapping.cpp
  TestLoopPermutation.cpp
  TestVectorizationUtils.cpp

@@ -13,10 +13,10 @@

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"

#define PASS_NAME "test-affine-data-copy"

@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Transforms/LoopUtils.h"

using namespace mlir;

@@ -12,11 +12,10 @@

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopFusionUtils.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopFusionUtils.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"

#define DEBUG_TYPE "test-loop-fusion"

@@ -12,11 +12,10 @@
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"

#include "llvm/ADT/SetVector.h"

@@ -12,9 +12,8 @@

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"

#define PASS_NAME "test-loop-permutation"

@@ -14,6 +14,7 @@
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
@@ -21,7 +22,6 @@
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"

#include "llvm/ADT/STLExtras.h"

@@ -1,5 +1,7 @@
# Exclude tests from libMLIR.so
add_mlir_library(MLIRSCFTestPasses
  TestLoopParametricTiling.cpp
  TestLoopUnrolling.cpp
  TestSCFUtils.cpp

  EXCLUDE_FROM_LIBMLIR

@@ -11,10 +11,9 @@
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;

@@ -31,8 +30,7 @@ public:
  }
  StringRef getDescription() const final {
    return "test application of parametric tiling to the outer loops so that "
           "the "
           "ranges of outer loops become static";
           "the ranges of outer loops become static";
  }
  SimpleParametricLoopTilingPass() = default;
  SimpleParametricLoopTilingPass(const SimpleParametricLoopTilingPass &) {}
@@ -12,11 +12,10 @@

#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;

@@ -2,10 +2,6 @@
add_mlir_library(MLIRTestTransforms
  TestConstantFold.cpp
  TestInlining.cpp
  TestLoopFusion.cpp
  TestLoopMapping.cpp
  TestLoopParametricTiling.cpp
  TestLoopUnrolling.cpp

  EXCLUDE_FROM_LIBMLIR

@@ -9,7 +9,6 @@
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/FoldUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"

using namespace mlir;

@@ -4,4 +4,5 @@ add_mlir_unittest(MLIRTransformsTests
)
target_link_libraries(MLIRTransformsTests
  PRIVATE
  MLIRParser
  MLIRTransforms)