[mlir:Transforms] Move out the remaining non-dialect independent transforms and utilities

This has been a major TODO for a very long time, and is necessary for establishing a proper
dialect-free dependency layering for the Transforms library. Code was moved to effectively
two main locations:

* Affine/
There was quite a bit of affine-dialect-related code in Transforms/ due to historical
reasons (dating back to MLIR's early days). The following headers were moved:
Transforms/LoopFusionUtils.h -> Dialect/Affine/LoopFusionUtils.h
Transforms/LoopUtils.h -> Dialect/Affine/LoopUtils.h
Transforms/Utils.h -> Dialect/Affine/Utils.h

The following transforms were also moved:
AffineLoopFusion, AffinePipelineDataTransfer, LoopCoalescing

* SCF/
Only one SCF pass was in Transforms/ (likely placed there accidentally): ParallelLoopCollapsing
The SCF-specific utilities in LoopUtils have been moved to SCF/Utils.h

* Misc:
mlir::moveLoopInvariantCode was also moved to LoopLikeInterface.h given
that it is a simple utility defined in terms of LoopLikeOpInterface.
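
For downstream code, the change is an include-path migration; a minimal before/after
sketch using the header mapping above:

```c++
// Before this commit (old Transforms/ locations):
//   #include "mlir/Transforms/LoopFusionUtils.h"
//   #include "mlir/Transforms/LoopUtils.h"
//   #include "mlir/Transforms/Utils.h"
// After this commit (new Dialect/Affine/ locations):
#include "mlir/Dialect/Affine/LoopFusionUtils.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Utils.h"
```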

Differential Revision: https://reviews.llvm.org/D117848
River Riddle
2022-01-20 17:32:31 -08:00
parent 2e2c0738e8
commit a70aa7bb0d
70 changed files with 2129 additions and 2081 deletions

View File

@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
#define MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
#ifndef MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
#define MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
#include "mlir/IR/Value.h"
#include "mlir/Support/LLVM.h"
@@ -167,4 +167,4 @@ void gatherProducerConsumerMemrefs(ArrayRef<Operation *> srcOps,
DenseSet<Value> &producerConsumerMemrefs);
} // namespace mlir
#endif // MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
#endif // MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
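
As a usage sketch of the relocated header: the hunk elides the middle of the
signature, so the `dstOps` parameter below is a hypothetical name for
illustration only.

```c++
#include "mlir/Dialect/Affine/LoopFusionUtils.h"

using namespace mlir;

// Hedged sketch: gather the memrefs involved in producer-consumer
// dependences between two lists of ops. `dstOps` is a hypothetical
// name standing in for the parameter(s) elided in the hunk above.
void collectSharedMemrefs(ArrayRef<Operation *> srcOps,
                          ArrayRef<Operation *> dstOps,
                          DenseSet<Value> &producerConsumerMemrefs) {
  gatherProducerConsumerMemrefs(srcOps, dstOps, producerConsumerMemrefs);
}
```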

View File

@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_TRANSFORMS_LOOPUTILS_H
#define MLIR_TRANSFORMS_LOOPUTILS_H
#ifndef MLIR_DIALECT_AFFINE_LOOPUTILS_H
#define MLIR_DIALECT_AFFINE_LOOPUTILS_H
#include "mlir/IR/Block.h"
#include "mlir/Support/LLVM.h"
@@ -45,9 +45,6 @@ LogicalResult loopUnrollFull(AffineForOp forOp);
LogicalResult loopUnrollByFactor(
AffineForOp forOp, uint64_t unrollFactor,
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr);
LogicalResult loopUnrollByFactor(
scf::ForOp forOp, uint64_t unrollFactor,
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr);
/// Unrolls this loop by the specified unroll factor or its trip count,
/// whichever is lower.
@@ -63,8 +60,6 @@ bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef<AffineForOp> loops);
/// AffineForOp, and the second op is a terminator).
void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
AffineForOp root);
void getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
scf::ForOp root);
/// Unrolls and jams this loop by the specified factor. `forOp` can be a loop
/// with iteration arguments performing supported reductions and its inner loops
@@ -78,10 +73,9 @@ LogicalResult loopUnrollJamByFactor(AffineForOp forOp,
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
uint64_t unrollJamFactor);
/// Promotes the loop body of an AffineForOp/scf::ForOp to its containing block
/// if the loop was known to have a single iteration.
/// Promotes the loop body of an AffineForOp to its containing block if the loop
/// was known to have a single iteration.
LogicalResult promoteIfSingleIteration(AffineForOp forOp);
LogicalResult promoteIfSingleIteration(scf::ForOp forOp);
/// Promotes all single iteration AffineForOp's in the Function, i.e., moves
/// their body into the containing Block.
@@ -146,13 +140,9 @@ AffineForOp sinkSequentialLoops(AffineForOp forOp);
/// occurrence in `forOps`, under each of the `targets`.
/// Returns the new AffineForOps, one per each of (`forOps`, `targets`) pair,
/// nested immediately under each of `targets`.
using Loops = SmallVector<scf::ForOp, 8>;
using TileLoops = std::pair<Loops, Loops>;
SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
ArrayRef<uint64_t> sizes,
ArrayRef<AffineForOp> targets);
SmallVector<Loops, 8> tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
ArrayRef<scf::ForOp> targets);
/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
/// and sinking them, in their order of occurrence in `forOps`, under `target`.
@@ -160,15 +150,6 @@ SmallVector<Loops, 8> tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
/// `target`.
SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
ArrayRef<uint64_t> sizes, AffineForOp target);
Loops tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
scf::ForOp target);
/// Tile a nest of scf::ForOp loops rooted at `rootForOp` with the given
/// (parametric) sizes. Sizes are expected to be strictly positive values at
/// runtime. If more sizes than loops are provided, discard the trailing values
/// in sizes. Assumes the loop nest is permutable.
/// Returns the newly created intra-tile loops.
Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes);
/// Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
struct AffineCopyOptions {
@@ -236,16 +217,6 @@ LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
const AffineCopyOptions &copyOptions,
CopyGenerateResult &result);
/// Tile a nest of standard for loops rooted at `rootForOp` by finding
/// parametric tile sizes such that the outer loops have a fixed number of
/// iterations as defined in `sizes`.
TileLoops extractFixedOuterLoops(scf::ForOp rootForOp, ArrayRef<int64_t> sizes);
/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
/// `loops` contains a list of perfectly nested loops with bounds and steps
/// independent of any loop induction variable involved in the nest.
void coalesceLoops(MutableArrayRef<scf::ForOp> loops);
/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
/// `loops` contains a list of perfectly nested loops outermost to innermost
/// that are normalized (step one and lower bound of zero) and with bounds and
@@ -254,12 +225,6 @@ void coalesceLoops(MutableArrayRef<scf::ForOp> loops);
/// be representable using affine.for.
LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);
/// Take the ParallelLoop and for each set of dimension indices, combine them
/// into a single dimension. combinedDimensions must contain each index into
/// loops exactly once.
void collapseParallelLoops(scf::ParallelOp loops,
ArrayRef<std::vector<unsigned>> combinedDimensions);
/// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
/// size given by `numProcessors`. This is achieved by embedding the SSA values
/// corresponding to `processorIds` and `numProcessors` into the bounds and step
@@ -321,9 +286,6 @@ LogicalResult
separateFullTiles(MutableArrayRef<AffineForOp> nest,
SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);
/// Move loop invariant code out of `looplike`.
LogicalResult moveLoopInvariantCode(LoopLikeOpInterface looplike);
} // namespace mlir
#endif // MLIR_TRANSFORMS_LOOPUTILS_H
#endif // MLIR_DIALECT_AFFINE_LOOPUTILS_H
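
A minimal sketch of driving the relocated affine loop utilities from C++ (the
unroll factor and the innermost-loop heuristic are illustrative assumptions):

```c++
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"

using namespace mlir;

// Hedged sketch: unroll every innermost affine.for in `func` by 4.
// Loops are collected first so the walk does not observe IR that
// loopUnrollByFactor is mutating.
void unrollInnermostAffineLoops(FuncOp func) {
  SmallVector<AffineForOp> innermost;
  func.walk([&](AffineForOp forOp) {
    if (forOp.getBody()->getOps<AffineForOp>().empty())
      innermost.push_back(forOp);
  });
  for (AffineForOp forOp : innermost)
    (void)loopUnrollByFactor(forOp, /*unrollFactor=*/4);
}
```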

View File

@@ -21,6 +21,10 @@ namespace mlir {
class AffineForOp;
/// Fusion mode to attempt. The default mode `Greedy` does both
/// producer-consumer and sibling fusion.
enum FusionMode { Greedy, ProducerConsumer, Sibling };
/// Creates a simplification pass for affine structures (maps and sets). In
/// addition, this pass also normalizes memrefs to have the trivial (identity)
/// layout map.
@@ -53,6 +57,19 @@ std::unique_ptr<OperationPass<FuncOp>> createAffineDataCopyGenerationPass();
/// dead allocs.
std::unique_ptr<OperationPass<FuncOp>> createAffineScalarReplacementPass();
/// Creates a pass that transforms perfectly nested loops with independent
/// bounds into a single loop.
std::unique_ptr<OperationPass<FuncOp>> createLoopCoalescingPass();
/// Creates a loop fusion pass which fuses loops according to the type of fusion
/// specified in `fusionMode`. Buffers of size less than or equal to
/// `localBufSizeThreshold` are promoted to memory space `fastMemorySpace`.
std::unique_ptr<OperationPass<FuncOp>>
createLoopFusionPass(unsigned fastMemorySpace = 0,
uint64_t localBufSizeThreshold = 0,
bool maximalFusion = false,
enum FusionMode fusionMode = FusionMode::Greedy);
/// Creates a pass to perform tiling on loop nests.
std::unique_ptr<OperationPass<FuncOp>>
createLoopTilingPass(uint64_t cacheSizeBytes);
@@ -76,6 +93,10 @@ std::unique_ptr<OperationPass<FuncOp>> createLoopUnrollPass(
std::unique_ptr<OperationPass<FuncOp>>
createLoopUnrollAndJamPass(int unrollJamFactor = -1);
/// Creates a pass to pipeline explicit movement of data across levels of the
/// memory hierarchy.
std::unique_ptr<OperationPass<FuncOp>> createPipelineDataTransferPass();
/// Creates a pass to vectorize loops, operations and data types using a
/// target-independent, n-D super-vector abstraction.
std::unique_ptr<OperationPass<FuncOp>>
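
The moved pass constructors are now exposed from Affine/Passes.h; a hedged
sketch of wiring them into a pipeline (the composition itself is illustrative):

```c++
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"

using namespace mlir;

// Hedged sketch: these OperationPass<FuncOp> passes are nested under the
// function op rather than added at the module level.
void addMovedAffinePasses(PassManager &pm) {
  pm.addNestedPass<FuncOp>(createLoopFusionPass());
  pm.addNestedPass<FuncOp>(createLoopCoalescingPass());
  pm.addNestedPass<FuncOp>(createPipelineDataTransferPass());
}
```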

View File

@@ -43,6 +43,138 @@ def AffineDataCopyGeneration : Pass<"affine-data-copy-generate", "FuncOp"> {
];
}
def AffineLoopFusion : Pass<"affine-loop-fusion", "FuncOp"> {
let summary = "Fuse affine loop nests";
let description = [{
This pass performs fusion of loop nests using a slicing-based approach. It
combines two fusion strategies: producer-consumer fusion and sibling fusion.
Producer-consumer fusion is aimed at fusing pairs of loops where the first
one writes to a memref that the second reads. Sibling fusion targets pairs
of loops that share no dependences between them but that load from the same
memref. The fused loop nests, when possible, are rewritten to access
significantly smaller local buffers instead of the original memrefs, and
the latter are often either completely optimized away or contracted. This
transformation leads to enhanced locality and lower memory footprint through
the elimination or contraction of temporaries/intermediate memrefs. These
benefits are sometimes achieved at the expense of redundant computation;
a cost model evaluates the available choices, such as the depth at which a
source slice should be materialized in the destination slice.
Example 1: Producer-consumer fusion.
Input:
```mlir
func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
%0 = memref.alloc() : memref<10xf32>
%1 = memref.alloc() : memref<10xf32>
%cst = arith.constant 0.000000e+00 : f32
affine.for %arg2 = 0 to 10 {
affine.store %cst, %0[%arg2] : memref<10xf32>
affine.store %cst, %1[%arg2] : memref<10xf32>
}
affine.for %arg2 = 0 to 10 {
%2 = affine.load %0[%arg2] : memref<10xf32>
%3 = arith.addf %2, %2 : f32
affine.store %3, %arg0[%arg2] : memref<10xf32>
}
affine.for %arg2 = 0 to 10 {
%2 = affine.load %1[%arg2] : memref<10xf32>
%3 = arith.mulf %2, %2 : f32
affine.store %3, %arg1[%arg2] : memref<10xf32>
}
return
}
```
Output:
```mlir
func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
%0 = memref.alloc() : memref<1xf32>
%1 = memref.alloc() : memref<1xf32>
%cst = arith.constant 0.000000e+00 : f32
affine.for %arg2 = 0 to 10 {
affine.store %cst, %0[0] : memref<1xf32>
affine.store %cst, %1[0] : memref<1xf32>
%2 = affine.load %1[0] : memref<1xf32>
%3 = arith.mulf %2, %2 : f32
affine.store %3, %arg1[%arg2] : memref<10xf32>
%4 = affine.load %0[0] : memref<1xf32>
%5 = arith.addf %4, %4 : f32
affine.store %5, %arg0[%arg2] : memref<10xf32>
}
return
}
```
Example 2: Sibling fusion.
Input:
```mlir
func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
%arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
%arg4: memref<10x10xf32>) {
affine.for %arg5 = 0 to 3 {
affine.for %arg6 = 0 to 3 {
%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
%1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
%2 = arith.mulf %0, %1 : f32
affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
}
}
affine.for %arg5 = 0 to 3 {
affine.for %arg6 = 0 to 3 {
%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
%1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
%2 = arith.addf %0, %1 : f32
affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32>
}
}
return
}
```
Output:
```mlir
func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
%arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
%arg4: memref<10x10xf32>) {
affine.for %arg5 = 0 to 3 {
affine.for %arg6 = 0 to 3 {
%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
%1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
%2 = arith.mulf %0, %1 : f32
affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
%3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
%4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
%5 = arith.addf %3, %4 : f32
affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32>
}
}
return
}
```
}];
let constructor = "mlir::createLoopFusionPass()";
let options = [
Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",
/*default=*/"0.30f", "Fractional increase in additional computation "
"tolerated while fusing">,
Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned",
/*default=*/"0",
"Faster memory space number to promote fusion buffers to">,
Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t",
/*default=*/"0", "Threshold size (KiB) for promoting local buffers "
"to fast memory space">,
Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
"Enables maximal loop fusion">,
Option<"affineFusionMode", "mode", "enum FusionMode",
"mlir::FusionMode::Greedy", "fusion mode to attempt",
"llvm::cl::values(clEnumValN(mlir::FusionMode::Greedy,"
" \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), "
"clEnumValN( mlir::FusionMode::ProducerConsumer, "
"\"producer\", \"Perform only producer-consumer fusion\"), "
"clEnumValN( mlir::FusionMode::Sibling, "
"\"sibling\", \"Perform only sibling fusion\"))">,
];
let dependentDialects = ["memref::MemRefDialect"];
}
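
The options above correspond to the createLoopFusionPass parameters declared in
Affine/Passes.h; a hedged configuration sketch (all values illustrative):

```c++
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/Pass.h"

using namespace mlir;

// Hedged sketch: restrict fusion to producer-consumer pairs and allow
// promotion of small local buffers into memory space 1.
std::unique_ptr<OperationPass<FuncOp>> makeFusionPass() {
  return createLoopFusionPass(/*fastMemorySpace=*/1,
                              /*localBufSizeThreshold=*/1024,
                              /*maximalFusion=*/false,
                              FusionMode::ProducerConsumer);
}
```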
def AffineLoopInvariantCodeMotion
: Pass<"affine-loop-invariant-code-motion", "FuncOp"> {
let summary = "Hoist loop invariant instructions outside of affine loops";
@@ -94,6 +226,75 @@ def AffineLoopUnrollAndJam : Pass<"affine-loop-unroll-jam", "FuncOp"> {
];
}
def AffinePipelineDataTransfer
: Pass<"affine-pipeline-data-transfer", "FuncOp"> {
let summary = "Pipeline non-blocking data transfers between explicitly "
"managed levels of the memory hierarchy";
let description = [{
This pass performs a transformation to overlap non-blocking DMA operations
in a loop with computations through double buffering. This is achieved by
advancing dma_start operations with respect to other operations.
Input
```mlir
func @pipelinedatatransfer() {
%0 = memref.alloc() : memref<256xf32>
%1 = memref.alloc() : memref<32xf32, 1>
%2 = memref.alloc() : memref<1xf32>
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
affine.for %i0 = 0 to 8 {
affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32>
affine.dma_wait %2[%c0], %c128 : memref<1xf32>
%3 = affine.load %1[%i0] : memref<32xf32, 1>
%4 = "compute"(%3) : (f32) -> f32
affine.store %4, %1[%i0] : memref<32xf32, 1>
}
return
}
```
Output
```mlir
module {
func @pipelinedatatransfer() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%0 = memref.alloc() : memref<256xf32>
%c0_0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%1 = memref.alloc() : memref<2x32xf32, 1>
%2 = memref.alloc() : memref<2x1xf32>
affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
affine.for %arg0 = 1 to 8 {
affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
%8 = affine.apply #map3(%arg0)
%9 = affine.apply #map4(%8)
%10 = affine.apply #map4(%8)
affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
%11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1>
%12 = "compute"(%11) : (f32) -> f32
affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1>
}
%3 = affine.apply #map3(%c8)
%4 = affine.apply #map4(%3)
%5 = affine.apply #map4(%3)
affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
%6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1>
%7 = "compute"(%6) : (f32) -> f32
affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1>
memref.dealloc %2 : memref<2x1xf32>
memref.dealloc %1 : memref<2x32xf32, 1>
return
}
}
```
}];
let constructor = "mlir::createPipelineDataTransferPass()";
}
def AffineScalarReplacement : Pass<"affine-scalrep", "FuncOp"> {
let summary = "Replace affine memref acceses by scalars by forwarding stores "
"to loads and eliminating redundant loads";
@@ -184,6 +385,13 @@ def AffineLoopNormalize : Pass<"affine-loop-normalize", "FuncOp"> {
let constructor = "mlir::createAffineLoopNormalizePass()";
}
def LoopCoalescing : Pass<"loop-coalescing", "FuncOp"> {
let summary = "Coalesce nested loops with independent bounds into a single "
"loop";
let constructor = "mlir::createLoopCoalescingPass()";
let dependentDialects = ["arith::ArithmeticDialect"];
}
def SimplifyAffineStructures : Pass<"simplify-affine-structures", "FuncOp"> {
let summary = "Simplify affine expressions in maps/sets and normalize "
"memrefs";

View File

@@ -24,6 +24,10 @@ class DominanceInfo;
class Operation;
class PostDominanceInfo;
namespace memref {
class AllocOp;
} // namespace memref
struct LogicalResult;
using ReductionLoopMap = DenseMap<Operation *, SmallVector<LoopReduction, 2>>;
@@ -168,6 +172,121 @@ void normalizeAffineFor(AffineForOp op);
AffineExpr substWithMin(AffineExpr e, AffineExpr dim, AffineExpr min,
AffineExpr max, bool positivePath = true);
/// Replaces all "dereferencing" uses of `oldMemRef` with `newMemRef` while
/// optionally remapping the old memref's indices using the supplied affine map,
/// `indexRemap`. The new memref could be of a different shape or rank.
/// `extraIndices` provides any additional access indices to be added to the
/// start.
///
/// `indexRemap` remaps indices of the old memref access to a new set of indices
/// that are used to index the memref. Additional input operands to indexRemap
/// can be optionally provided in `extraOperands`, and they occupy the start
/// of its input list. `indexRemap`'s dimensional inputs are expected to
/// correspond to memref's indices, and its symbolic inputs if any should be
/// provided in `symbolOperands`.
///
/// `domOpFilter`, if non-null, restricts the replacement to only those
/// operations that are dominated by the former; similarly, `postDomOpFilter`
/// restricts replacement to only those operations that are postdominated by it.
///
/// 'allowNonDereferencingOps', if set, allows replacement of non-dereferencing
/// uses of a memref without any requirement for access index rewrites as long
/// as the user operation has the MemRefsNormalizable trait. The default value
/// of this flag is false.
///
/// 'replaceInDeallocOp', if set, allows DeallocOp, a non-dereferencing user,
/// to also be a candidate for replacement. The default value of this flag is
/// false.
///
/// Returns true on success and false if the replacement is not possible,
/// whenever a memref is used as an operand in a non-dereferencing context and
/// 'allowNonDereferencingOps' is false, except for dealloc's on the memref
/// which are left untouched. See comments at function definition for an
/// example.
//
// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
// index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap = (d0,
// d1, d2) -> (d0 - d1, d2), and %ii will be the extra operand. Without any
// extra operands, note that 'indexRemap' would just be applied to existing
// indices (%i, %j).
// TODO: allow extraIndices to be added at any position.
LogicalResult replaceAllMemRefUsesWith(
Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices = {},
AffineMap indexRemap = AffineMap(), ArrayRef<Value> extraOperands = {},
ArrayRef<Value> symbolOperands = {}, Operation *domOpFilter = nullptr,
Operation *postDomOpFilter = nullptr, bool allowNonDereferencingOps = false,
bool replaceInDeallocOp = false);
/// Performs the same replacement as the other version above but only for the
/// dereferencing uses of `oldMemRef` in `op`, except in cases where
/// 'allowNonDereferencingOps' is set to true where we replace the
/// non-dereferencing uses as well.
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
Operation *op,
ArrayRef<Value> extraIndices = {},
AffineMap indexRemap = AffineMap(),
ArrayRef<Value> extraOperands = {},
ArrayRef<Value> symbolOperands = {},
bool allowNonDereferencingOps = false);
/// Rewrites the memref defined by this alloc op to have an identity layout map
/// and updates all its indexing uses. Returns failure if any of its uses
/// escape (while leaving the IR in a valid state).
LogicalResult normalizeMemRef(memref::AllocOp *op);
/// Uses the old memref type map layout and computes the new memref type to have
/// a new shape and a layout map, where the old layout map has been normalized
/// to an identity layout map. It returns the old memref in case no
/// normalization was needed or a failure occurs while transforming the old map
/// layout to an identity layout map.
MemRefType normalizeMemRefType(MemRefType memrefType, OpBuilder builder,
unsigned numSymbolicOperands);
/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
/// its results equal to the number of operands, as a composition
/// of all other AffineApplyOps reachable from input parameter 'operands'. If
/// different operands were drawing results from multiple affine apply ops,
/// these will also be collected into a single (multi-result) affine apply op.
/// The final results of the composed AffineApplyOp are returned in output
/// parameter 'results'. Returns the affine apply op created.
Operation *createComposedAffineApplyOp(OpBuilder &builder, Location loc,
ArrayRef<Value> operands,
ArrayRef<Operation *> affineApplyOps,
SmallVectorImpl<Value> *results);
/// Given an operation, inserts one or more single result affine apply
/// operations, results of which are exclusively used by this operation.
/// The operands of these newly created affine apply ops are
/// guaranteed to be loop iterators or terminal symbols of a function.
///
/// Before
///
/// affine.for %i = 0 to #map(%N)
/// %idx = affine.apply (d0) -> (d0 mod 2) (%i)
/// send %A[%idx], ...
/// %v = "compute"(%idx, ...)
///
/// After
///
/// affine.for %i = 0 to #map(%N)
/// %idx = affine.apply (d0) -> (d0 mod 2) (%i)
/// send %A[%idx], ...
/// %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
/// %v = "compute"(%idx_, ...)
/// This allows the application of different transformations on send and
/// compute (e.g., different shifts/delays).
///
/// Fills `sliceOps` with the list of affine.apply operations.
/// In the following cases, `sliceOps` remains empty:
/// 1. If none of opInst's operands were the result of an affine.apply
/// (i.e., there was no affine computation slice to create).
/// 2. If all the affine.apply op's supplying operands to this opInst did not
/// have any uses other than those in this opInst.
void createAffineComputationSlice(Operation *opInst,
SmallVectorImpl<AffineApplyOp> *sliceOps);
} // namespace mlir
#endif // MLIR_DIALECT_AFFINE_UTILS_H
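
The "Ex:" in the comment above translates into a small driver; a hedged sketch
where `oldMemRef`, `newMemRef`, `tMod2` (the SSA value for %t mod 2), and `ii`
are assumed to be supplied by the caller:

```c++
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"

using namespace mlir;

// Hedged sketch of the doc-comment example: rewrite load %A[%i, %j]
// into load %Abuf[%t mod 2, %ii - %i, %j].
LogicalResult replaceWithDoubleBuffer(Value oldMemRef, Value newMemRef,
                                      Value tMod2, Value ii,
                                      MLIRContext *ctx) {
  // indexRemap = (d0, d1, d2) -> (d0 - d1, d2): d0 binds the extra
  // operand %ii, and (d1, d2) bind the original indices (%i, %j).
  AffineExpr d0, d1, d2;
  bindDims(ctx, d0, d1, d2);
  AffineMap indexRemap = AffineMap::get(3, 0, {d0 - d1, d2}, ctx);
  return replaceAllMemRefUsesWith(oldMemRef, newMemRef,
                                  /*extraIndices=*/{tMod2}, indexRemap,
                                  /*extraOperands=*/{ii});
}
```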

View File

@@ -32,6 +32,10 @@ std::unique_ptr<Pass> createForLoopPeelingPass();
/// inside of scf.for loops with known lower and upper bounds.
std::unique_ptr<Pass> createSCFForLoopCanonicalizationPass();
/// Creates a pass that transforms a single ParallelLoop over N induction
/// variables into another ParallelLoop over less than N induction variables.
std::unique_ptr<Pass> createParallelLoopCollapsingPass();
/// Creates a loop fusion pass which fuses parallel loops.
std::unique_ptr<Pass> createParallelLoopFusionPass();

View File

@@ -52,6 +52,22 @@ def SCFParallelLoopFusion : Pass<"parallel-loop-fusion"> {
let constructor = "mlir::createParallelLoopFusionPass()";
}
def SCFParallelLoopCollapsing : Pass<"parallel-loop-collapsing"> {
let summary = "Collapse parallel loops to use less induction variables";
let constructor = "mlir::createParallelLoopCollapsingPass()";
let options = [
ListOption<"clCollapsedIndices0", "collapsed-indices-0", "unsigned",
"Which loop indices to combine 0th loop index",
"llvm::cl::MiscFlags::CommaSeparated">,
ListOption<"clCollapsedIndices1", "collapsed-indices-1", "unsigned",
"Which loop indices to combine into the position 1 loop index",
"llvm::cl::MiscFlags::CommaSeparated">,
ListOption<"clCollapsedIndices2", "collapsed-indices-2", "unsigned",
"Which loop indices to combine into the position 2 loop index",
"llvm::cl::MiscFlags::CommaSeparated">,
];
}
def SCFParallelLoopSpecialization
: Pass<"parallel-loop-specialization", "FuncOp"> {
let summary = "Specialize parallel loops for vectorization";

View File

@@ -98,5 +98,65 @@ getSCFMinMaxExpr(Value value, SmallVectorImpl<Value> &dims,
SmallVectorImpl<Value> &symbols,
llvm::function_ref<bool(Operation *)> loopFilter = nullptr);
/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
/// `loops` contains a list of perfectly nested loops with bounds and steps
/// independent of any loop induction variable involved in the nest.
void coalesceLoops(MutableArrayRef<scf::ForOp> loops);
/// Take the ParallelLoop and for each set of dimension indices, combine them
/// into a single dimension. combinedDimensions must contain each index into
/// loops exactly once.
void collapseParallelLoops(scf::ParallelOp loops,
ArrayRef<std::vector<unsigned>> combinedDimensions);
/// Promotes the loop body of a scf::ForOp to its containing block if the loop
/// was known to have a single iteration.
LogicalResult promoteIfSingleIteration(scf::ForOp forOp);
/// Unrolls this for operation by the specified unroll factor. Returns failure
/// if the loop cannot be unrolled either due to restrictions or due to invalid
/// unroll factors. Requires positive loop bounds and step. If specified,
/// annotates the Ops in each unrolled iteration by applying `annotateFn`.
LogicalResult loopUnrollByFactor(
scf::ForOp forOp, uint64_t unrollFactor,
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr);
/// Tile a nest of standard for loops rooted at `rootForOp` by finding
/// parametric tile sizes such that the outer loops have a fixed number of
/// iterations as defined in `sizes`.
using Loops = SmallVector<scf::ForOp, 8>;
using TileLoops = std::pair<Loops, Loops>;
TileLoops extractFixedOuterLoops(scf::ForOp rootForOp, ArrayRef<int64_t> sizes);
/// Performs tiling of imperfectly nested loops (with interchange) by
/// strip-mining the `forOps` by `sizes` and sinking them, in their order of
/// occurrence in `forOps`, under each of the `targets`.
/// Returns the new scf::ForOps, one per each of (`forOps`, `targets`) pair,
/// nested immediately under each of `targets`.
SmallVector<Loops, 8> tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
ArrayRef<scf::ForOp> targets);
/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
/// and sinking them, in their order of occurrence in `forOps`, under `target`.
/// Returns the new scf::ForOps, one per `forOps`, nested immediately under
/// `target`.
Loops tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
scf::ForOp target);
/// Tile a nest of scf::ForOp loops rooted at `rootForOp` with the given
/// (parametric) sizes. Sizes are expected to be strictly positive values at
/// runtime. If more sizes than loops are provided, discard the trailing values
/// in sizes. Assumes the loop nest is permutable.
/// Returns the newly created intra-tile loops.
Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes);
/// Get the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested if the first op in its body is
/// another scf::ForOp and the second op is a terminator.
void getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
scf::ForOp root);
} // namespace mlir
#endif // MLIR_DIALECT_SCF_UTILS_H_
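
A minimal sketch combining the relocated SCF helpers (`root` is assumed to be
supplied by the caller and to satisfy the documented preconditions):

```c++
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"

using namespace mlir;

// Hedged sketch: linearize a perfectly nested scf.for nest into one
// loop, assuming bounds/steps are independent of the nest's IVs.
void flattenPerfectNest(scf::ForOp root) {
  SmallVector<scf::ForOp> nest;
  getPerfectlyNestedLoops(nest, root);
  if (nest.size() > 1)
    coalesceLoops(nest);
}
```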

View File

@@ -15,7 +15,20 @@
#include "mlir/IR/OpDefinition.h"
//===----------------------------------------------------------------------===//
// LoopLike Interfaces
//===----------------------------------------------------------------------===//
/// Include the generated interface declarations.
#include "mlir/Interfaces/LoopLikeInterface.h.inc"
//===----------------------------------------------------------------------===//
// LoopLike Utilities
//===----------------------------------------------------------------------===//
namespace mlir {
/// Move loop invariant code out of a `looplike` operation.
LogicalResult moveLoopInvariantCode(LoopLikeOpInterface looplike);
} // namespace mlir
#endif // MLIR_INTERFACES_LOOPLIKEINTERFACE_H_
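
Because the utility now depends only on the interface, it can be invoked
generically over any loop-like op; a small sketch:

```c++
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/LoopLikeInterface.h"

using namespace mlir;

// Hedged sketch: hoist loop-invariant code from every op implementing
// LoopLikeOpInterface nested under `op`, regardless of dialect.
void hoistAllLoopInvariantCode(Operation *op) {
  op->walk([](LoopLikeOpInterface loopLike) {
    (void)moveLoopInvariantCode(loopLike);
  });
}
```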

View File

@@ -0,0 +1,70 @@
//===- ControlFlowSinkUtils.h - ControlFlow Sink Utils ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_TRANSFORMS_CONTROLFLOWSINKUTILS_H
#define MLIR_TRANSFORMS_CONTROLFLOWSINKUTILS_H
#include "mlir/Support/LLVM.h"
namespace mlir {
class DominanceInfo;
class Operation;
class Region;
class RegionBranchOpInterface;
/// Given a list of regions, perform control flow sinking on them. For each
/// region, control-flow sinking moves operations that dominate the region but
/// whose only users are in the region into the region so that they aren't
/// executed on paths where their results are not needed.
///
/// TODO: For the moment, this is a *simple* control-flow sink, i.e., no
/// duplicating of ops. It should be made to accept a cost model to determine
/// whether duplicating a particular op is profitable.
///
/// Example:
///
/// ```mlir
/// %0 = arith.addi %arg0, %arg1
/// scf.if %cond {
/// scf.yield %0
/// } else {
/// scf.yield %arg2
/// }
/// ```
///
/// After control-flow sink:
///
/// ```mlir
/// scf.if %cond {
/// %0 = arith.addi %arg0, %arg1
/// scf.yield %0
/// } else {
/// scf.yield %arg2
/// }
/// ```
///
/// Users must supply a callback `shouldMoveIntoRegion` that determines whether
/// the given operation that only has users in the given region should be
/// moved into that region.
///
/// Returns the number of operations sunk.
size_t
controlFlowSink(ArrayRef<Region *> regions, DominanceInfo &domInfo,
function_ref<bool(Operation *, Region *)> shouldMoveIntoRegion);
/// Populates `regions` with regions of the provided region branch op that are
/// executed at most once and that are reachable given the current operands of
/// the op. These regions can be passed to `controlFlowSink` to perform sinking
/// on the regions of the operation.
void getSinglyExecutedRegionsToSink(RegionBranchOpInterface branch,
SmallVectorImpl<Region *> &regions);
} // namespace mlir
#endif // MLIR_TRANSFORMS_CONTROLFLOWSINKUTILS_H
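
A hedged sketch of the intended call pattern; the always-true predicate is a
stand-in, since a real client would plug legality (side-effect) and
profitability checks into `shouldMoveIntoRegion`:

```c++
#include "mlir/IR/Dominance.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Transforms/ControlFlowSinkUtils.h"

using namespace mlir;

// Hedged sketch: sink ops into the singly-executed regions of `branch`.
size_t sinkIntoBranch(RegionBranchOpInterface branch,
                      DominanceInfo &domInfo) {
  SmallVector<Region *> regions;
  getSinglyExecutedRegionsToSink(branch, regions);
  return controlFlowSink(regions, domInfo,
                         [](Operation *op, Region *region) {
                           // Stand-in predicate: real clients should check
                           // side effects and a cost model here.
                           return true;
                         });
}
```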

View File

@@ -22,13 +22,8 @@
namespace mlir {
class AffineForOp;
class GreedyRewriteConfig;
/// Fusion mode to attempt. The default mode `Greedy` does both
/// producer-consumer and sibling fusion.
enum FusionMode { Greedy, ProducerConsumer, Sibling };
//===----------------------------------------------------------------------===//
// Passes
//===----------------------------------------------------------------------===//
@@ -56,31 +51,10 @@ std::unique_ptr<Pass> createControlFlowSinkPass();
/// Creates a pass to perform common sub expression elimination.
std::unique_ptr<Pass> createCSEPass();
/// Creates a loop fusion pass which fuses loops according to the type of fusion
/// specified in `fusionMode`. Buffers of size less than or equal to
/// `localBufSizeThreshold` are promoted to memory space `fastMemorySpace`.
std::unique_ptr<OperationPass<FuncOp>>
createLoopFusionPass(unsigned fastMemorySpace = 0,
uint64_t localBufSizeThreshold = 0,
bool maximalFusion = false,
enum FusionMode fusionMode = FusionMode::Greedy);
/// Creates a loop invariant code motion pass that hoists loop invariant
/// instructions out of the loop.
std::unique_ptr<Pass> createLoopInvariantCodeMotionPass();
/// Creates a pass to pipeline explicit movement of data across levels of the
/// memory hierarchy.
std::unique_ptr<OperationPass<FuncOp>> createPipelineDataTransferPass();
/// Creates a pass that transforms perfectly nested loops with independent
/// bounds into a single loop.
std::unique_ptr<OperationPass<FuncOp>> createLoopCoalescingPass();
/// Creates a pass that transforms a single ParallelLoop over N induction
/// variables into another ParallelLoop over less than N induction variables.
std::unique_ptr<Pass> createParallelLoopCollapsingPass();
/// Creates a pass to strip debug information from a function.
std::unique_ptr<Pass> createStripDebugInfoPass();

View File

@@ -16,207 +16,6 @@
include "mlir/Pass/PassBase.td"
include "mlir/Rewrite/PassUtil.td"
def AffineLoopFusion : Pass<"affine-loop-fusion", "FuncOp"> {
let summary = "Fuse affine loop nests";
let description = [{
This pass performs fusion of loop nests using a slicing-based approach. It
combines two fusion strategies: producer-consumer fusion and sibling fusion.
Producer-consumer fusion is aimed at fusing pairs of loops where the first
one writes to a memref that the second reads. Sibling fusion targets pairs
of loops that share no dependences between them but that load from the same
memref. The fused loop nests, when possible, are rewritten to access
significantly smaller local buffers instead of the original memrefs, and
the latter are often either completely optimized away or contracted. This
transformation leads to enhanced locality and lower memory footprint through
the elimination or contraction of temporaries/intermediate memrefs. These
benefits are sometimes achieved at the expense of redundant computation;
a cost model evaluates the available choices, such as the depth at which a
source slice should be materialized in the destination slice.
Example 1: Producer-consumer fusion.
Input:
```mlir
func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
%0 = memref.alloc() : memref<10xf32>
%1 = memref.alloc() : memref<10xf32>
%cst = arith.constant 0.000000e+00 : f32
affine.for %arg2 = 0 to 10 {
affine.store %cst, %0[%arg2] : memref<10xf32>
affine.store %cst, %1[%arg2] : memref<10xf32>
}
affine.for %arg2 = 0 to 10 {
%2 = affine.load %0[%arg2] : memref<10xf32>
%3 = arith.addf %2, %2 : f32
affine.store %3, %arg0[%arg2] : memref<10xf32>
}
affine.for %arg2 = 0 to 10 {
%2 = affine.load %1[%arg2] : memref<10xf32>
%3 = arith.mulf %2, %2 : f32
affine.store %3, %arg1[%arg2] : memref<10xf32>
}
return
}
```
Output:
```mlir
func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
%0 = memref.alloc() : memref<1xf32>
%1 = memref.alloc() : memref<1xf32>
%cst = arith.constant 0.000000e+00 : f32
affine.for %arg2 = 0 to 10 {
affine.store %cst, %0[0] : memref<1xf32>
affine.store %cst, %1[0] : memref<1xf32>
%2 = affine.load %1[0] : memref<1xf32>
%3 = arith.mulf %2, %2 : f32
affine.store %3, %arg1[%arg2] : memref<10xf32>
%4 = affine.load %0[0] : memref<1xf32>
%5 = arith.addf %4, %4 : f32
affine.store %5, %arg0[%arg2] : memref<10xf32>
}
return
}
```
Example 2: Sibling fusion.
Input:
```mlir
func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
%arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
%arg4: memref<10x10xf32>) {
affine.for %arg5 = 0 to 3 {
affine.for %arg6 = 0 to 3 {
%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
%1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
%2 = arith.mulf %0, %1 : f32
affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
}
}
affine.for %arg5 = 0 to 3 {
affine.for %arg6 = 0 to 3 {
%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
%1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
%2 = arith.addf %0, %1 : f32
affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32>
}
}
return
}
```
Output:
```mlir
func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
%arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
%arg4: memref<10x10xf32>) {
affine.for %arg5 = 0 to 3 {
affine.for %arg6 = 0 to 3 {
%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
%1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
%2 = arith.mulf %0, %1 : f32
affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
%3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
%4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
%5 = arith.addf %3, %4 : f32
affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32>
}
}
return
}
```
}];
let constructor = "mlir::createLoopFusionPass()";
let options = [
Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",
/*default=*/"0.30f", "Fractional increase in additional computation "
"tolerated while fusing">,
Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned",
/*default=*/"0",
"Faster memory space number to promote fusion buffers to">,
Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t",
/*default=*/"0", "Threshold size (KiB) for promoting local buffers "
"to fast memory space">,
Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
"Enables maximal loop fusion">,
Option<"affineFusionMode", "mode", "enum FusionMode",
"mlir::FusionMode::Greedy", "fusion mode to attempt",
"llvm::cl::values(clEnumValN(mlir::FusionMode::Greedy,"
" \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), "
"clEnumValN( mlir::FusionMode::ProducerConsumer, "
"\"producer\", \"Perform only producer-consumer fusion\"), "
"clEnumValN( mlir::FusionMode::Sibling, "
"\"sibling\", \"Perform only sibling fusion\"))">,
];
let dependentDialects = ["memref::MemRefDialect"];
}
def AffinePipelineDataTransfer
: Pass<"affine-pipeline-data-transfer", "FuncOp"> {
let summary = "Pipeline non-blocking data transfers between explicitly "
"managed levels of the memory hierarchy";
let description = [{
This pass performs a transformation to overlap non-blocking DMA operations
in a loop with computations through double buffering. This is achieved by
advancing dma_start operations with respect to other operations.
Input
```mlir
func @pipelinedatatransfer() {
%0 = memref.alloc() : memref<256xf32>
%1 = memref.alloc() : memref<32xf32, 1>
%2 = memref.alloc() : memref<1xf32>
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
affine.for %i0 = 0 to 8 {
affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32>
affine.dma_wait %2[%c0], %c128 : memref<1xf32>
%3 = affine.load %1[%i0] : memref<32xf32, 1>
%4 = "compute"(%3) : (f32) -> f32
affine.store %4, %1[%i0] : memref<32xf32, 1>
}
return
}
```
Output
```mlir
module {
func @pipelinedatatransfer() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%0 = memref.alloc() : memref<256xf32>
%c0_0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%1 = memref.alloc() : memref<2x32xf32, 1>
%2 = memref.alloc() : memref<2x1xf32>
affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
affine.for %arg0 = 1 to 8 {
affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
%8 = affine.apply #map3(%arg0)
%9 = affine.apply #map4(%8)
%10 = affine.apply #map4(%8)
affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
%11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1>
%12 = "compute"(%11) : (f32) -> f32
affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1>
}
%3 = affine.apply #map3(%c8)
%4 = affine.apply #map4(%3)
%5 = affine.apply #map4(%3)
affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
%6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1>
%7 = "compute"(%6) : (f32) -> f32
affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1>
memref.dealloc %2 : memref<2x1xf32>
memref.dealloc %1 : memref<2x32xf32, 1>
return
}
}
```
}];
let constructor = "mlir::createPipelineDataTransferPass()";
}
def Canonicalizer : Pass<"canonicalize"> {
let summary = "Canonicalize operations";
let description = [{
@@ -339,34 +138,11 @@ def LocationSnapshot : Pass<"snapshot-op-locations"> {
];
}
def LoopCoalescing : Pass<"loop-coalescing", "FuncOp"> {
let summary = "Coalesce nested loops with independent bounds into a single "
"loop";
let constructor = "mlir::createLoopCoalescingPass()";
let dependentDialects = ["arith::ArithmeticDialect"];
}
def LoopInvariantCodeMotion : Pass<"loop-invariant-code-motion"> {
let summary = "Hoist loop invariant instructions outside of the loop";
let constructor = "mlir::createLoopInvariantCodeMotionPass()";
}
def ParallelLoopCollapsing : Pass<"parallel-loop-collapsing"> {
let summary = "Collapse parallel loops to use less induction variables";
let constructor = "mlir::createParallelLoopCollapsingPass()";
let options = [
ListOption<"clCollapsedIndices0", "collapsed-indices-0", "unsigned",
"Which loop indices to combine 0th loop index",
"llvm::cl::MiscFlags::CommaSeparated">,
ListOption<"clCollapsedIndices1", "collapsed-indices-1", "unsigned",
"Which loop indices to combine into the position 1 loop index",
"llvm::cl::MiscFlags::CommaSeparated">,
ListOption<"clCollapsedIndices2", "collapsed-indices-2", "unsigned",
"Which loop indices to combine into the position 2 loop index",
"llvm::cl::MiscFlags::CommaSeparated">,
];
}
def PrintOpStats : Pass<"print-op-stats"> {
let summary = "Print statistics of operations";
let constructor = "mlir::createPrintOpStatsPass()";

View File

@@ -1,200 +0,0 @@
//===- Utils.h - General transformation utilities ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file defines prototypes for various transformation utilities for
// memref's and non-loop IR structures. These are not passes by themselves but
// are used either by passes, optimization sequences, or in turn by other
// transformation utilities.
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_TRANSFORMS_UTILS_H
#define MLIR_TRANSFORMS_UTILS_H
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineMap.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
namespace mlir {
class AffineApplyOp;
class AffineForOp;
class DominanceInfo;
class Location;
class OpBuilder;
namespace memref {
class AllocOp;
} // namespace memref
/// Replaces all "dereferencing" uses of `oldMemRef` with `newMemRef` while
/// optionally remapping the old memref's indices using the supplied affine map,
/// `indexRemap`. The new memref could be of a different shape or rank.
/// `extraIndices` provides any additional access indices to be added to the
/// start.
///
/// `indexRemap` remaps indices of the old memref access to a new set of indices
/// that are used to index the memref. Additional input operands to indexRemap
/// can be optionally provided in `extraOperands`, and they occupy the start
/// of its input list. `indexRemap`'s dimensional inputs are expected to
/// correspond to memref's indices, and its symbolic inputs if any should be
/// provided in `symbolOperands`.
///
/// `domOpFilter`, if non-null, restricts the replacement to only those
/// operations that are dominated by the former; similarly, `postDomOpFilter`
/// restricts replacement to only those operations that are postdominated by it.
///
/// 'allowNonDereferencingOps', if set, allows replacement of non-dereferencing
/// uses of a memref without any requirement for access index rewrites as long
/// as the user operation has the MemRefsNormalizable trait. The default value
/// of this flag is false.
///
/// 'replaceInDeallocOp', if set, allows DeallocOp, a non-dereferencing user,
/// to also be a candidate for replacement. The default value of this flag is
/// false.
///
/// Returns true on success and false if the replacement is not possible,
/// whenever a memref is used as an operand in a non-dereferencing context and
/// 'allowNonDereferencingOps' is false, except for dealloc's on the memref
/// which are left untouched. See comments at function definition for an
/// example.
//
// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
// index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap = (d0,
// d1, d2) -> (d0 - d1, d2), and %ii will be the extra operand. Without any
// extra operands, note that 'indexRemap' would just be applied to existing
// indices (%i, %j).
// TODO: allow extraIndices to be added at any position.
LogicalResult replaceAllMemRefUsesWith(
Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices = {},
AffineMap indexRemap = AffineMap(), ArrayRef<Value> extraOperands = {},
ArrayRef<Value> symbolOperands = {}, Operation *domOpFilter = nullptr,
Operation *postDomOpFilter = nullptr, bool allowNonDereferencingOps = false,
bool replaceInDeallocOp = false);
/// Performs the same replacement as the other version above but only for the
/// dereferencing uses of `oldMemRef` in `op`, except in cases where
/// 'allowNonDereferencingOps' is set to true where we replace the
/// non-dereferencing uses as well.
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
Operation *op,
ArrayRef<Value> extraIndices = {},
AffineMap indexRemap = AffineMap(),
ArrayRef<Value> extraOperands = {},
ArrayRef<Value> symbolOperands = {},
bool allowNonDereferencingOps = false);
/// Rewrites the memref defined by this alloc op to have an identity layout map
/// and updates all its indexing uses. Returns failure if any of its uses
/// escape (while leaving the IR in a valid state).
LogicalResult normalizeMemRef(memref::AllocOp *op);
/// Uses the old memref type map layout and computes the new memref type to have
/// a new shape and a layout map, where the old layout map has been normalized
/// to an identity layout map. It returns the old memref in case no
/// normalization was needed or a failure occurs while transforming the old map
/// layout to an identity layout map.
MemRefType normalizeMemRefType(MemRefType memrefType, OpBuilder builder,
unsigned numSymbolicOperands);
/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
/// its results equal to the number of operands, as a composition
/// of all other AffineApplyOps reachable from input parameter 'operands'. If
/// different operands were drawing results from multiple affine apply ops,
/// these will also be collected into a single (multi-result) affine apply op.
/// The final results of the composed AffineApplyOp are returned in output
/// parameter 'results'. Returns the affine apply op created.
Operation *createComposedAffineApplyOp(OpBuilder &builder, Location loc,
ArrayRef<Value> operands,
ArrayRef<Operation *> affineApplyOps,
SmallVectorImpl<Value> *results);
/// Given an operation, inserts one or more single result affine apply
/// operations, results of which are exclusively used by this operation.
/// The operands of these newly created affine apply ops are
/// guaranteed to be loop iterators or terminal symbols of a function.
///
/// Before
///
/// affine.for %i = 0 to #map(%N)
/// %idx = affine.apply (d0) -> (d0 mod 2) (%i)
/// send %A[%idx], ...
/// %v = "compute"(%idx, ...)
///
/// After
///
/// affine.for %i = 0 to #map(%N)
/// %idx = affine.apply (d0) -> (d0 mod 2) (%i)
/// send %A[%idx], ...
/// %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
/// %v = "compute"(%idx_, ...)
/// This allows the application of different transformations on send and
/// compute (e.g., different shifts/delays).
///
/// Fills `sliceOps` with the list of affine.apply operations.
/// In the following cases, `sliceOps` remains empty:
/// 1. If none of opInst's operands were the result of an affine.apply
/// (i.e., there was no affine computation slice to create).
/// 2. If all the affine.apply op's supplying operands to this opInst did not
/// have any uses other than those in this opInst.
void createAffineComputationSlice(Operation *opInst,
SmallVectorImpl<AffineApplyOp> *sliceOps);
/// Given a list of regions, perform control flow sinking on them. For each
/// region, control-flow sinking moves operations that dominate the region but
/// whose only users are in the region into the region so that they aren't
/// executed on paths where their results are not needed.
///
/// TODO: For the moment, this is a *simple* control-flow sink, i.e., no
/// duplicating of ops. It should be made to accept a cost model to determine
/// whether duplicating a particular op is profitable.
///
/// Example:
///
/// ```mlir
/// %0 = arith.addi %arg0, %arg1
/// scf.if %cond {
/// scf.yield %0
/// } else {
/// scf.yield %arg2
/// }
/// ```
///
/// After control-flow sink:
///
/// ```mlir
/// scf.if %cond {
/// %0 = arith.addi %arg0, %arg1
/// scf.yield %0
/// } else {
/// scf.yield %arg2
/// }
/// ```
///
/// Users must supply a callback `shouldMoveIntoRegion` that determines whether
/// the given operation that only has users in the given region should be
/// moved into that region.
///
/// Returns the number of operations sunk.
size_t
controlFlowSink(ArrayRef<Region *> regions, DominanceInfo &domInfo,
function_ref<bool(Operation *, Region *)> shouldMoveIntoRegion);
/// Populates `regions` with regions of the provided region branch op that are
/// executed at most once and that are reachable given the current operands of
/// the op. These regions can be passed to `controlFlowSink` to perform sinking
/// on the regions of the operation.
void getSinglyExecutedRegionsToSink(RegionBranchOpInterface branch,
SmallVectorImpl<Region *> &regions);
} // namespace mlir
#endif // MLIR_TRANSFORMS_UTILS_H

View File

@@ -27,7 +27,6 @@
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"

View File

@@ -23,7 +23,6 @@
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
using namespace mlir;
using namespace mlir::scf;

View File

@@ -33,7 +33,6 @@
#include "mlir/Support/MathExtras.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

View File

@@ -16,7 +16,6 @@
#include "../PassDetail.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"

View File

@@ -22,12 +22,12 @@
#include "PassDetail.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

View File

@@ -17,13 +17,13 @@
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallPtrSet.h"

View File

@@ -18,10 +18,10 @@
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/Affine/Passes.h.inc"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/Support/Debug.h"
#include <deque>

View File

@@ -4,9 +4,12 @@ add_mlir_dialect_library(MLIRAffineTransforms
AffineLoopNormalize.cpp
AffineParallelize.cpp
AffineScalarReplacement.cpp
LoopCoalescing.cpp
LoopFusion.cpp
LoopTiling.cpp
LoopUnroll.cpp
LoopUnrollAndJam.cpp
PipelineDataTransfer.cpp
SuperVectorize.cpp
SimplifyAffineStructures.cpp

View File

@@ -8,9 +8,10 @@
#include "PassDetail.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/Support/Debug.h"

View File

@@ -16,14 +16,14 @@
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopFusionUtils.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
#include "mlir/Transforms/LoopFusionUtils.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"

View File

@@ -17,11 +17,11 @@
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
using namespace mlir;

View File

@@ -12,11 +12,11 @@
#include "PassDetail.h"
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

View File

@@ -37,12 +37,12 @@
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CommandLine.h"

View File

@@ -9,6 +9,7 @@
#ifndef DIALECT_AFFINE_TRANSFORMS_PASSDETAIL_H_
#define DIALECT_AFFINE_TRANSFORMS_PASSDETAIL_H_
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Pass/Pass.h"
namespace mlir {
@@ -16,6 +17,10 @@ namespace mlir {
template <typename ConcreteDialect>
void registerDialect(DialectRegistry &registry);
namespace arith {
class ArithmeticDialect;
} // namespace arith
namespace linalg {
class LinalgDialect;
} // namespace linalg

View File

@@ -11,17 +11,16 @@
//===----------------------------------------------------------------------===//
#include "PassDetail.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/StandardOps/Utils/Utils.h"
#include "mlir/IR/Builders.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Utils.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/Debug.h"

View File

@@ -14,9 +14,9 @@
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/IR/IntegerSet.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Utils.h"
#define DEBUG_TYPE "simplify-affine-structure"

View File

@@ -1,4 +1,6 @@
add_mlir_dialect_library(MLIRAffineUtils
LoopFusionUtils.cpp
LoopUtils.cpp
Utils.cpp
ADDITIONAL_HEADER_DIRS
@@ -7,5 +9,6 @@ add_mlir_dialect_library(MLIRAffineUtils
LINK_LIBS PUBLIC
MLIRAffine
MLIRAnalysis
MLIRMemRef
MLIRTransformUtils
)

View File

@@ -10,21 +10,20 @@
//
//===----------------------------------------------------------------------===//
#include "mlir/Transforms/LoopFusionUtils.h"
#include "mlir/Dialect/Affine/LoopFusionUtils.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Operation.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"

View File

@@ -10,14 +10,14 @@
//
//===----------------------------------------------------------------------===//
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/BlockAndValueMapping.h"
@@ -25,7 +25,6 @@
#include "mlir/Support/MathExtras.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/RegionUtils.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"
@@ -108,42 +107,9 @@ getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
lb.erase();
}
// Build the IR that performs ceil division of a positive value by a constant:
// ceildiv(a, B) = divis(a + (B-1), B)
// where divis is rounding-to-zero division.
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
int64_t divisor) {
assert(divisor > 0 && "expected positive divisor");
assert(dividend.getType().isIndex() && "expected index-typed value");
Value divisorMinusOneCst =
builder.create<arith::ConstantIndexOp>(loc, divisor - 1);
Value divisorCst = builder.create<arith::ConstantIndexOp>(loc, divisor);
Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
return builder.create<arith::DivSIOp>(loc, sum, divisorCst);
}
// Build the IR that performs ceil division of a positive value by another
// positive value:
// ceildiv(a, b) = divis(a + (b - 1), b)
// where divis is rounding-to-zero division.
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
Value divisor) {
assert(dividend.getType().isIndex() && "expected index-typed value");
Value cstOne = builder.create<arith::ConstantIndexOp>(loc, 1);
Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne);
Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne);
return builder.create<arith::DivSIOp>(loc, sum, divisor);
}
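As an illustrative sketch (not part of the patch), for an index-typed value %a and a constant divisor of 4, the constant-divisor overload above builds IR along these lines:

  %c3  = arith.constant 3 : index
  %c4  = arith.constant 4 : index
  %sum = arith.addi %a, %c3 : index
  %q   = arith.divsi %sum, %c4 : index
  // e.g. a = 10: (10 + 3) / 4 = 3 with round-to-zero division, i.e. ceildiv(10, 4).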
/// Helper to replace uses of loop carried values (iter_args) and loop
/// yield values while promoting single iteration affine.for and scf.for ops.
template <typename AffineOrSCFForOp>
static void replaceIterArgsAndYieldResults(AffineOrSCFForOp forOp) {
static_assert(
llvm::is_one_of<AffineOrSCFForOp, AffineForOp, scf::ForOp>::value,
"only for affine.for and scf.for ops");
/// yield values while promoting single iteration affine.for ops.
static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
// Replace uses of iter arguments with iter operands (initial values).
auto iterOperands = forOp.getIterOperands();
auto iterArgs = forOp.getRegionIterArgs();
@@ -203,46 +169,6 @@ LogicalResult mlir::promoteIfSingleIteration(AffineForOp forOp) {
return success();
}
/// Promotes the loop body of a forOp to its containing block if it can be
/// determined that the loop has a single iteration.
LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) {
auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 ||
ubCstOp.value() < 0 || stepCstOp.value() < 0)
return failure();
int64_t tripCount =
mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value());
if (tripCount != 1)
return failure();
auto iv = forOp.getInductionVar();
iv.replaceAllUsesWith(lbCstOp);
replaceIterArgsAndYieldResults(forOp);
// Move the loop body operations, except for its terminator, to the loop's
// containing block.
auto *parentBlock = forOp->getBlock();
forOp.getBody()->getTerminator()->erase();
parentBlock->getOperations().splice(Block::iterator(forOp),
forOp.getBody()->getOperations());
forOp.erase();
return success();
}
/// Promotes all single iteration 'for' ops in `f`, i.e., moves
/// their body into the containing Block.
void mlir::promoteSingleIterationLoops(FuncOp f) {
// Promote any affine.for or scf.for ops found through a post-order walk.
f.walk([](Operation *op) {
if (auto forOp = dyn_cast<AffineForOp>(op))
(void)promoteIfSingleIteration(forOp);
else if (auto forOp = dyn_cast<scf::ForOp>(op))
(void)promoteIfSingleIteration(forOp);
});
}
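Illustrative sketch (not from the patch) of what single-iteration promotion does to an scf.for with a known trip count of one; value names are hypothetical:

  // Before: lb = 0, ub = 1, step = 1, so tripCount = ceilDiv(1 - 0, 1) = 1.
  %r = scf.for %i = %c0 to %c1 step %c1 iter_args(%acc = %init) -> (f32) {
    %v = arith.addf %acc, %x : f32
    scf.yield %v : f32
  }
  // After: %i is replaced by %c0, %acc by %init, the yielded %v replaces %r,
  // the body is spliced into the parent block, and the loop is erased.
  %v = arith.addf %init, %x : f32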
/// Generates an affine.for op with the specified lower and upper bounds
/// while generating the right IV remappings to realize shifts for operations in
/// its body. The operations that go into the loop body are specified in
@@ -1011,38 +937,22 @@ mlir::tilePerfectlyNestedParametric(MutableArrayRef<AffineForOp> input,
return success();
}
/// Collect perfectly nested loops starting from `rootForOps`. Loops are
/// perfectly nested if each loop is the first and only non-terminator operation
/// in the parent loop. Collect at most `maxLoops` loops and append them to
/// `forOps`.
template <typename T>
static void getPerfectlyNestedLoopsImpl(
SmallVectorImpl<T> &forOps, T rootForOp,
unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
for (unsigned i = 0; i < maxLoops; ++i) {
forOps.push_back(rootForOp);
Block &body = rootForOp.getRegion().front();
if (body.begin() != std::prev(body.end(), 2))
return;
rootForOp = dyn_cast<T>(&body.front());
if (!rootForOp)
return;
}
}
/// Get perfectly nested sequence of loops starting at root of loop nest
/// (the first op being another AffineFor, and the second op - a terminator).
/// A loop is perfectly nested iff: the first op in the loop's body is another
/// AffineForOp, and the second op is a terminator.
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                                   AffineForOp root) {
  getPerfectlyNestedLoopsImpl(nestedLoops, root);
}
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
                                   scf::ForOp root) {
  getPerfectlyNestedLoopsImpl(nestedLoops, root);
}
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                                   AffineForOp root) {
  for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
    nestedLoops.push_back(root);
    Block &body = root.getRegion().front();
    if (body.begin() != std::prev(body.end(), 2))
      return;
    root = dyn_cast<AffineForOp>(&body.front());
    if (!root)
      return;
  }
}
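For reference, a hand-written example (not from the patch) of the nest shape this collects: starting at the root, it appends a loop and descends only while the body holds exactly the next loop plus the implicit terminator (the `std::prev(body.end(), 2)` check):

  affine.for %i = 0 to 10 {     // collected
    affine.for %j = 0 to 20 {   // collected: the only op in %i's body besides the terminator
      "work"() : () -> ()       // collection stops here: %j's body has extra ops
      affine.for %k = 0 to 30 { }
    }
  }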
/// Identify valid and profitable bands of loops to tile. This is currently just
@@ -1084,10 +994,10 @@ LogicalResult mlir::loopUnrollUpToFactor(AffineForOp forOp,
return loopUnrollByFactor(forOp, unrollFactor);
}
/// Generates unrolled copies of AffineForOp or scf::ForOp 'loopBodyBlock', with
/// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap
/// 'forOpIV' for each unrolled body. If specified, annotates the Ops in each
/// unrolled iteration using annotateFn.
/// Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated
/// 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 'forOpIV' for each
/// unrolled body. If specified, annotates the Ops in each unrolled iteration
/// using annotateFn.
static void generateUnrolledLoop(
Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
@@ -1237,127 +1147,6 @@ LogicalResult mlir::loopUnrollByFactor(
return success();
}
/// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled.
LogicalResult mlir::loopUnrollByFactor(
scf::ForOp forOp, uint64_t unrollFactor,
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
assert(unrollFactor > 0 && "expected positive unroll factor");
// Return if the loop body is empty.
if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
return success();
// Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
// 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
OpBuilder boundsBuilder(forOp);
auto loc = forOp.getLoc();
auto step = forOp.getStep();
Value upperBoundUnrolled;
Value stepUnrolled;
bool generateEpilogueLoop = true;
auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
if (lbCstOp && ubCstOp && stepCstOp) {
// Constant loop bounds computation.
int64_t lbCst = lbCstOp.value();
int64_t ubCst = ubCstOp.value();
int64_t stepCst = stepCstOp.value();
assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 &&
"expected positive loop bounds and step");
int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst);
if (unrollFactor == 1) {
if (tripCount == 1 && failed(promoteIfSingleIteration(forOp)))
return failure();
return success();
}
int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor);
int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
assert(upperBoundUnrolledCst <= ubCst);
int64_t stepUnrolledCst = stepCst * unrollFactor;
// Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
if (generateEpilogueLoop)
upperBoundUnrolled = boundsBuilder.create<arith::ConstantIndexOp>(
loc, upperBoundUnrolledCst);
else
upperBoundUnrolled = ubCstOp;
// Create constant for 'stepUnrolled'.
stepUnrolled = stepCst == stepUnrolledCst
? step
: boundsBuilder.create<arith::ConstantIndexOp>(
loc, stepUnrolledCst);
} else {
// Dynamic loop bounds computation.
// TODO: Add dynamic asserts for negative lb/ub/step, or
// consider using ceilDiv from AffineApplyExpander.
auto lowerBound = forOp.getLowerBound();
auto upperBound = forOp.getUpperBound();
Value diff =
boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
Value unrollFactorCst =
boundsBuilder.create<arith::ConstantIndexOp>(loc, unrollFactor);
Value tripCountRem =
boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
// Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
Value tripCountEvenMultiple =
boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem);
// Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>(
loc, lowerBound,
boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
// Scale 'step' by 'unrollFactor'.
stepUnrolled =
boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst);
}
// Create epilogue clean up loop starting at 'upperBoundUnrolled'.
if (generateEpilogueLoop) {
OpBuilder epilogueBuilder(forOp->getContext());
epilogueBuilder.setInsertionPoint(forOp->getBlock(),
std::next(Block::iterator(forOp)));
auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
epilogueForOp.setLowerBound(upperBoundUnrolled);
// Update uses of loop results.
auto results = forOp.getResults();
auto epilogueResults = epilogueForOp.getResults();
auto epilogueIterOperands = epilogueForOp.getIterOperands();
for (auto e : llvm::zip(results, epilogueResults, epilogueIterOperands)) {
std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
epilogueForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
}
(void)promoteIfSingleIteration(epilogueForOp);
}
// Create unrolled loop.
forOp.setUpperBound(upperBoundUnrolled);
forOp.setStep(stepUnrolled);
auto iterArgs = ValueRange(forOp.getRegionIterArgs());
auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
generateUnrolledLoop(
forOp.getBody(), forOp.getInductionVar(), unrollFactor,
[&](unsigned i, Value iv, OpBuilder b) {
// iv' = iv + step * i;
auto stride = b.create<arith::MulIOp>(
loc, step, b.create<arith::ConstantIndexOp>(loc, i));
return b.create<arith::AddIOp>(loc, iv, stride);
},
annotateFn, iterArgs, yieldedValues);
// Promote the loop body up if this has turned into a single iteration loop.
(void)promoteIfSingleIteration(forOp);
return success();
}
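A sketch of the net effect (illustrative only; constants and names hypothetical): unrolling an scf.for with lb = 0, ub = 9, step = 1 by a factor of 2 gives tripCount = 9 and tripCountEvenMultiple = 8, so the main loop runs to 8 with step 2 and an epilogue covers the last iteration:

  // Before
  scf.for %i = %c0 to %c9 step %c1 {
    "use"(%i) : (index) -> ()
  }
  // After loopUnrollByFactor(forOp, /*unrollFactor=*/2)
  scf.for %i = %c0 to %c8 step %c2 {
    "use"(%i) : (index) -> ()
    %i1 = arith.addi %i, %c1 : index   // iv' = iv + step * 1
    "use"(%i1) : (index) -> ()
  }
  scf.for %i = %c8 to %c9 step %c1 {   // epilogue clean-up loop
    "use"(%i) : (index) -> ()
  }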
LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp,
uint64_t unrollJamFactor) {
Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
@@ -1888,61 +1677,25 @@ stripmineSink(AffineForOp forOp, uint64_t factor,
return innerLoops;
}
static Loops stripmineSink(scf::ForOp forOp, Value factor,
ArrayRef<scf::ForOp> targets) {
auto originalStep = forOp.getStep();
auto iv = forOp.getInductionVar();
OpBuilder b(forOp);
forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));
Loops innerLoops;
for (auto t : targets) {
// Save information for splicing ops out of t when done
auto begin = t.getBody()->begin();
auto nOps = t.getBody()->getOperations().size();
// Insert newForOp before the terminator of `t`.
auto b = OpBuilder::atBlockTerminator((t.getBody()));
Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
Value less = b.create<arith::CmpIOp>(t.getLoc(), arith::CmpIPredicate::slt,
forOp.getUpperBound(), stepped);
Value ub =
b.create<SelectOp>(t.getLoc(), less, forOp.getUpperBound(), stepped);
// Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
newForOp.getBody()->getOperations().splice(
newForOp.getBody()->getOperations().begin(),
t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
newForOp.getRegion());
innerLoops.push_back(newForOp);
}
return innerLoops;
}
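Sketch of the stripmining step (illustrative; here the target is the loop itself): the outer step is scaled by the factor and a new inner loop walks the original step, clamped at the outer bound:

  // Before
  scf.for %i = %lb to %ub step %s { "body"(%i) : (index) -> () }
  // After stripmineSink(loop, %f, {loop}), where %s_x_f = %s * %f
  scf.for %i = %lb to %ub step %s_x_f {
    %next = arith.addi %i, %s_x_f : index
    %cond = arith.cmpi slt, %ub, %next : index
    %end  = select %cond, %ub, %next : index   // min(%ub, %i + %s * %f)
    scf.for %ii = %i to %end step %s {
      "body"(%ii) : (index) -> ()
    }
  }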
// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
// Returns the new AffineForOps, nested immediately under `target`.
template <typename ForType, typename SizeType>
static ForType stripmineSink(ForType forOp, SizeType factor, ForType target) {
template <typename SizeType>
static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
AffineForOp target) {
// TODO: Use cheap structural assertions that targets are nested under
// forOp and that targets are not nested under each other when DominanceInfo
// exposes the capability. It seems overkill to construct a whole function
// dominance tree at this point.
auto res = stripmineSink(forOp, factor, ArrayRef<ForType>{target});
auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
assert(res.size() == 1 && "Expected 1 inner forOp");
return res[0];
}
template <typename ForType, typename SizeType>
static SmallVector<SmallVector<ForType, 8>, 8>
tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes,
ArrayRef<ForType> targets) {
SmallVector<SmallVector<ForType, 8>, 8> res;
SmallVector<ForType, 8> currentTargets(targets.begin(), targets.end());
SmallVector<SmallVector<AffineForOp, 8>, 8>
mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
ArrayRef<AffineForOp> targets) {
SmallVector<SmallVector<AffineForOp, 8>, 8> res;
SmallVector<AffineForOp, 8> currentTargets(targets.begin(), targets.end());
for (auto it : llvm::zip(forOps, sizes)) {
auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
res.push_back(step);
@@ -1951,288 +1704,17 @@ tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes,
return res;
}
SmallVector<SmallVector<AffineForOp, 8>, 8>
mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
ArrayRef<AffineForOp> targets) {
return tileImpl(forOps, sizes, targets);
}
SmallVector<Loops, 8> mlir::tile(ArrayRef<scf::ForOp> forOps,
ArrayRef<Value> sizes,
ArrayRef<scf::ForOp> targets) {
return tileImpl(forOps, sizes, targets);
}
template <typename ForType, typename SizeType>
static SmallVector<ForType, 8>
tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes, ForType target) {
SmallVector<ForType, 8> res;
for (auto loops : tile(forOps, sizes, ArrayRef<ForType>{target})) {
SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
ArrayRef<uint64_t> sizes,
AffineForOp target) {
SmallVector<AffineForOp, 8> res;
for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target))) {
assert(loops.size() == 1);
res.push_back(loops[0]);
}
return res;
}
SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
ArrayRef<uint64_t> sizes,
AffineForOp target) {
return tileImpl(forOps, sizes, target);
}
Loops mlir::tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
scf::ForOp target) {
return tileImpl(forOps, sizes, target);
}
Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
// Collect perfectly nested loops. If more size values provided than nested
// loops available, truncate `sizes`.
SmallVector<scf::ForOp, 4> forOps;
forOps.reserve(sizes.size());
getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
if (forOps.size() < sizes.size())
sizes = sizes.take_front(forOps.size());
return ::tile(forOps, sizes, forOps.back());
}
// Hoist the ops within `outer` that appear before `inner`.
// Such ops include the ops that have been introduced by parametric tiling.
// Ops that come from triangular loops (i.e. that belong to the program slice
// rooted at `outer`) and ops that have side effects cannot be hoisted.
// Return failure when any op fails to hoist.
static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
SetVector<Operation *> forwardSlice;
getForwardSlice(
outer.getInductionVar(), &forwardSlice,
[&inner](Operation *op) { return op != inner.getOperation(); });
LogicalResult status = success();
SmallVector<Operation *, 8> toHoist;
for (auto &op : outer.getBody()->without_terminator()) {
// Stop when encountering the inner loop.
if (&op == inner.getOperation())
break;
// Skip over non-hoistable ops.
if (forwardSlice.count(&op) > 0) {
status = failure();
continue;
}
// Skip intermediate scf::ForOp, these are not considered a failure.
if (isa<scf::ForOp>(op))
continue;
// Skip other ops with regions.
if (op.getNumRegions() > 0) {
status = failure();
continue;
}
// Skip if op has side effects.
// TODO: loads to immutable memory regions are ok.
if (!MemoryEffectOpInterface::hasNoEffect(&op)) {
status = failure();
continue;
}
toHoist.push_back(&op);
}
auto *outerForOp = outer.getOperation();
for (auto *op : toHoist)
op->moveBefore(outerForOp);
return status;
}
// Traverse the interTile and intraTile loops and try to hoist ops such that
// bands of perfectly nested loops are isolated.
// Return failure if either perfect interTile or perfect intraTile bands cannot
// be formed.
static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
LogicalResult status = success();
const Loops &interTile = tileLoops.first;
const Loops &intraTile = tileLoops.second;
auto size = interTile.size();
assert(size == intraTile.size());
if (size <= 1)
return success();
for (unsigned s = 1; s < size; ++s)
status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
: failure();
for (unsigned s = 1; s < size; ++s)
status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
: failure();
return status;
}
TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp,
ArrayRef<int64_t> sizes) {
// Collect perfectly nested loops. If more size values provided than nested
// loops available, truncate `sizes`.
SmallVector<scf::ForOp, 4> forOps;
forOps.reserve(sizes.size());
getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
if (forOps.size() < sizes.size())
sizes = sizes.take_front(forOps.size());
// Compute the tile sizes such that the i-th outer loop executes size[i]
// iterations. Given that the loop currently executes
// numIterations = ceildiv((upperBound - lowerBound), step)
// iterations, we need to tile with size ceildiv(numIterations, size[i]).
SmallVector<Value, 4> tileSizes;
tileSizes.reserve(sizes.size());
for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
auto forOp = forOps[i];
OpBuilder builder(forOp);
auto loc = forOp.getLoc();
Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(),
forOp.getLowerBound());
Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
Value iterationsPerBlock =
ceilDivPositive(builder, loc, numIterations, sizes[i]);
tileSizes.push_back(iterationsPerBlock);
}
// Call parametric tiling with the given sizes.
auto intraTile = tile(forOps, tileSizes, forOps.back());
TileLoops tileLoops = std::make_pair(forOps, intraTile);
// TODO: for now we just ignore the result of band isolation.
// In the future, mapping decisions may be impacted by the ability to
// isolate perfectly nested bands.
(void)tryIsolateBands(tileLoops);
return tileLoops;
}
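As a worked example of the tile-size computation above (illustrative numbers, not from the patch): for a loop from 0 to 130 with step 1 and sizes = {4}, numIterations = ceildiv(130 - 0, 1) = 130 and iterationsPerBlock = ceildiv(130, 4) = 33, so the resulting outer loop executes 4 iterations over inner blocks of 33 iterations each, the last block being partial.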
/// Return the new lower bound, upper bound, and step in that order. Insert any
/// additional bounds calculations before the given builder and any additional
/// conversion back to the original loop induction value inside the given Block.
static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
OpBuilder &insideLoopBuilder, Location loc,
Value lowerBound, Value upperBound, Value step,
Value inductionVar) {
// Check if the loop is already known to have a constant zero lower bound or
// a constant one step.
bool isZeroBased = false;
  if (auto lbCst = lowerBound.getDefiningOp<arith::ConstantIndexOp>())
    isZeroBased = lbCst.value() == 0;
bool isStepOne = false;
if (auto stepCst = step.getDefiningOp<arith::ConstantIndexOp>())
isStepOne = stepCst.value() == 1;
// Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
// assuming the step is strictly positive. Update the bounds and the step
// of the loop to go from 0 to the number of iterations, if necessary.
// TODO: introduce support for negative steps or emit dynamic asserts
// on step positivity, whatever gets implemented first.
if (isZeroBased && isStepOne)
return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
/*step=*/step};
Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
Value newLowerBound =
isZeroBased ? lowerBound
: boundsBuilder.create<arith::ConstantIndexOp>(loc, 0);
Value newStep =
isStepOne ? step : boundsBuilder.create<arith::ConstantIndexOp>(loc, 1);
// Insert code computing the value of the original loop induction variable
// from the "normalized" one.
Value scaled =
isStepOne
? inductionVar
: insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step);
Value shifted =
isZeroBased
? scaled
: insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound);
SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
shifted.getDefiningOp()};
inductionVar.replaceAllUsesExcept(shifted, preserve);
return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
/*step=*/newStep};
}
/// Transform a loop with a strictly positive step
/// for %i = %lb to %ub step %s
/// into a 0-based loop with step 1
/// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
/// %i = %ii * %s + %lb
/// Insert the induction variable remapping in the body of `inner`, which is
/// expected to be either `loop` or another loop perfectly nested under `loop`.
/// Insert the definition of new bounds immediate before `outer`, which is
/// expected to be either `loop` or its parent in the loop nest.
static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
OpBuilder builder(outer);
OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
loop.getLowerBound(), loop.getUpperBound(),
loop.getStep(), loop.getInductionVar());
loop.setLowerBound(loopPieces.lowerBound);
loop.setUpperBound(loopPieces.upperBound);
loop.setStep(loopPieces.step);
}
void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
if (loops.size() < 2)
return;
scf::ForOp innermost = loops.back();
scf::ForOp outermost = loops.front();
// 1. Make sure all loops iterate from 0 to upperBound with step 1. This
// allows the following code to assume upperBound is the number of iterations.
for (auto loop : loops)
normalizeLoop(loop, outermost, innermost);
// 2. Emit code computing the upper bound of the coalesced loop as product
// of the number of iterations of all loops.
OpBuilder builder(outermost);
Location loc = outermost.getLoc();
Value upperBound = outermost.getUpperBound();
for (auto loop : loops.drop_front())
upperBound =
builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
outermost.setUpperBound(upperBound);
builder.setInsertionPointToStart(outermost.getBody());
// 3. Remap induction variables. For each original loop, the value of the
// induction variable can be obtained by dividing the induction variable of
// the linearized loop by the total number of iterations of the loops nested
// in it modulo the number of iterations in this loop (remove the values
// related to the outer loops):
// iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
// Compute these iteratively from the innermost loop by creating a "running
// quotient" of division by the range.
Value previous = outermost.getInductionVar();
for (unsigned i = 0, e = loops.size(); i < e; ++i) {
unsigned idx = loops.size() - i - 1;
if (i != 0)
previous = builder.create<arith::DivSIOp>(loc, previous,
loops[idx + 1].getUpperBound());
Value iv = (i == e - 1) ? previous
: builder.create<arith::RemSIOp>(
loc, previous, loops[idx].getUpperBound());
replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
loops.back().getRegion());
}
// 4. Move the operations from the innermost just above the second-outermost
// loop, delete the extra terminator and the second-outermost loop.
scf::ForOp second = loops[1];
innermost.getBody()->back().erase();
outermost.getBody()->getOperations().splice(
Block::iterator(second.getOperation()),
innermost.getBody()->getOperations());
second.erase();
}
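Net effect in a sketch (illustrative; assumes the loops are already normalized to lb = 0, step = 1): coalescing a 2-deep nest multiplies the ranges and recovers each original induction variable with div/mod:

  // Before
  scf.for %i = %c0 to %Ni step %c1 {
    scf.for %j = %c0 to %Nj step %c1 {
      "body"(%i, %j) : (index, index) -> ()
    }
  }
  // After coalesceLoops({i-loop, j-loop}), where %NiNj = %Ni * %Nj
  scf.for %k = %c0 to %NiNj step %c1 {
    %j = arith.remsi %k, %Nj : index
    %i = arith.divsi %k, %Nj : index
    "body"(%i, %j) : (index, index) -> ()
  }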
LogicalResult mlir::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
if (loops.size() < 2)
return success();
@@ -2347,89 +1829,6 @@ LogicalResult mlir::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
return success();
}
void mlir::collapseParallelLoops(
scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
OpBuilder outsideBuilder(loops);
Location loc = loops.getLoc();
// Presort combined dimensions.
auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
for (auto &dims : sortedDimensions)
std::sort(dims.begin(), dims.end());
// Normalize ParallelOp's iteration pattern.
SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
normalizedUpperBounds;
for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
auto resultBounds =
normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
loops.getLowerBound()[i], loops.getUpperBound()[i],
loops.getStep()[i], loops.getBody()->getArgument(i));
normalizedLowerBounds.push_back(resultBounds.lowerBound);
normalizedUpperBounds.push_back(resultBounds.upperBound);
normalizedSteps.push_back(resultBounds.step);
}
// Combine iteration spaces.
SmallVector<Value, 3> lowerBounds, upperBounds, steps;
auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
for (auto idx : sortedDimensions[i]) {
newUpperBound = outsideBuilder.create<arith::MulIOp>(
loc, newUpperBound, normalizedUpperBounds[idx]);
}
lowerBounds.push_back(cst0);
steps.push_back(cst1);
upperBounds.push_back(newUpperBound);
}
// Create new ParallelLoop with conversions to the original induction values.
// The loop below uses divisions to get the relevant range of values in the
// new induction value that represent each range of the original induction
// value. The remainders then determine, based on that range, which iteration
// of the original induction value this represents. This is a normalized value
// that has already been un-normalized by the logic above.
auto newPloop = outsideBuilder.create<scf::ParallelOp>(
loc, lowerBounds, upperBounds, steps,
[&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
Value previous = ploopIVs[i];
unsigned numberCombinedDimensions = combinedDimensions[i].size();
// Iterate over all except the last induction value.
for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
unsigned idx = combinedDimensions[i][j];
// Determine the current induction value's loop iteration.
Value iv = insideBuilder.create<arith::RemSIOp>(
loc, previous, normalizedUpperBounds[idx]);
replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
loops.getRegion());
// Remove the effect of the current induction value to prepare for
// the next value.
previous = insideBuilder.create<arith::DivSIOp>(
loc, previous, normalizedUpperBounds[idx]);
}
// The final induction value is just the remaining value.
unsigned idx = combinedDimensions[i][0];
replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
previous, loops.getRegion());
}
});
// Replace the old loop with the new loop.
loops.getBody()->back().erase();
newPloop.getBody()->getOperations().splice(
Block::iterator(newPloop.getBody()->back()),
loops.getBody()->getOperations());
loops.erase();
}
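Sketch of collapsing both dimensions of a 2-D scf.parallel into one (illustrative; assumes the bounds were already normalized by the logic above, with %NiNj = %Ni * %Nj):

  // Before
  scf.parallel (%i, %j) = (%c0, %c0) to (%Ni, %Nj) step (%c1, %c1) {
    "body"(%i, %j) : (index, index) -> ()
  }
  // After collapseParallelLoops(op, {{0, 1}})
  scf.parallel (%k) = (%c0) to (%NiNj) step (%c1) {
    %j = arith.remsi %k, %Nj : index   // innermost combined dimension
    %i = arith.divsi %k, %Nj : index
    "body"(%i, %j) : (index, index) -> ()
  }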
void mlir::mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorId,
ArrayRef<Value> numProcessors) {
assert(processorId.size() == numProcessors.size());

View File

@@ -16,12 +16,14 @@
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/IntegerSet.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#define DEBUG_TYPE "affine-utils"
using namespace mlir;
@@ -856,3 +858,740 @@ void mlir::affineScalarReplace(FuncOp f, DominanceInfo &domInfo,
defOp->erase();
}
}
// Perform the replacement in `op`.
LogicalResult mlir::replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
Operation *op,
ArrayRef<Value> extraIndices,
AffineMap indexRemap,
ArrayRef<Value> extraOperands,
ArrayRef<Value> symbolOperands,
bool allowNonDereferencingOps) {
unsigned newMemRefRank = newMemRef.getType().cast<MemRefType>().getRank();
(void)newMemRefRank; // unused in opt mode
unsigned oldMemRefRank = oldMemRef.getType().cast<MemRefType>().getRank();
(void)oldMemRefRank; // unused in opt mode
if (indexRemap) {
assert(indexRemap.getNumSymbols() == symbolOperands.size() &&
"symbolic operand count mismatch");
assert(indexRemap.getNumInputs() ==
extraOperands.size() + oldMemRefRank + symbolOperands.size());
assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank);
} else {
assert(oldMemRefRank + extraIndices.size() == newMemRefRank);
}
// Assert same elemental type.
assert(oldMemRef.getType().cast<MemRefType>().getElementType() ==
newMemRef.getType().cast<MemRefType>().getElementType());
SmallVector<unsigned, 2> usePositions;
for (const auto &opEntry : llvm::enumerate(op->getOperands())) {
if (opEntry.value() == oldMemRef)
usePositions.push_back(opEntry.index());
}
// If memref doesn't appear, nothing to do.
if (usePositions.empty())
return success();
if (usePositions.size() > 1) {
// TODO: extend it for this case when needed (rare).
assert(false && "multiple dereferencing uses in a single op not supported");
return failure();
}
unsigned memRefOperandPos = usePositions.front();
OpBuilder builder(op);
// The following checks if op is dereferencing memref and performs the access
// index rewrites.
auto affMapAccInterface = dyn_cast<AffineMapAccessInterface>(op);
if (!affMapAccInterface) {
if (!allowNonDereferencingOps) {
// Failure: memref used in a non-dereferencing context (potentially
// escapes); no replacement in these cases unless allowNonDereferencingOps
// is set.
return failure();
}
op->setOperand(memRefOperandPos, newMemRef);
return success();
}
// Perform index rewrites for the dereferencing op and then replace the op
NamedAttribute oldMapAttrPair =
affMapAccInterface.getAffineMapAttrForMemRef(oldMemRef);
AffineMap oldMap = oldMapAttrPair.getValue().cast<AffineMapAttr>().getValue();
unsigned oldMapNumInputs = oldMap.getNumInputs();
SmallVector<Value, 4> oldMapOperands(
op->operand_begin() + memRefOperandPos + 1,
op->operand_begin() + memRefOperandPos + 1 + oldMapNumInputs);
// Apply 'oldMemRefOperands = oldMap(oldMapOperands)'.
SmallVector<Value, 4> oldMemRefOperands;
SmallVector<Value, 4> affineApplyOps;
oldMemRefOperands.reserve(oldMemRefRank);
if (oldMap != builder.getMultiDimIdentityMap(oldMap.getNumDims())) {
for (auto resultExpr : oldMap.getResults()) {
auto singleResMap = AffineMap::get(oldMap.getNumDims(),
oldMap.getNumSymbols(), resultExpr);
auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
oldMapOperands);
oldMemRefOperands.push_back(afOp);
affineApplyOps.push_back(afOp);
}
} else {
oldMemRefOperands.assign(oldMapOperands.begin(), oldMapOperands.end());
}
// Construct new indices as a remap of the old ones if a remapping has been
// provided. The indices of a memref come right after it, i.e.,
// at position memRefOperandPos + 1.
SmallVector<Value, 4> remapOperands;
remapOperands.reserve(extraOperands.size() + oldMemRefRank +
symbolOperands.size());
remapOperands.append(extraOperands.begin(), extraOperands.end());
remapOperands.append(oldMemRefOperands.begin(), oldMemRefOperands.end());
remapOperands.append(symbolOperands.begin(), symbolOperands.end());
SmallVector<Value, 4> remapOutputs;
remapOutputs.reserve(oldMemRefRank);
if (indexRemap &&
indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) {
// Remapped indices.
for (auto resultExpr : indexRemap.getResults()) {
auto singleResMap = AffineMap::get(
indexRemap.getNumDims(), indexRemap.getNumSymbols(), resultExpr);
auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
remapOperands);
remapOutputs.push_back(afOp);
affineApplyOps.push_back(afOp);
}
} else {
// No remapping specified.
remapOutputs.assign(remapOperands.begin(), remapOperands.end());
}
SmallVector<Value, 4> newMapOperands;
newMapOperands.reserve(newMemRefRank);
// Prepend 'extraIndices' in 'newMapOperands'.
for (Value extraIndex : extraIndices) {
assert(extraIndex.getDefiningOp()->getNumResults() == 1 &&
"single result op's expected to generate these indices");
assert((isValidDim(extraIndex) || isValidSymbol(extraIndex)) &&
"invalid memory op index");
newMapOperands.push_back(extraIndex);
}
// Append 'remapOutputs' to 'newMapOperands'.
newMapOperands.append(remapOutputs.begin(), remapOutputs.end());
// Create new fully composed AffineMap for new op to be created.
assert(newMapOperands.size() == newMemRefRank);
auto newMap = builder.getMultiDimIdentityMap(newMemRefRank);
// TODO: Avoid creating/deleting temporary AffineApplyOps here.
fullyComposeAffineMapAndOperands(&newMap, &newMapOperands);
newMap = simplifyAffineMap(newMap);
canonicalizeMapAndOperands(&newMap, &newMapOperands);
// Remove any affine.apply's that became dead as a result of composition.
for (Value value : affineApplyOps)
if (value.use_empty())
value.getDefiningOp()->erase();
OperationState state(op->getLoc(), op->getName());
// Construct the new operation using this memref.
state.operands.reserve(op->getNumOperands() + extraIndices.size());
// Insert the non-memref operands.
state.operands.append(op->operand_begin(),
op->operand_begin() + memRefOperandPos);
// Insert the new memref value.
state.operands.push_back(newMemRef);
// Insert the new memref map operands.
state.operands.append(newMapOperands.begin(), newMapOperands.end());
// Insert the remaining operands unmodified.
state.operands.append(op->operand_begin() + memRefOperandPos + 1 +
oldMapNumInputs,
op->operand_end());
// Result types don't change. Both memref's are of the same elemental type.
state.types.reserve(op->getNumResults());
for (auto result : op->getResults())
state.types.push_back(result.getType());
// Add attribute for 'newMap', other Attributes do not change.
auto newMapAttr = AffineMapAttr::get(newMap);
for (auto namedAttr : op->getAttrs()) {
if (namedAttr.getName() == oldMapAttrPair.getName())
state.attributes.push_back({namedAttr.getName(), newMapAttr});
else
state.attributes.push_back(namedAttr);
}
// Create the new operation.
auto *repOp = builder.createOperation(state);
op->replaceAllUsesWith(repOp);
op->erase();
return success();
}
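Illustrative example (not from the patch) of this replacement with an index remap and no extra indices: rewriting accesses of a 1-D memref into a tiled 2-D one with indexRemap = affine_map<(d0) -> (d0 floordiv 32, d0 mod 32)>:

  // Before
  %v = affine.load %old[%i] : memref<64xf32>
  // After replaceAllMemRefUsesWith(%old, %new, loadOp, /*extraIndices=*/{},
  //                                indexRemap, /*extraOperands=*/{}, /*symbolOperands=*/{})
  %v = affine.load %new[%i floordiv 32, %i mod 32] : memref<2x32xf32>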
LogicalResult mlir::replaceAllMemRefUsesWith(
Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices,
AffineMap indexRemap, ArrayRef<Value> extraOperands,
ArrayRef<Value> symbolOperands, Operation *domOpFilter,
Operation *postDomOpFilter, bool allowNonDereferencingOps,
bool replaceInDeallocOp) {
unsigned newMemRefRank = newMemRef.getType().cast<MemRefType>().getRank();
(void)newMemRefRank; // unused in opt mode
unsigned oldMemRefRank = oldMemRef.getType().cast<MemRefType>().getRank();
(void)oldMemRefRank;
if (indexRemap) {
assert(indexRemap.getNumSymbols() == symbolOperands.size() &&
"symbol operand count mismatch");
assert(indexRemap.getNumInputs() ==
extraOperands.size() + oldMemRefRank + symbolOperands.size());
assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank);
} else {
assert(oldMemRefRank + extraIndices.size() == newMemRefRank);
}
// Assert same elemental type.
assert(oldMemRef.getType().cast<MemRefType>().getElementType() ==
newMemRef.getType().cast<MemRefType>().getElementType());
std::unique_ptr<DominanceInfo> domInfo;
std::unique_ptr<PostDominanceInfo> postDomInfo;
if (domOpFilter)
domInfo =
std::make_unique<DominanceInfo>(domOpFilter->getParentOfType<FuncOp>());
if (postDomOpFilter)
postDomInfo = std::make_unique<PostDominanceInfo>(
postDomOpFilter->getParentOfType<FuncOp>());
// Walk all uses of old memref; collect ops to perform replacement. We use a
// DenseSet since an operation could potentially have multiple uses of a
// memref (although rare), and the replacement later is going to erase ops.
DenseSet<Operation *> opsToReplace;
for (auto *op : oldMemRef.getUsers()) {
// Skip this use if it's not dominated by domOpFilter.
if (domOpFilter && !domInfo->dominates(domOpFilter, op))
continue;
// Skip this use if it's not post-dominated by postDomOpFilter.
if (postDomOpFilter && !postDomInfo->postDominates(postDomOpFilter, op))
continue;
// Skip dealloc's - no replacement is necessary, and a memref replacement
// at other uses doesn't hurt these dealloc's.
if (isa<memref::DeallocOp>(op) && !replaceInDeallocOp)
continue;
// Check if the memref was used in a non-dereferencing context. It is fine
// for the memref to be used in a non-dereferencing way outside of the
// region where this replacement is happening.
if (!isa<AffineMapAccessInterface>(*op)) {
if (!allowNonDereferencingOps) {
LLVM_DEBUG(llvm::dbgs()
<< "Memref replacement failed: non-deferencing memref op: \n"
<< *op << '\n');
return failure();
}
// Non-dereferencing ops with the MemRefsNormalizable trait are
// supported for replacement.
if (!op->hasTrait<OpTrait::MemRefsNormalizable>()) {
LLVM_DEBUG(llvm::dbgs() << "Memref replacement failed: use without a "
"memrefs normalizable trait: \n"
<< *op << '\n');
return failure();
}
}
// We'll first collect and then replace --- since replacement erases the op
// that has the use, and that op could be postDomFilter or domFilter itself!
opsToReplace.insert(op);
}
for (auto *op : opsToReplace) {
if (failed(replaceAllMemRefUsesWith(
oldMemRef, newMemRef, op, extraIndices, indexRemap, extraOperands,
symbolOperands, allowNonDereferencingOps)))
llvm_unreachable("memref replacement guaranteed to succeed here");
}
return success();
}
/// Given an operation, inserts one or more single-result affine apply
/// operations, the results of which are exclusively used by this operation.
/// The operands of these newly created affine apply ops are guaranteed to be
/// loop iterators or terminal symbols of a function.
///
/// Before
///
/// affine.for %i = 0 to #map(%N)
/// %idx = affine.apply (d0) -> (d0 mod 2) (%i)
/// "send"(%idx, %A, ...)
/// "compute"(%idx)
///
/// After
///
/// affine.for %i = 0 to #map(%N)
/// %idx = affine.apply (d0) -> (d0 mod 2) (%i)
/// "send"(%idx, %A, ...)
/// %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
/// "compute"(%idx_)
///
/// This allows applying different transformations on send and compute (for eg.
/// different shifts/delays).
///
/// Leaves `sliceOps` empty either if none of opInst's operands were the result
/// of an affine.apply (and thus there was no affine computation slice to
/// create), or if all the affine.apply ops supplying operands to opInst had no
/// uses other than opInst; otherwise the affine.apply operations created are
/// returned in the output argument `sliceOps`.
void mlir::createAffineComputationSlice(
Operation *opInst, SmallVectorImpl<AffineApplyOp> *sliceOps) {
// Collect all operands that are results of affine apply ops.
SmallVector<Value, 4> subOperands;
subOperands.reserve(opInst->getNumOperands());
for (auto operand : opInst->getOperands())
if (isa_and_nonnull<AffineApplyOp>(operand.getDefiningOp()))
subOperands.push_back(operand);
// Gather sequence of AffineApplyOps reachable from 'subOperands'.
SmallVector<Operation *, 4> affineApplyOps;
getReachableAffineApplyOps(subOperands, affineApplyOps);
// Skip transforming if there are no affine maps to compose.
if (affineApplyOps.empty())
return;
// Check if all uses of the affine apply ops lie only in this op, in
// which case there would be nothing to do.
bool localized = true;
for (auto *op : affineApplyOps) {
for (auto result : op->getResults()) {
for (auto *user : result.getUsers()) {
if (user != opInst) {
localized = false;
break;
}
}
}
}
if (localized)
return;
OpBuilder builder(opInst);
SmallVector<Value, 4> composedOpOperands(subOperands);
auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);
// Create an affine.apply for each of the map results.
sliceOps->reserve(composedMap.getNumResults());
for (auto resultExpr : composedMap.getResults()) {
auto singleResMap = AffineMap::get(composedMap.getNumDims(),
composedMap.getNumSymbols(), resultExpr);
sliceOps->push_back(builder.create<AffineApplyOp>(
opInst->getLoc(), singleResMap, composedOpOperands));
}
// Construct the new operands that include the results from the composed
// affine apply op above instead of existing ones (subOperands). So, they
// differ from opInst's operands only for those operands in 'subOperands', for
// which they will be replaced by the corresponding one from 'sliceOps'.
SmallVector<Value, 4> newOperands(opInst->getOperands());
for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
// Replace the subOperands from among the new operands.
unsigned j, f;
for (j = 0, f = subOperands.size(); j < f; j++) {
if (newOperands[i] == subOperands[j])
break;
}
if (j < subOperands.size()) {
newOperands[i] = (*sliceOps)[j];
}
}
for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) {
opInst->setOperand(idx, newOperands[idx]);
}
}
/// Enum to set patterns of affine expr in tiled-layout map.
/// TileFloorDiv: <dim expr> div <tile size>
/// TileMod: <dim expr> mod <tile size>
/// TileNone: None of the above
/// Example:
/// #tiled_2d_128x256 = affine_map<(d0, d1)
/// -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
/// "d0 div 128" and "d1 div 256" ==> TileFloorDiv
/// "d0 mod 128" and "d1 mod 256" ==> TileMod
enum TileExprPattern { TileFloorDiv, TileMod, TileNone };
/// Check if `map` is a tiled layout. In the tiled layout, specific k dimensions
/// being floordiv'ed by respective tile sizes appear in a mod with the same
/// tile sizes, and no other expression involves those k dimensions. This
/// function stores a vector of tuples (`tileSizePos`) including AffineExpr for
/// tile size, positions of corresponding `floordiv` and `mod`. If it is not a
/// tiled layout, an empty vector is returned.
static LogicalResult getTileSizePos(
AffineMap map,
SmallVectorImpl<std::tuple<AffineExpr, unsigned, unsigned>> &tileSizePos) {
// Create `floordivExprs` which is a vector of tuples including LHS and RHS of
// `floordiv` and its position in `map` output.
// Example: #tiled_2d_128x256 = affine_map<(d0, d1)
// -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
// In this example, `floordivExprs` includes {d0, 128, 0} and {d1, 256, 1}.
SmallVector<std::tuple<AffineExpr, AffineExpr, unsigned>, 4> floordivExprs;
unsigned pos = 0;
for (AffineExpr expr : map.getResults()) {
if (expr.getKind() == AffineExprKind::FloorDiv) {
AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
if (binaryExpr.getRHS().isa<AffineConstantExpr>())
floordivExprs.emplace_back(
std::make_tuple(binaryExpr.getLHS(), binaryExpr.getRHS(), pos));
}
pos++;
}
// Not tiled layout if `floordivExprs` is empty.
if (floordivExprs.empty()) {
tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
return success();
}
// Check if LHS of `floordiv` is used in LHS of `mod`. If not used, `map` is
// not tiled layout.
for (std::tuple<AffineExpr, AffineExpr, unsigned> fexpr : floordivExprs) {
AffineExpr floordivExprLHS = std::get<0>(fexpr);
AffineExpr floordivExprRHS = std::get<1>(fexpr);
unsigned floordivPos = std::get<2>(fexpr);
// Walk each AffineExpr of `map`'s output except `fexpr`, and check if the LHS
// and RHS of `fexpr` are used in the LHS and RHS of a `mod`. If the LHS of
// `fexpr` is used in any other expr, the map is not a tiled layout. Examples of
// non-tiled layouts:
// affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 floordiv 256)>
// affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 128)>
// affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 256, d2 mod
// 256)>
bool found = false;
pos = 0;
for (AffineExpr expr : map.getResults()) {
bool notTiled = false;
if (pos != floordivPos) {
expr.walk([&](AffineExpr e) {
if (e == floordivExprLHS) {
if (expr.getKind() == AffineExprKind::Mod) {
AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
// If the LHS and RHS of `mod` are the same as those of the floordiv.
if (floordivExprLHS == binaryExpr.getLHS() &&
floordivExprRHS == binaryExpr.getRHS()) {
// Save tile size (RHS of `mod`), and position of `floordiv` and
// `mod` if same expr with `mod` is not found yet.
if (!found) {
tileSizePos.emplace_back(
std::make_tuple(binaryExpr.getRHS(), floordivPos, pos));
found = true;
} else {
// Non tiled layout: has multiple `mod` exprs with the same LHS.
// eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
// mod 256, d2 mod 256)>
notTiled = true;
}
} else {
// Non tiled layout: RHS of `mod` is different from `floordiv`.
// eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
// mod 128)>
notTiled = true;
}
} else {
// Non tiled layout: LHS is the same, but not `mod`.
// eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
// floordiv 256)>
notTiled = true;
}
}
});
}
if (notTiled) {
tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
return success();
}
pos++;
}
}
return success();
}
/// Check if `dim` dimension of memrefType with `layoutMap` becomes dynamic
/// after normalization. Dimensions that include dynamic dimensions in the map
/// output will become dynamic dimensions. Return true if `dim` is a dynamic
/// dimension.
///
/// Example:
/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
///
/// If d1 is dynamic dimension, 2nd and 3rd dimension of map output are dynamic.
/// memref<4x?xf32, #map0> ==> memref<4x?x?xf32>
static bool
isNormalizedMemRefDynamicDim(unsigned dim, AffineMap layoutMap,
SmallVectorImpl<unsigned> &inMemrefTypeDynDims,
MLIRContext *context) {
bool isDynamicDim = false;
AffineExpr expr = layoutMap.getResults()[dim];
// Check if affine expr of the dimension includes dynamic dimension of input
// memrefType.
expr.walk([&inMemrefTypeDynDims, &isDynamicDim, &context](AffineExpr e) {
if (e.isa<AffineDimExpr>()) {
for (unsigned dm : inMemrefTypeDynDims) {
if (e == getAffineDimExpr(dm, context)) {
isDynamicDim = true;
}
}
}
});
return isDynamicDim;
}
/// Create affine expr to calculate dimension size for a tiled-layout map.
static AffineExpr createDimSizeExprForTiledLayout(AffineExpr oldMapOutput,
TileExprPattern pat) {
// Create map output for the patterns.
// "floordiv <tile size>" ==> "ceildiv <tile size>"
// "mod <tile size>" ==> "<tile size>"
AffineExpr newMapOutput;
AffineBinaryOpExpr binaryExpr = nullptr;
switch (pat) {
case TileExprPattern::TileMod:
binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
newMapOutput = binaryExpr.getRHS();
break;
case TileExprPattern::TileFloorDiv:
binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
newMapOutput = getAffineBinaryOpExpr(
AffineExprKind::CeilDiv, binaryExpr.getLHS(), binaryExpr.getRHS());
break;
default:
newMapOutput = oldMapOutput;
}
return newMapOutput;
}
/// Create new maps to calculate each dimension size of `newMemRefType`, and
/// create `newDynamicSizes` from them by using AffineApplyOp.
///
/// Steps for normalizing dynamic memrefs for a tiled layout map
/// Example:
/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
/// %0 = dim %arg0, %c1 : memref<4x?xf32>
/// %1 = alloc(%0) : memref<4x?xf32, #map0>
///
/// (Before this function)
/// 1. Check if `map` (#map0) is a tiled layout using `getTileSizePos()`. Only a
/// single layout map is supported.
///
/// 2. Create normalized memrefType using `isNormalizedMemRefDynamicDim()`. It
/// is memref<4x?x?xf32> in the above example.
///
/// (In this function)
/// 3. Create new maps to calculate each dimension of the normalized memrefType
/// using `createDimSizeExprForTiledLayout()`. In the tiled layout, the
/// dimension size can be calculated by replacing "floordiv <tile size>" with
/// "ceildiv <tile size>" and "mod <tile size>" with "<tile size>".
/// - New map in the above example
/// #map0 = affine_map<(d0, d1) -> (d0)>
/// #map1 = affine_map<(d0, d1) -> (d1 ceildiv 32)>
/// #map2 = affine_map<(d0, d1) -> (32)>
///
/// 4. Create AffineApplyOp to apply the new maps. The output of AffineApplyOp
/// is used in dynamicSizes of new AllocOp.
/// %0 = dim %arg0, %c1 : memref<4x?xf32>
/// %c4 = arith.constant 4 : index
/// %1 = affine.apply #map1(%c4, %0)
/// %2 = affine.apply #map2(%c4, %0)
static void createNewDynamicSizes(MemRefType oldMemRefType,
MemRefType newMemRefType, AffineMap map,
memref::AllocOp *allocOp, OpBuilder b,
SmallVectorImpl<Value> &newDynamicSizes) {
// Create new input for AffineApplyOp.
SmallVector<Value, 4> inAffineApply;
ArrayRef<int64_t> oldMemRefShape = oldMemRefType.getShape();
unsigned dynIdx = 0;
for (unsigned d = 0; d < oldMemRefType.getRank(); ++d) {
if (oldMemRefShape[d] < 0) {
// Use dynamicSizes of allocOp for dynamic dimension.
inAffineApply.emplace_back(allocOp->dynamicSizes()[dynIdx]);
dynIdx++;
} else {
// Create ConstantOp for static dimension.
Attribute constantAttr =
b.getIntegerAttr(b.getIndexType(), oldMemRefShape[d]);
inAffineApply.emplace_back(
b.create<arith::ConstantOp>(allocOp->getLoc(), constantAttr));
}
}
// Create new map to calculate each dimension size of new memref for each
// original map output. Only done for dynamic dimensions of `newMemRefType`.
unsigned newDimIdx = 0;
ArrayRef<int64_t> newMemRefShape = newMemRefType.getShape();
SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
(void)getTileSizePos(map, tileSizePos);
for (AffineExpr expr : map.getResults()) {
if (newMemRefShape[newDimIdx] < 0) {
// Create new maps to calculate each dimension size of new memref.
enum TileExprPattern pat = TileExprPattern::TileNone;
for (auto pos : tileSizePos) {
if (newDimIdx == std::get<1>(pos))
pat = TileExprPattern::TileFloorDiv;
else if (newDimIdx == std::get<2>(pos))
pat = TileExprPattern::TileMod;
}
AffineExpr newMapOutput = createDimSizeExprForTiledLayout(expr, pat);
AffineMap newMap =
AffineMap::get(map.getNumInputs(), map.getNumSymbols(), newMapOutput);
Value affineApp =
b.create<AffineApplyOp>(allocOp->getLoc(), newMap, inAffineApply);
newDynamicSizes.emplace_back(affineApp);
}
newDimIdx++;
}
}
// TODO: Currently works for static memrefs with a single layout map.
LogicalResult mlir::normalizeMemRef(memref::AllocOp *allocOp) {
MemRefType memrefType = allocOp->getType();
OpBuilder b(*allocOp);
// Fetch a new memref type after normalizing the old memref to have an
// identity map layout.
MemRefType newMemRefType =
normalizeMemRefType(memrefType, b, allocOp->symbolOperands().size());
if (newMemRefType == memrefType)
// Either memrefType already had an identity map or the map couldn't be
// transformed to an identity map.
return failure();
Value oldMemRef = allocOp->getResult();
SmallVector<Value, 4> symbolOperands(allocOp->symbolOperands());
AffineMap layoutMap = memrefType.getLayout().getAffineMap();
memref::AllocOp newAlloc;
// Check if `layoutMap` is a tiled layout. Only a single layout map is
// supported for normalizing dynamic memrefs.
SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
(void)getTileSizePos(layoutMap, tileSizePos);
if (newMemRefType.getNumDynamicDims() > 0 && !tileSizePos.empty()) {
MemRefType oldMemRefType = oldMemRef.getType().cast<MemRefType>();
SmallVector<Value, 4> newDynamicSizes;
createNewDynamicSizes(oldMemRefType, newMemRefType, layoutMap, allocOp, b,
newDynamicSizes);
// Add the new dynamic sizes in new AllocOp.
newAlloc =
b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
newDynamicSizes, allocOp->alignmentAttr());
} else {
newAlloc = b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
allocOp->alignmentAttr());
}
// Replace all uses of the old memref.
if (failed(replaceAllMemRefUsesWith(oldMemRef, /*newMemRef=*/newAlloc,
/*extraIndices=*/{},
/*indexRemap=*/layoutMap,
/*extraOperands=*/{},
/*symbolOperands=*/symbolOperands,
/*domOpFilter=*/nullptr,
/*postDomOpFilter=*/nullptr,
/*allowNonDereferencingOps=*/true))) {
// If it failed (due to escapes for example), bail out.
newAlloc.erase();
return failure();
}
// Replace any uses of the original alloc op and erase it. All remaining uses
// have to be dealloc's; replaceAllMemRefUsesWith above would've failed otherwise.
assert(llvm::all_of(oldMemRef.getUsers(), [](Operation *op) {
return isa<memref::DeallocOp>(op);
}));
oldMemRef.replaceAllUsesWith(newAlloc);
allocOp->erase();
return success();
}
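End-to-end sketch of normalizeMemRef on a static tiled layout (illustrative, not from the patch): with the bounds 0 <= d0 <= 63 composed through the map, the floordiv result ranges over [0, 1] and the mod result over [0, 31], so the alloc is rebuilt with an identity layout and all uses are rewritten as in the replaceAllMemRefUsesWith example above:

  #tile = affine_map<(d0) -> (d0 floordiv 32, d0 mod 32)>
  // Before
  %0 = memref.alloc() : memref<64xf32, #tile>
  // After normalizeMemRef(&allocOp)
  %0 = memref.alloc() : memref<2x32xf32>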
MemRefType mlir::normalizeMemRefType(MemRefType memrefType, OpBuilder b,
unsigned numSymbolicOperands) {
unsigned rank = memrefType.getRank();
if (rank == 0)
return memrefType;
if (memrefType.getLayout().isIdentity()) {
// Either no map is associated with this memref or this memref has
// a trivial (identity) map.
return memrefType;
}
AffineMap layoutMap = memrefType.getLayout().getAffineMap();
// We don't do any checks for one-to-one'ness; we assume that it is
// one-to-one.
// Normalize only static memrefs and dynamic memrefs with a tiled-layout map
// for now.
// TODO: Normalize the other types of dynamic memrefs.
SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
(void)getTileSizePos(layoutMap, tileSizePos);
if (memrefType.getNumDynamicDims() > 0 && tileSizePos.empty())
return memrefType;
// We have a single map that is not an identity map. Create a new memref
// with the right shape and an identity layout map.
ArrayRef<int64_t> shape = memrefType.getShape();
// The FlatAffineConstraints may later on use the symbolic operands.
FlatAffineConstraints fac(rank, numSymbolicOperands);
SmallVector<unsigned, 4> memrefTypeDynDims;
for (unsigned d = 0; d < rank; ++d) {
// Use the constraint system only for static dimensions.
if (shape[d] > 0) {
fac.addBound(FlatAffineConstraints::LB, d, 0);
fac.addBound(FlatAffineConstraints::UB, d, shape[d] - 1);
} else {
memrefTypeDynDims.emplace_back(d);
}
}
// We compose this map with the original index (logical) space to derive
// the upper bounds for the new index space.
unsigned newRank = layoutMap.getNumResults();
if (failed(fac.composeMatchingMap(layoutMap)))
return memrefType;
// TODO: Handle semi-affine maps.
// Project out the old data dimensions.
fac.projectOut(newRank, fac.getNumIds() - newRank - fac.getNumLocalIds());
SmallVector<int64_t, 4> newShape(newRank);
for (unsigned d = 0; d < newRank; ++d) {
// Check if each dimension of normalized memrefType is dynamic.
bool isDynDim = isNormalizedMemRefDynamicDim(
d, layoutMap, memrefTypeDynDims, b.getContext());
if (isDynDim) {
// If a dimension of the new memrefType is dynamic, its size is set to -1.
newShape[d] = -1;
} else {
// The lower bound for the shape is always zero.
auto ubConst = fac.getConstantBound(FlatAffineConstraints::UB, d);
// For a static memref and an affine map with no symbols, this is
// always bounded.
assert(ubConst.hasValue() && "should always have an upper bound");
if (ubConst.getValue() < 0)
// This is due to an invalid map that maps to a negative space.
return memrefType;
newShape[d] = ubConst.getValue() + 1;
}
}
// Create the new memref type after trivializing the old layout map.
MemRefType newMemRefType =
MemRefType::Builder(memrefType)
.setShape(newShape)
.setLayout(AffineMapAttr::get(b.getMultiDimIdentityMap(newRank)));
return newMemRefType;
}
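// Worked example: for
//   memref<64x128xf32, affine_map<(d0, d1) -> (d0 floordiv 32, d1 floordiv 32,
//                                              d0 mod 32, d1 mod 32)>>
// composing the layout map with 0 <= d0 < 64 and 0 <= d1 < 128 gives upper
// bound constants 1, 3, 31 and 31 for the four results, so the normalized
// type is memref<2x4x32x32xf32> with an identity layout.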

View File

@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/MemoryPromotion.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -19,7 +20,6 @@
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
using namespace mlir;
using namespace mlir::gpu;

View File

@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Dialect/SCF/Transforms.h"
@@ -20,7 +19,6 @@
#include "mlir/Dialect/Vector/VectorTransforms.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
using namespace mlir;

View File

@@ -12,7 +12,6 @@
#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
@@ -24,7 +23,6 @@
#include "mlir/IR/AsmState.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"

View File

@@ -15,7 +15,6 @@
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
@@ -27,7 +26,6 @@
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"

View File

@@ -16,6 +16,8 @@
#include "PassDetail.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
@@ -29,9 +31,7 @@
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
using namespace mlir;
using namespace mlir::vector;
@@ -348,7 +348,13 @@ struct LinalgStrategyEnablePass
return signalPassFailure();
}
promoteSingleIterationLoops(funcOp);
// Gathers all innermost loops through a post order pruned walk.
funcOp.walk([](Operation *op) {
if (auto forOp = dyn_cast<AffineForOp>(op))
(void)promoteIfSingleIteration(forOp);
else if (auto forOp = dyn_cast<scf::ForOp>(op))
(void)promoteIfSingleIteration(forOp);
});
if (options.hoistRedundantVectorTransfers)
hoistRedundantVectorTransfers(funcOp);

View File

@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"

View File

@@ -16,6 +16,7 @@
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -31,7 +32,6 @@
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"

View File

@@ -13,9 +13,9 @@
#include "PassDetail.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/Debug.h"

View File

@@ -6,6 +6,7 @@ add_mlir_dialect_library(MLIRSCFTransforms
LoopPipelining.cpp
LoopRangeFolding.cpp
LoopSpecialization.cpp
ParallelLoopCollapsing.cpp
ParallelLoopFusion.cpp
ParallelLoopTiling.cpp
StructuralTypeConversions.cpp

View File

@@ -7,9 +7,9 @@
//===----------------------------------------------------------------------===//
#include "PassDetail.h"
#include "mlir/Dialect/SCF/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -20,7 +20,7 @@ using namespace mlir;
namespace {
struct ParallelLoopCollapsing
: public ParallelLoopCollapsingBase<ParallelLoopCollapsing> {
: public SCFParallelLoopCollapsingBase<ParallelLoopCollapsing> {
void runOnOperation() override {
Operation *module = getOperation();

View File

@@ -11,20 +11,31 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/MathExtras.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
using namespace mlir;
namespace {
// This structure holds a loop's lower bound, upper bound, and step so that
// sets of loop parameters can be passed and returned without confusing their
// order.
struct LoopParams {
Value lowerBound;
Value upperBound;
Value step;
};
} // namespace
scf::ForOp mlir::cloneWithNewYields(OpBuilder &b, scf::ForOp loop,
ValueRange newIterOperands,
ValueRange newYieldedValues,
@@ -230,3 +241,682 @@ bool mlir::getInnermostParallelLoops(Operation *rootOp,
}
return rootEnclosesPloops;
}
// Build the IR that performs ceil division of a positive value by a constant:
// ceildiv(a, B) = divis(a + (B-1), B)
// where divis is rounding-to-zero division.
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
int64_t divisor) {
assert(divisor > 0 && "expected positive divisor");
assert(dividend.getType().isIndex() && "expected index-typed value");
Value divisorMinusOneCst =
builder.create<arith::ConstantIndexOp>(loc, divisor - 1);
Value divisorCst = builder.create<arith::ConstantIndexOp>(loc, divisor);
Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
return builder.create<arith::DivSIOp>(loc, sum, divisorCst);
}
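// For instance, ceilDivPositive(builder, loc, %n, 4) emits roughly:
//   %c3 = arith.constant 3 : index
//   %c4 = arith.constant 4 : index
//   %0 = arith.addi %n, %c3 : index
//   %1 = arith.divsi %0, %c4 : index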
// Build the IR that performs ceil division of a positive value by another
// positive value:
// ceildiv(a, b) = divis(a + (b - 1), b)
// where divis is rounding-to-zero division.
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
Value divisor) {
assert(dividend.getType().isIndex() && "expected index-typed value");
Value cstOne = builder.create<arith::ConstantIndexOp>(loc, 1);
Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne);
Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne);
return builder.create<arith::DivSIOp>(loc, sum, divisor);
}
/// Helper to replace uses of loop carried values (iter_args) and loop
/// yield values while promoting single iteration scf.for ops.
static void replaceIterArgsAndYieldResults(scf::ForOp forOp) {
// Replace uses of iter arguments with iter operands (initial values).
auto iterOperands = forOp.getIterOperands();
auto iterArgs = forOp.getRegionIterArgs();
for (auto e : llvm::zip(iterOperands, iterArgs))
std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
// Replace uses of loop results with the values yielded by the loop.
auto outerResults = forOp.getResults();
auto innerResults = forOp.getBody()->getTerminator()->getOperands();
for (auto e : llvm::zip(outerResults, innerResults))
std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
}
/// Promotes the loop body of a forOp to its containing block if it can be
/// determined that the loop has a single iteration.
LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) {
auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 ||
ubCstOp.value() < 0 || stepCstOp.value() < 0)
return failure();
int64_t tripCount =
mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value());
if (tripCount != 1)
return failure();
auto iv = forOp.getInductionVar();
iv.replaceAllUsesWith(lbCstOp);
replaceIterArgsAndYieldResults(forOp);
// Move the loop body operations, except for its terminator, to the loop's
// containing block.
auto *parentBlock = forOp->getBlock();
forOp.getBody()->getTerminator()->erase();
parentBlock->getOperations().splice(Block::iterator(forOp),
forOp.getBody()->getOperations());
forOp.erase();
return success();
}
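// For example, a known single-iteration loop such as
//   %r = scf.for %i = %c0 to %c1 step %c1 iter_args(%acc = %init) -> (f32) {
//     %v = arith.addf %acc, %x : f32
//     scf.yield %v : f32
//   }
// folds away entirely: %i is replaced by %c0, %acc by %init, %r by %v, and
// the addf is spliced into the enclosing block.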
/// Generates unrolled copies of the scf::ForOp body block 'loopBodyBlock' by
/// 'unrollFactor', calling 'ivRemapFn' to remap the associated induction
/// variable 'forOpIV' for each unrolled body. If specified, annotates the Ops
/// in each unrolled iteration using 'annotateFn'.
static void generateUnrolledLoop(
Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
ValueRange iterArgs, ValueRange yieldedValues) {
// Builder to insert unrolled bodies just before the terminator of the body of
// 'forOp'.
auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
if (!annotateFn)
annotateFn = [](unsigned, Operation *, OpBuilder) {};
// Keep a pointer to the last non-terminator operation in the original block
// so that we know what to clone (since we are doing this in-place).
Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
// Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
SmallVector<Value, 4> lastYielded(yieldedValues);
for (unsigned i = 1; i < unrollFactor; i++) {
BlockAndValueMapping operandMap;
// Prepare operand map.
operandMap.map(iterArgs, lastYielded);
// If the induction variable is used, create a remapping to the value for
// this unrolled instance.
if (!forOpIV.use_empty()) {
Value ivUnroll = ivRemapFn(i, forOpIV, builder);
operandMap.map(forOpIV, ivUnroll);
}
// Clone the original body of 'forOp'.
for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
Operation *clonedOp = builder.clone(*it, operandMap);
annotateFn(i, clonedOp, builder);
}
// Update yielded values.
for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
lastYielded[i] = operandMap.lookup(yieldedValues[i]);
}
// Make sure we annotate the Ops in the original body. We do this last so that
// any annotations are not copied into the cloned Ops above.
for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
annotateFn(0, &*it, builder);
// Update operands of the yield statement.
loopBodyBlock->getTerminator()->setOperands(lastYielded);
}
/// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled.
LogicalResult mlir::loopUnrollByFactor(
scf::ForOp forOp, uint64_t unrollFactor,
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
assert(unrollFactor > 0 && "expected positive unroll factor");
// Return if the loop body is empty.
if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
return success();
// Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
// 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
OpBuilder boundsBuilder(forOp);
auto loc = forOp.getLoc();
auto step = forOp.getStep();
Value upperBoundUnrolled;
Value stepUnrolled;
bool generateEpilogueLoop = true;
auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
if (lbCstOp && ubCstOp && stepCstOp) {
// Constant loop bounds computation.
int64_t lbCst = lbCstOp.value();
int64_t ubCst = ubCstOp.value();
int64_t stepCst = stepCstOp.value();
assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 &&
"expected non-negative loop bounds and step");
int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst);
if (unrollFactor == 1) {
if (tripCount == 1 && failed(promoteIfSingleIteration(forOp)))
return failure();
return success();
}
int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor);
int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
assert(upperBoundUnrolledCst <= ubCst);
int64_t stepUnrolledCst = stepCst * unrollFactor;
// Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
if (generateEpilogueLoop)
upperBoundUnrolled = boundsBuilder.create<arith::ConstantIndexOp>(
loc, upperBoundUnrolledCst);
else
upperBoundUnrolled = ubCstOp;
// Create constant for 'stepUnrolled'.
stepUnrolled = stepCst == stepUnrolledCst
? step
: boundsBuilder.create<arith::ConstantIndexOp>(
loc, stepUnrolledCst);
} else {
// Dynamic loop bounds computation.
// TODO: Add dynamic asserts for negative lb/ub/step, or
// consider using ceilDiv from AffineApplyExpander.
auto lowerBound = forOp.getLowerBound();
auto upperBound = forOp.getUpperBound();
Value diff =
boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
Value unrollFactorCst =
boundsBuilder.create<arith::ConstantIndexOp>(loc, unrollFactor);
Value tripCountRem =
boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
// Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
Value tripCountEvenMultiple =
boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem);
// Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>(
loc, lowerBound,
boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
// Scale 'step' by 'unrollFactor'.
stepUnrolled =
boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst);
}
// Create epilogue clean up loop starting at 'upperBoundUnrolled'.
if (generateEpilogueLoop) {
OpBuilder epilogueBuilder(forOp->getContext());
epilogueBuilder.setInsertionPoint(forOp->getBlock(),
std::next(Block::iterator(forOp)));
auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
epilogueForOp.setLowerBound(upperBoundUnrolled);
// Update uses of loop results.
auto results = forOp.getResults();
auto epilogueResults = epilogueForOp.getResults();
auto epilogueIterOperands = epilogueForOp.getIterOperands();
for (auto e : llvm::zip(results, epilogueResults, epilogueIterOperands)) {
std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
epilogueForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
}
(void)promoteIfSingleIteration(epilogueForOp);
}
// Create unrolled loop.
forOp.setUpperBound(upperBoundUnrolled);
forOp.setStep(stepUnrolled);
auto iterArgs = ValueRange(forOp.getRegionIterArgs());
auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
generateUnrolledLoop(
forOp.getBody(), forOp.getInductionVar(), unrollFactor,
[&](unsigned i, Value iv, OpBuilder b) {
// iv' = iv + step * i;
auto stride = b.create<arith::MulIOp>(
loc, step, b.create<arith::ConstantIndexOp>(loc, i));
return b.create<arith::AddIOp>(loc, iv, stride);
},
annotateFn, iterArgs, yieldedValues);
// Promote the loop body up if this has turned into a single iteration loop.
(void)promoteIfSingleIteration(forOp);
return success();
}
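// Worked example (static bounds, a sketch): unrolling
//   scf.for %i = %c0 to %c9 step %c1 { ... }
// by a factor of 2 gives tripCount = 9 and tripCountEvenMultiple = 8, so the
// main loop becomes scf.for %i = %c0 to %c8 step %c2 containing two copies of
// the body (the second remapping %i to %i + 1), followed by an epilogue loop
// covering the remaining iteration at 8.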
/// Return the new lower bound, upper bound, and step in that order. Insert any
/// additional bounds calculations before the given builder and any additional
/// conversion back to the original loop induction value inside the given Block.
static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
OpBuilder &insideLoopBuilder, Location loc,
Value lowerBound, Value upperBound, Value step,
Value inductionVar) {
// Check if the loop is already known to have a constant zero lower bound or
// a constant one step.
bool isZeroBased = false;
if (auto lbCst = lowerBound.getDefiningOp<arith::ConstantIndexOp>())
isZeroBased = lbCst.value() == 0;
bool isStepOne = false;
if (auto stepCst = step.getDefiningOp<arith::ConstantIndexOp>())
isStepOne = stepCst.value() == 1;
// Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
// assuming the step is strictly positive. Update the bounds and the step
// of the loop to go from 0 to the number of iterations, if necessary.
// TODO: introduce support for negative steps or emit dynamic asserts
// on step positivity, whatever gets implemented first.
if (isZeroBased && isStepOne)
return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
/*step=*/step};
Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
Value newLowerBound =
isZeroBased ? lowerBound
: boundsBuilder.create<arith::ConstantIndexOp>(loc, 0);
Value newStep =
isStepOne ? step : boundsBuilder.create<arith::ConstantIndexOp>(loc, 1);
// Insert code computing the value of the original loop induction variable
// from the "normalized" one.
Value scaled =
isStepOne
? inductionVar
: insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step);
Value shifted =
isZeroBased
? scaled
: insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound);
SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
shifted.getDefiningOp()};
inductionVar.replaceAllUsesExcept(shifted, preserve);
return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
/*step=*/newStep};
}
/// Transform a loop with a strictly positive step
/// for %i = %lb to %ub step %s
/// into a 0-based loop with step 1
/// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
/// %i = %ii * %s + %lb
/// Insert the induction variable remapping in the body of `inner`, which is
/// expected to be either `loop` or another loop perfectly nested under `loop`.
/// Insert the definition of new bounds immediately before `outer`, which is
/// expected to be either `loop` or its parent in the loop nest.
static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
OpBuilder builder(outer);
OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
loop.getLowerBound(), loop.getUpperBound(),
loop.getStep(), loop.getInductionVar());
loop.setLowerBound(loopPieces.lowerBound);
loop.setUpperBound(loopPieces.upperBound);
loop.setStep(loopPieces.step);
}
void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
if (loops.size() < 2)
return;
scf::ForOp innermost = loops.back();
scf::ForOp outermost = loops.front();
// 1. Make sure all loops iterate from 0 to upperBound with step 1. This
// allows the following code to assume upperBound is the number of iterations.
for (auto loop : loops)
normalizeLoop(loop, outermost, innermost);
// 2. Emit code computing the upper bound of the coalesced loop as product
// of the number of iterations of all loops.
OpBuilder builder(outermost);
Location loc = outermost.getLoc();
Value upperBound = outermost.getUpperBound();
for (auto loop : loops.drop_front())
upperBound =
builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
outermost.setUpperBound(upperBound);
builder.setInsertionPointToStart(outermost.getBody());
// 3. Remap induction variables. For each original loop, the value of the
// induction variable can be obtained by dividing the induction variable of
// the linearized loop by the total number of iterations of the loops nested
// in it modulo the number of iterations in this loop (remove the values
// related to the outer loops):
// iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
// Compute these iteratively from the innermost loop by creating a "running
// quotient" of division by the range.
Value previous = outermost.getInductionVar();
for (unsigned i = 0, e = loops.size(); i < e; ++i) {
unsigned idx = loops.size() - i - 1;
if (i != 0)
previous = builder.create<arith::DivSIOp>(loc, previous,
loops[idx + 1].getUpperBound());
Value iv = (i == e - 1) ? previous
: builder.create<arith::RemSIOp>(
loc, previous, loops[idx].getUpperBound());
replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
loops.back().getRegion());
}
// 4. Move the operations from the innermost just above the second-outermost
// loop, delete the extra terminator and the second-outermost loop.
scf::ForOp second = loops[1];
innermost.getBody()->back().erase();
outermost.getBody()->getOperations().splice(
Block::iterator(second.getOperation()),
innermost.getBody()->getOperations());
second.erase();
}
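// For example, coalescing the 2-d nest
//   scf.for %i = %c0 to %cM step %c1 {
//     scf.for %j = %c0 to %cN step %c1 { ... }
//   }
// produces a single loop of %cM * %cN iterations whose induction variable %k
// recovers the original ones as %i = %k divsi %cN and %j = %k remsi %cN.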
void mlir::collapseParallelLoops(
scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
OpBuilder outsideBuilder(loops);
Location loc = loops.getLoc();
// Presort combined dimensions.
auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
for (auto &dims : sortedDimensions)
std::sort(dims.begin(), dims.end());
// Normalize ParallelOp's iteration pattern.
SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
normalizedUpperBounds;
for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
auto resultBounds =
normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
loops.getLowerBound()[i], loops.getUpperBound()[i],
loops.getStep()[i], loops.getBody()->getArgument(i));
normalizedLowerBounds.push_back(resultBounds.lowerBound);
normalizedUpperBounds.push_back(resultBounds.upperBound);
normalizedSteps.push_back(resultBounds.step);
}
// Combine iteration spaces.
SmallVector<Value, 3> lowerBounds, upperBounds, steps;
auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
for (auto idx : sortedDimensions[i]) {
newUpperBound = outsideBuilder.create<arith::MulIOp>(
loc, newUpperBound, normalizedUpperBounds[idx]);
}
lowerBounds.push_back(cst0);
steps.push_back(cst1);
upperBounds.push_back(newUpperBound);
}
// Create new ParallelLoop with conversions to the original induction values.
// The loop below uses divisions to get the relevant range of values in the
// new induction value that represent each range of the original induction
// value. The remainders then determine, within that range, which iteration
// of the original induction value this represents. These are normalized
// values that the remapping inserted by normalizeLoop above converts back to
// the original induction values.
auto newPloop = outsideBuilder.create<scf::ParallelOp>(
loc, lowerBounds, upperBounds, steps,
[&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
Value previous = ploopIVs[i];
unsigned numberCombinedDimensions = combinedDimensions[i].size();
// Iterate over all except the last induction value.
for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
unsigned idx = combinedDimensions[i][j];
// Determine the current induction value's current loop iteration
Value iv = insideBuilder.create<arith::RemSIOp>(
loc, previous, normalizedUpperBounds[idx]);
replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
loops.getRegion());
// Remove the effect of the current induction value to prepare for
// the next value.
previous = insideBuilder.create<arith::DivSIOp>(
loc, previous, normalizedUpperBounds[idx]);
}
// The final induction value is just the remaining value.
unsigned idx = combinedDimensions[i][0];
replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
previous, loops.getRegion());
}
});
// Replace the old loop with the new loop.
loops.getBody()->back().erase();
newPloop.getBody()->getOperations().splice(
Block::iterator(newPloop.getBody()->back()),
loops.getBody()->getOperations());
loops.erase();
}
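// E.g. collapsing a 3-d scf.parallel with combinedDimensions = {{0, 1}, {2}}
// yields a 2-d scf.parallel whose first induction variable linearizes the
// original dimensions 0 and 1; the original induction variables are then
// recovered with arith.remsi / arith.divsi against the normalized upper
// bounds, exactly as in the body builder above.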
// Hoist the ops within `outer` that appear before `inner`.
// Such ops include the ops that have been introduced by parametric tiling.
// Ops that come from triangular loops (i.e. that belong to the program slice
// rooted at `outer`) and ops that have side effects cannot be hoisted.
// Return failure when any op fails to hoist.
static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
SetVector<Operation *> forwardSlice;
getForwardSlice(
outer.getInductionVar(), &forwardSlice,
[&inner](Operation *op) { return op != inner.getOperation(); });
LogicalResult status = success();
SmallVector<Operation *, 8> toHoist;
for (auto &op : outer.getBody()->without_terminator()) {
// Stop when encountering the inner loop.
if (&op == inner.getOperation())
break;
// Skip over non-hoistable ops.
if (forwardSlice.count(&op) > 0) {
status = failure();
continue;
}
// Skip intermediate scf::ForOp operations; these are not considered a failure.
if (isa<scf::ForOp>(op))
continue;
// Skip other ops with regions.
if (op.getNumRegions() > 0) {
status = failure();
continue;
}
// Skip if op has side effects.
// TODO: loads to immutable memory regions are ok.
if (!MemoryEffectOpInterface::hasNoEffect(&op)) {
status = failure();
continue;
}
toHoist.push_back(&op);
}
auto *outerForOp = outer.getOperation();
for (auto *op : toHoist)
op->moveBefore(outerForOp);
return status;
}
// Traverse the interTile and intraTile loops and try to hoist ops such that
// bands of perfectly nested loops are isolated.
// Return failure if either perfect interTile or perfect intraTile bands cannot
// be formed.
static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
LogicalResult status = success();
const Loops &interTile = tileLoops.first;
const Loops &intraTile = tileLoops.second;
auto size = interTile.size();
assert(size == intraTile.size());
if (size <= 1)
return success();
for (unsigned s = 1; s < size; ++s)
status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
: failure();
for (unsigned s = 1; s < size; ++s)
status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
: failure();
return status;
}
/// Collect perfectly nested loops starting from `rootForOps`. Loops are
/// perfectly nested if each loop is the first and only non-terminator operation
/// in the parent loop. Collect at most `maxLoops` loops and append them to
/// `forOps`.
template <typename T>
static void getPerfectlyNestedLoopsImpl(
SmallVectorImpl<T> &forOps, T rootForOp,
unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
for (unsigned i = 0; i < maxLoops; ++i) {
forOps.push_back(rootForOp);
Block &body = rootForOp.getRegion().front();
if (body.begin() != std::prev(body.end(), 2))
return;
rootForOp = dyn_cast<T>(&body.front());
if (!rootForOp)
return;
}
}
static Loops stripmineSink(scf::ForOp forOp, Value factor,
ArrayRef<scf::ForOp> targets) {
auto originalStep = forOp.getStep();
auto iv = forOp.getInductionVar();
OpBuilder b(forOp);
forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));
Loops innerLoops;
for (auto t : targets) {
// Save information for splicing ops out of t when done
auto begin = t.getBody()->begin();
auto nOps = t.getBody()->getOperations().size();
// Insert newForOp before the terminator of `t`.
auto b = OpBuilder::atBlockTerminator((t.getBody()));
Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
Value less = b.create<arith::CmpIOp>(t.getLoc(), arith::CmpIPredicate::slt,
forOp.getUpperBound(), stepped);
Value ub =
b.create<SelectOp>(t.getLoc(), less, forOp.getUpperBound(), stepped);
// Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
newForOp.getBody()->getOperations().splice(
newForOp.getBody()->getOperations().begin(),
t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
newForOp.getRegion());
innerLoops.push_back(newForOp);
}
return innerLoops;
}
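// For example (a sketch), strip-mining
//   scf.for %i = %c0 to %cN step %c1 { body }
// by a factor %f scales the outer step to %f and sinks the body into a new
// inner loop
//   scf.for %ii = %i to min(%cN, %i + %f) step %c1 { body }
// where the min is materialized with the arith.cmpi/select pair built above.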
// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
// Returns the new for operation, nested immediately under `target`.
template <typename SizeType>
static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor,
scf::ForOp target) {
// TODO: Use cheap structural assertions that targets are nested under
// forOp and that targets are not nested under each other when DominanceInfo
// exposes the capability. It seems overkill to construct a whole function
// dominance tree at this point.
auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target));
assert(res.size() == 1 && "Expected 1 inner forOp");
return res[0];
}
SmallVector<Loops, 8> mlir::tile(ArrayRef<scf::ForOp> forOps,
ArrayRef<Value> sizes,
ArrayRef<scf::ForOp> targets) {
SmallVector<SmallVector<scf::ForOp, 8>, 8> res;
SmallVector<scf::ForOp, 8> currentTargets(targets.begin(), targets.end());
for (auto it : llvm::zip(forOps, sizes)) {
auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
res.push_back(step);
currentTargets = step;
}
return res;
}
Loops mlir::tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
scf::ForOp target) {
SmallVector<scf::ForOp, 8> res;
for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target))) {
assert(loops.size() == 1);
res.push_back(loops[0]);
}
return res;
}
Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
// Collect perfectly nested loops. If more size values are provided than
// nested loops available, truncate `sizes`.
SmallVector<scf::ForOp, 4> forOps;
forOps.reserve(sizes.size());
getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
if (forOps.size() < sizes.size())
sizes = sizes.take_front(forOps.size());
return ::tile(forOps, sizes, forOps.back());
}
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
scf::ForOp root) {
getPerfectlyNestedLoopsImpl(nestedLoops, root);
}
TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp,
ArrayRef<int64_t> sizes) {
// Collect perfectly nested loops. If more size values are provided than
// nested loops available, truncate `sizes`.
SmallVector<scf::ForOp, 4> forOps;
forOps.reserve(sizes.size());
getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
if (forOps.size() < sizes.size())
sizes = sizes.take_front(forOps.size());
// Compute the tile sizes such that i-th outer loop executes size[i]
// iterations. Given that the loop currently executes
// numIterations = ceildiv((upperBound - lowerBound), step)
// iterations, we need to tile with size ceildiv(numIterations, size[i]).
SmallVector<Value, 4> tileSizes;
tileSizes.reserve(sizes.size());
for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
auto forOp = forOps[i];
OpBuilder builder(forOp);
auto loc = forOp.getLoc();
Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(),
forOp.getLowerBound());
Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
Value iterationsPerBlock =
ceilDivPositive(builder, loc, numIterations, sizes[i]);
tileSizes.push_back(iterationsPerBlock);
}
// Call parametric tiling with the given sizes.
auto intraTile = tile(forOps, tileSizes, forOps.back());
TileLoops tileLoops = std::make_pair(forOps, intraTile);
// TODO: for now we just ignore the result of band isolation.
// In the future, mapping decisions may be impacted by the ability to
// isolate perfectly nested bands.
(void)tryIsolateBands(tileLoops);
return tileLoops;
}
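// Worked example: for a loop with lb = 0, ub = 100, step = 1 and sizes = {4},
// numIterations = ceildiv(100 - 0, 1) = 100 and iterationsPerBlock =
// ceildiv(100, 4) = 25, so the fixed outer loop executes exactly 4 iterations
// of 25 inner iterations each.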

View File

@@ -14,7 +14,6 @@
#include <type_traits>
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"

View File

@@ -13,7 +13,6 @@
#include <type_traits>
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"

View File

@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Vector/VectorTransforms.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Interfaces/VectorInterfaces.h"

View File

@@ -7,12 +7,95 @@
//===----------------------------------------------------------------------===//
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"
using namespace mlir;
#define DEBUG_TYPE "loop-like"
//===----------------------------------------------------------------------===//
// LoopLike Interfaces
//===----------------------------------------------------------------------===//
/// Include the definitions of the loop-like interfaces.
#include "mlir/Interfaces/LoopLikeInterface.cpp.inc"
//===----------------------------------------------------------------------===//
// LoopLike Utilities
//===----------------------------------------------------------------------===//
// Checks whether the given op can be hoisted by checking that
// - the op and any of its contained operations do not depend on SSA values
// defined inside of the loop (by means of calling definedOutside).
// - the op has no side effects; an op with recursive side effects is only
//   hoisted if all of the ops nested in its regions can be hoisted as well.
static bool canBeHoisted(Operation *op,
function_ref<bool(Value)> definedOutside) {
// Check that dependencies are defined outside of loop.
if (!llvm::all_of(op->getOperands(), definedOutside))
return false;
// Check whether this op is side-effect free. If we already know that there
// can be no side-effects because the surrounding op has claimed so, we can
// (and have to) skip this step.
if (auto memInterface = dyn_cast<MemoryEffectOpInterface>(op)) {
if (!memInterface.hasNoEffect())
return false;
// If the operation doesn't have side effects and it doesn't recursively
// have side effects, it can always be hoisted.
if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>())
return true;
// Otherwise, if the operation doesn't provide the memory effect interface
// and it doesn't have recursive side effects we treat it conservatively as
// side-effecting.
} else if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>()) {
return false;
}
// Recurse into the regions for this op and check whether the contained ops
// can be hoisted.
for (auto &region : op->getRegions()) {
for (auto &block : region) {
for (auto &innerOp : block)
if (!canBeHoisted(&innerOp, definedOutside))
return false;
}
}
return true;
}
LogicalResult mlir::moveLoopInvariantCode(LoopLikeOpInterface looplike) {
auto &loopBody = looplike.getLoopBody();
// We use two collections here as we need to preserve the order for insertion
// and this is easiest.
SmallPtrSet<Operation *, 8> willBeMovedSet;
SmallVector<Operation *, 8> opsToMove;
// Helper to check whether an operation is loop invariant wrt. SSA properties.
auto isDefinedOutsideOfBody = [&](Value value) {
auto *definingOp = value.getDefiningOp();
return (definingOp && !!willBeMovedSet.count(definingOp)) ||
looplike.isDefinedOutsideOfLoop(value);
};
// Do not use walk here, as we do not want to go into nested regions and hoist
// operations from there. These regions might have semantics unknown to this
// rewriting. If the nested regions are loops, they will have been processed.
for (auto &block : loopBody) {
for (auto &op : block.without_terminator()) {
if (canBeHoisted(&op, isDefinedOutsideOfBody)) {
opsToMove.push_back(&op);
willBeMovedSet.insert(&op);
}
}
}
// For all instructions that we found to be invariant, move outside of the
// loop.
LogicalResult result = looplike.moveOutOfLoop(opsToMove);
LLVM_DEBUG(looplike.print(llvm::dbgs() << "\n\nModified loop:\n"));
return result;
}
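// For example, given
//   scf.for %i = %lb to %ub step %s {
//     %inv = arith.muli %a, %b : index  // %a and %b defined above the loop
//     %v = arith.addi %inv, %i : index  // depends on the induction variable
//     ...
//   }
// %inv is side-effect free and all of its operands are defined outside the
// loop, so it is hoisted above the scf.for; %v stays because it uses %i.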

View File

@@ -6,12 +6,8 @@ add_mlir_library(MLIRTransforms
CSE.cpp
Inliner.cpp
LocationSnapshot.cpp
LoopCoalescing.cpp
LoopFusion.cpp
LoopInvariantCodeMotion.cpp
OpStats.cpp
ParallelLoopCollapsing.cpp
PipelineDataTransfer.cpp
SCCP.cpp
StripDebugInfo.cpp
SymbolDCE.cpp
@@ -21,18 +17,13 @@ add_mlir_library(MLIRTransforms
${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms
DEPENDS
MLIRStandardOpsIncGen
MLIRTransformsPassIncGen
LINK_LIBS PUBLIC
MLIRAffine
MLIRAnalysis
MLIRCopyOpInterface
MLIRLoopLikeInterface
MLIRMemRef
MLIRSCF
MLIRPass
MLIRSupport
MLIRTransformUtils
MLIRVector
)

View File

@@ -15,7 +15,6 @@
#include "mlir/IR/Dominance.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/ScopedHashTable.h"

View File

@@ -16,8 +16,8 @@
#include "PassDetail.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Transforms/ControlFlowSinkUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
using namespace mlir;

View File

@@ -11,13 +11,10 @@
//===----------------------------------------------------------------------===//
#include "PassDetail.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -34,80 +31,6 @@ struct LoopInvariantCodeMotion
};
} // namespace
// Checks whether the given op can be hoisted by checking that
// - the op and any of its contained operations do not depend on SSA values
// defined inside of the loop (by means of calling definedOutside).
// - the op has no side effects; an op with recursive side effects is only
//   hoisted if all of the ops nested in its regions can be hoisted as well.
static bool canBeHoisted(Operation *op,
function_ref<bool(Value)> definedOutside) {
// Check that dependencies are defined outside of loop.
if (!llvm::all_of(op->getOperands(), definedOutside))
return false;
// Check whether this op is side-effect free. If we already know that there
// can be no side-effects because the surrounding op has claimed so, we can
// (and have to) skip this step.
if (auto memInterface = dyn_cast<MemoryEffectOpInterface>(op)) {
if (!memInterface.hasNoEffect())
return false;
// If the operation doesn't have side effects and it doesn't recursively
// have side effects, it can always be hoisted.
if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>())
return true;
// Otherwise, if the operation doesn't provide the memory effect interface
// and it doesn't have recursive side effects we treat it conservatively as
// side-effecting.
} else if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>()) {
return false;
}
// Recurse into the regions for this op and check whether the contained ops
// can be hoisted.
for (auto &region : op->getRegions()) {
for (auto &block : region) {
for (auto &innerOp : block)
if (!canBeHoisted(&innerOp, definedOutside))
return false;
}
}
return true;
}
LogicalResult mlir::moveLoopInvariantCode(LoopLikeOpInterface looplike) {
auto &loopBody = looplike.getLoopBody();
// We use two collections here as we need to preserve the order for insertion
// and this is easiest.
SmallPtrSet<Operation *, 8> willBeMovedSet;
SmallVector<Operation *, 8> opsToMove;
// Helper to check whether an operation is loop invariant wrt. SSA properties.
auto isDefinedOutsideOfBody = [&](Value value) {
auto *definingOp = value.getDefiningOp();
return (definingOp && !!willBeMovedSet.count(definingOp)) ||
looplike.isDefinedOutsideOfLoop(value);
};
// Do not use walk here, as we do not want to go into nested regions and hoist
// operations from there. These regions might have semantics unknown to this
// rewriting. If the nested regions are loops, they will have been processed.
for (auto &block : loopBody) {
for (auto &op : block.without_terminator()) {
if (canBeHoisted(&op, isDefinedOutsideOfBody)) {
opsToMove.push_back(&op);
willBeMovedSet.insert(&op);
}
}
}
// For all instructions that we found to be invariant, move outside of the
// loop.
auto result = looplike.moveOutOfLoop(opsToMove);
LLVM_DEBUG(looplike.print(llvm::dbgs() << "\n\nModified loop:\n"));
return result;
}
void LoopInvariantCodeMotion::runOnOperation() {
// Walk through all loops in a function in innermost-loop-first order. This
// way, we first LICM from the inner loop, and place the ops in

View File

@@ -13,23 +13,8 @@
#include "mlir/Transforms/Passes.h"
namespace mlir {
class AffineDialect;
// Forward declaration from Dialect.h
template <typename ConcreteDialect>
void registerDialect(DialectRegistry &registry);
namespace arith {
class ArithmeticDialect;
} // namespace arith
namespace memref {
class MemRefDialect;
} // namespace memref
#define GEN_PASS_CLASSES
#include "mlir/Transforms/Passes.h.inc"
} // namespace mlir
#endif // TRANSFORMS_PASSDETAIL_H_

View File

@@ -4,25 +4,12 @@ add_mlir_library(MLIRTransformUtils
FoldUtils.cpp
GreedyPatternRewriteDriver.cpp
InliningUtils.cpp
LoopFusionUtils.cpp
LoopUtils.cpp
RegionUtils.cpp
Utils.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms
DEPENDS
MLIRStandardOpsIncGen
LINK_LIBS PUBLIC
MLIRAffine
MLIRArithmetic
MLIRAnalysis
MLIRAffineAnalysis
MLIRMemRef
MLIRSCF
MLIRPass
MLIRRewrite
MLIRStandard
)

View File

@@ -18,10 +18,10 @@
//
//===----------------------------------------------------------------------===//
#include "mlir/Transforms/ControlFlowSinkUtils.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Transforms/Utils.h"
#include <vector>
#define DEBUG_TYPE "cf-sink"

View File

@@ -13,7 +13,6 @@
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/FunctionInterfaces.h"
#include "mlir/Rewrite/PatternApplicator.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"

View File

@@ -1,767 +0,0 @@
//===- Utils.cpp ---- Misc utilities for code and data transformation -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements miscellaneous transformation routines for non-loop IR
// structures.
//
//===----------------------------------------------------------------------===//
#include "mlir/Transforms/Utils.h"
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Support/MathExtras.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/TypeSwitch.h"
#define DEBUG_TYPE "transforms-utils"
using namespace mlir;
// Perform the replacement in `op`.
LogicalResult mlir::replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
Operation *op,
ArrayRef<Value> extraIndices,
AffineMap indexRemap,
ArrayRef<Value> extraOperands,
ArrayRef<Value> symbolOperands,
bool allowNonDereferencingOps) {
unsigned newMemRefRank = newMemRef.getType().cast<MemRefType>().getRank();
(void)newMemRefRank; // unused in opt mode
unsigned oldMemRefRank = oldMemRef.getType().cast<MemRefType>().getRank();
(void)oldMemRefRank; // unused in opt mode
if (indexRemap) {
assert(indexRemap.getNumSymbols() == symbolOperands.size() &&
"symbolic operand count mismatch");
assert(indexRemap.getNumInputs() ==
extraOperands.size() + oldMemRefRank + symbolOperands.size());
assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank);
} else {
assert(oldMemRefRank + extraIndices.size() == newMemRefRank);
}
// Assert same elemental type.
assert(oldMemRef.getType().cast<MemRefType>().getElementType() ==
newMemRef.getType().cast<MemRefType>().getElementType());
SmallVector<unsigned, 2> usePositions;
for (const auto &opEntry : llvm::enumerate(op->getOperands())) {
if (opEntry.value() == oldMemRef)
usePositions.push_back(opEntry.index());
}
// If memref doesn't appear, nothing to do.
if (usePositions.empty())
return success();
if (usePositions.size() > 1) {
// TODO: extend it for this case when needed (rare).
assert(false && "multiple dereferencing uses in a single op not supported");
return failure();
}
unsigned memRefOperandPos = usePositions.front();
OpBuilder builder(op);
// The following checks if op is dereferencing memref and performs the access
// index rewrites.
auto affMapAccInterface = dyn_cast<AffineMapAccessInterface>(op);
if (!affMapAccInterface) {
if (!allowNonDereferencingOps) {
// Failure: memref used in a non-dereferencing context (potentially
// escapes); no replacement in these cases unless allowNonDereferencingOps
// is set.
return failure();
}
op->setOperand(memRefOperandPos, newMemRef);
return success();
}
// Perform index rewrites for the dereferencing op and then replace the op
NamedAttribute oldMapAttrPair =
affMapAccInterface.getAffineMapAttrForMemRef(oldMemRef);
AffineMap oldMap = oldMapAttrPair.getValue().cast<AffineMapAttr>().getValue();
unsigned oldMapNumInputs = oldMap.getNumInputs();
SmallVector<Value, 4> oldMapOperands(
op->operand_begin() + memRefOperandPos + 1,
op->operand_begin() + memRefOperandPos + 1 + oldMapNumInputs);
// Apply 'oldMemRefOperands = oldMap(oldMapOperands)'.
SmallVector<Value, 4> oldMemRefOperands;
SmallVector<Value, 4> affineApplyOps;
oldMemRefOperands.reserve(oldMemRefRank);
if (oldMap != builder.getMultiDimIdentityMap(oldMap.getNumDims())) {
for (auto resultExpr : oldMap.getResults()) {
auto singleResMap = AffineMap::get(oldMap.getNumDims(),
oldMap.getNumSymbols(), resultExpr);
auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
oldMapOperands);
oldMemRefOperands.push_back(afOp);
affineApplyOps.push_back(afOp);
}
} else {
oldMemRefOperands.assign(oldMapOperands.begin(), oldMapOperands.end());
}
// Construct new indices as a remap of the old ones if a remapping has been
// provided. The indices of a memref come right after it, i.e.,
// at position memRefOperandPos + 1.
SmallVector<Value, 4> remapOperands;
remapOperands.reserve(extraOperands.size() + oldMemRefRank +
symbolOperands.size());
remapOperands.append(extraOperands.begin(), extraOperands.end());
remapOperands.append(oldMemRefOperands.begin(), oldMemRefOperands.end());
remapOperands.append(symbolOperands.begin(), symbolOperands.end());
SmallVector<Value, 4> remapOutputs;
remapOutputs.reserve(oldMemRefRank);
if (indexRemap &&
indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) {
// Remapped indices.
for (auto resultExpr : indexRemap.getResults()) {
auto singleResMap = AffineMap::get(
indexRemap.getNumDims(), indexRemap.getNumSymbols(), resultExpr);
auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
remapOperands);
remapOutputs.push_back(afOp);
affineApplyOps.push_back(afOp);
}
} else {
// No remapping specified.
remapOutputs.assign(remapOperands.begin(), remapOperands.end());
}
SmallVector<Value, 4> newMapOperands;
newMapOperands.reserve(newMemRefRank);
// Prepend 'extraIndices' in 'newMapOperands'.
for (Value extraIndex : extraIndices) {
assert(extraIndex.getDefiningOp()->getNumResults() == 1 &&
"single result op's expected to generate these indices");
assert((isValidDim(extraIndex) || isValidSymbol(extraIndex)) &&
"invalid memory op index");
newMapOperands.push_back(extraIndex);
}
// Append 'remapOutputs' to 'newMapOperands'.
newMapOperands.append(remapOutputs.begin(), remapOutputs.end());
// Create new fully composed AffineMap for new op to be created.
assert(newMapOperands.size() == newMemRefRank);
auto newMap = builder.getMultiDimIdentityMap(newMemRefRank);
// TODO: Avoid creating/deleting temporary AffineApplyOps here.
fullyComposeAffineMapAndOperands(&newMap, &newMapOperands);
newMap = simplifyAffineMap(newMap);
canonicalizeMapAndOperands(&newMap, &newMapOperands);
// Remove any affine.apply's that became dead as a result of composition.
for (Value value : affineApplyOps)
if (value.use_empty())
value.getDefiningOp()->erase();
OperationState state(op->getLoc(), op->getName());
// Construct the new operation using this memref.
state.operands.reserve(op->getNumOperands() + extraIndices.size());
// Insert the non-memref operands.
state.operands.append(op->operand_begin(),
op->operand_begin() + memRefOperandPos);
// Insert the new memref value.
state.operands.push_back(newMemRef);
// Insert the new memref map operands.
state.operands.append(newMapOperands.begin(), newMapOperands.end());
// Insert the remaining operands unmodified.
state.operands.append(op->operand_begin() + memRefOperandPos + 1 +
oldMapNumInputs,
op->operand_end());
// Result types don't change. Both memref's are of the same elemental type.
state.types.reserve(op->getNumResults());
for (auto result : op->getResults())
state.types.push_back(result.getType());
// Add attribute for 'newMap', other Attributes do not change.
auto newMapAttr = AffineMapAttr::get(newMap);
for (auto namedAttr : op->getAttrs()) {
if (namedAttr.getName() == oldMapAttrPair.getName())
state.attributes.push_back({namedAttr.getName(), newMapAttr});
else
state.attributes.push_back(namedAttr);
}
// Create the new operation.
auto *repOp = builder.createOperation(state);
op->replaceAllUsesWith(repOp);
op->erase();
return success();
}
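// For example (a sketch), replacing %old : memref<64xf32> with
// %new : memref<4x16xf32> under indexRemap = (d0) -> (d0 floordiv 16,
// d0 mod 16) rewrites
//   affine.load %old[%i] : memref<64xf32>
// into
//   affine.load %new[%i floordiv 16, %i mod 16] : memref<4x16xf32>
// after fully composing and canonicalizing the new access map.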
LogicalResult mlir::replaceAllMemRefUsesWith(
Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices,
AffineMap indexRemap, ArrayRef<Value> extraOperands,
ArrayRef<Value> symbolOperands, Operation *domOpFilter,
Operation *postDomOpFilter, bool allowNonDereferencingOps,
bool replaceInDeallocOp) {
unsigned newMemRefRank = newMemRef.getType().cast<MemRefType>().getRank();
(void)newMemRefRank; // unused in opt mode
unsigned oldMemRefRank = oldMemRef.getType().cast<MemRefType>().getRank();
(void)oldMemRefRank;
if (indexRemap) {
assert(indexRemap.getNumSymbols() == symbolOperands.size() &&
"symbol operand count mismatch");
assert(indexRemap.getNumInputs() ==
extraOperands.size() + oldMemRefRank + symbolOperands.size());
assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank);
} else {
assert(oldMemRefRank + extraIndices.size() == newMemRefRank);
}
// Assert same elemental type.
assert(oldMemRef.getType().cast<MemRefType>().getElementType() ==
newMemRef.getType().cast<MemRefType>().getElementType());
std::unique_ptr<DominanceInfo> domInfo;
std::unique_ptr<PostDominanceInfo> postDomInfo;
if (domOpFilter)
domInfo =
std::make_unique<DominanceInfo>(domOpFilter->getParentOfType<FuncOp>());
if (postDomOpFilter)
postDomInfo = std::make_unique<PostDominanceInfo>(
postDomOpFilter->getParentOfType<FuncOp>());
// Walk all uses of old memref; collect ops to perform replacement. We use a
// DenseSet since an operation could potentially have multiple uses of a
// memref (although rare), and the replacement later is going to erase ops.
DenseSet<Operation *> opsToReplace;
for (auto *op : oldMemRef.getUsers()) {
// Skip this use if it's not dominated by domOpFilter.
if (domOpFilter && !domInfo->dominates(domOpFilter, op))
continue;
// Skip this use if it's not post-dominated by postDomOpFilter.
if (postDomOpFilter && !postDomInfo->postDominates(postDomOpFilter, op))
continue;
// Skip dealloc's - no replacement is necessary, and a memref replacement
// at other uses doesn't hurt these dealloc's.
if (isa<memref::DeallocOp>(op) && !replaceInDeallocOp)
continue;
// Check if the memref was used in a non-dereferencing context. It is fine
// for the memref to be used in a non-dereferencing way outside of the
// region where this replacement is happening.
if (!isa<AffineMapAccessInterface>(*op)) {
if (!allowNonDereferencingOps) {
LLVM_DEBUG(llvm::dbgs()
<< "Memref replacement failed: non-deferencing memref op: \n"
<< *op << '\n');
return failure();
}
// Non-dereferencing ops with the MemRefsNormalizable trait are
// supported for replacement.
if (!op->hasTrait<OpTrait::MemRefsNormalizable>()) {
LLVM_DEBUG(llvm::dbgs() << "Memref replacement failed: use without a "
"memrefs normalizable trait: \n"
<< *op << '\n');
return failure();
}
}
// We'll first collect and then replace --- since replacement erases the op
// that has the use, and that op could be postDomFilter or domFilter itself!
opsToReplace.insert(op);
}
for (auto *op : opsToReplace) {
if (failed(replaceAllMemRefUsesWith(
oldMemRef, newMemRef, op, extraIndices, indexRemap, extraOperands,
symbolOperands, allowNonDereferencingOps)))
llvm_unreachable("memref replacement guaranteed to succeed here");
}
return success();
}
/// Given an operation, inserts one or more single result affine
/// apply operations, results of which are exclusively used by this
/// operation. The operands of these newly created affine apply ops are
/// guaranteed to be loop iterators or terminal symbols of a function.
///
/// Before
///
/// affine.for %i = 0 to #map(%N)
/// %idx = affine.apply (d0) -> (d0 mod 2) (%i)
/// "send"(%idx, %A, ...)
/// "compute"(%idx)
///
/// After
///
/// affine.for %i = 0 to #map(%N)
/// %idx = affine.apply (d0) -> (d0 mod 2) (%i)
/// "send"(%idx, %A, ...)
/// %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
/// "compute"(%idx_)
///
/// This allows applying different transformations on send and compute (e.g.,
/// different shifts/delays).
///
/// `sliceOps` is left empty either if none of opInst's operands were the
/// result of an affine.apply (and thus there was no affine computation slice
/// to create), or if all the affine.apply ops supplying operands to this
/// opInst had no uses besides this opInst; otherwise, the affine.apply
/// operations created are returned in the output argument `sliceOps`.
void mlir::createAffineComputationSlice(
Operation *opInst, SmallVectorImpl<AffineApplyOp> *sliceOps) {
// Collect all operands that are results of affine apply ops.
SmallVector<Value, 4> subOperands;
subOperands.reserve(opInst->getNumOperands());
for (auto operand : opInst->getOperands())
if (isa_and_nonnull<AffineApplyOp>(operand.getDefiningOp()))
subOperands.push_back(operand);
// Gather sequence of AffineApplyOps reachable from 'subOperands'.
SmallVector<Operation *, 4> affineApplyOps;
getReachableAffineApplyOps(subOperands, affineApplyOps);
// Skip transforming if there are no affine maps to compose.
if (affineApplyOps.empty())
return;
// Check if all uses of the affine apply ops lie only in this op, in which
// case there would be nothing to do.
bool localized = true;
for (auto *op : affineApplyOps) {
for (auto result : op->getResults()) {
for (auto *user : result.getUsers()) {
if (user != opInst) {
localized = false;
break;
}
}
}
}
if (localized)
return;
OpBuilder builder(opInst);
SmallVector<Value, 4> composedOpOperands(subOperands);
auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);
// Create an affine.apply for each of the map results.
sliceOps->reserve(composedMap.getNumResults());
for (auto resultExpr : composedMap.getResults()) {
auto singleResMap = AffineMap::get(composedMap.getNumDims(),
composedMap.getNumSymbols(), resultExpr);
sliceOps->push_back(builder.create<AffineApplyOp>(
opInst->getLoc(), singleResMap, composedOpOperands));
}
// Construct the new operands, which include the results from the composed
// affine apply ops above instead of the existing ones (subOperands). They
// differ from opInst's operands only for those in 'subOperands', which are
// replaced by the corresponding results from 'sliceOps'.
SmallVector<Value, 4> newOperands(opInst->getOperands());
for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
// Replace the subOperands from among the new operands.
unsigned j, f;
for (j = 0, f = subOperands.size(); j < f; j++) {
if (newOperands[i] == subOperands[j])
break;
}
if (j < subOperands.size()) {
newOperands[i] = (*sliceOps)[j];
}
}
for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) {
opInst->setOperand(idx, newOperands[idx]);
}
}
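// Example (hypothetical usage sketch): isolating the index computation that
// feeds a specific op so it can be transformed independently of other users:
//
//   SmallVector<AffineApplyOp, 4> sliceOps;
//   createAffineComputationSlice(opInst, &sliceOps);
//   if (!sliceOps.empty()) {
//     // `opInst` now consumes its own private affine.apply chain, which can
//     // be shifted/delayed without affecting other users.
//   }
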
/// Enum to set patterns of affine expr in tiled-layout map.
/// TileFloorDiv: <dim expr> floordiv <tile size>
/// TileMod: <dim expr> mod <tile size>
/// TileNone: None of the above
/// Example:
/// #tiled_2d_128x256 = affine_map<(d0, d1)
/// -> (d0 floordiv 128, d1 floordiv 256, d0 mod 128, d1 mod 256)>
/// "d0 floordiv 128" and "d1 floordiv 256" ==> TileFloorDiv
/// "d0 mod 128" and "d1 mod 256" ==> TileMod
enum TileExprPattern { TileFloorDiv, TileMod, TileNone };
/// Check if `map` is a tiled layout. In a tiled layout, specific k dimensions
/// that are floordiv'ed by their respective tile sizes also appear in a mod
/// with the same tile sizes, and no other expression involves those k
/// dimensions. This function stores a vector of tuples (`tileSizePos`)
/// holding the AffineExpr for each tile size and the positions of the
/// corresponding `floordiv` and `mod`. If `map` is not a tiled layout, an
/// empty vector is returned.
static LogicalResult getTileSizePos(
AffineMap map,
SmallVectorImpl<std::tuple<AffineExpr, unsigned, unsigned>> &tileSizePos) {
// Create `floordivExprs`, a vector of tuples holding the LHS and RHS of each
// `floordiv` and its position in the `map` output.
// Example: #tiled_2d_128x256 = affine_map<(d0, d1)
// -> (d0 floordiv 128, d1 floordiv 256, d0 mod 128, d1 mod 256)>
// In this example, `floordivExprs` includes {d0, 128, 0} and {d1, 256, 1}.
SmallVector<std::tuple<AffineExpr, AffineExpr, unsigned>, 4> floordivExprs;
unsigned pos = 0;
for (AffineExpr expr : map.getResults()) {
if (expr.getKind() == AffineExprKind::FloorDiv) {
AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
if (binaryExpr.getRHS().isa<AffineConstantExpr>())
floordivExprs.emplace_back(
std::make_tuple(binaryExpr.getLHS(), binaryExpr.getRHS(), pos));
}
pos++;
}
// `map` is not a tiled layout if `floordivExprs` is empty.
if (floordivExprs.empty()) {
tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
return success();
}
// Check if the LHS of each `floordiv` is also used in the LHS of a `mod`. If
// not, `map` is not a tiled layout.
for (std::tuple<AffineExpr, AffineExpr, unsigned> fexpr : floordivExprs) {
AffineExpr floordivExprLHS = std::get<0>(fexpr);
AffineExpr floordivExprRHS = std::get<1>(fexpr);
unsigned floordivPos = std::get<2>(fexpr);
// Walk each affine expr of `map`'s output except `fexpr`, and check if the
// LHS and RHS of `fexpr` are used in the LHS and RHS of a `mod`. If the LHS
// of `fexpr` is used in any other expr, the map is not a tiled layout.
// Examples of non-tiled layouts:
// affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 floordiv 256)>
// affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 128)>
// affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 256, d2 mod
// 256)>
bool found = false;
pos = 0;
for (AffineExpr expr : map.getResults()) {
bool notTiled = false;
if (pos != floordivPos) {
expr.walk([&](AffineExpr e) {
if (e == floordivExprLHS) {
if (expr.getKind() == AffineExprKind::Mod) {
AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
// If the LHS and RHS of the `mod` are the same as those of the floordiv.
if (floordivExprLHS == binaryExpr.getLHS() &&
floordivExprRHS == binaryExpr.getRHS()) {
// Save the tile size (the RHS of the `mod`) and the positions of the
// `floordiv` and `mod` if a matching `mod` has not been found yet.
if (!found) {
tileSizePos.emplace_back(
std::make_tuple(binaryExpr.getRHS(), floordivPos, pos));
found = true;
} else {
// Non-tiled layout: multiple `mod`s with the same LHS.
// e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
// mod 256, d2 mod 256)>
notTiled = true;
}
} else {
// Non-tiled layout: the RHS of the `mod` differs from that of the
// `floordiv`. e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
// d2 mod 128)>
notTiled = true;
}
} else {
// Non-tiled layout: the LHS is the same, but the expression is not a
// `mod`. e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
// floordiv 256)>
notTiled = true;
}
}
});
}
if (notTiled) {
tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
return success();
}
pos++;
}
}
return success();
}
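// Example (illustrative): for the tiled layout
//   #tiled_2d_128x256 = affine_map<(d0, d1)
//     -> (d0 floordiv 128, d1 floordiv 256, d0 mod 128, d1 mod 256)>
// `tileSizePos` would hold {128, 0, 2} and {256, 1, 3}: tile size 128 with
// its floordiv at result position 0 and its mod at position 2, and tile size
// 256 with its floordiv at position 1 and its mod at position 3.
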
/// Check if the `dim` dimension of a memrefType with `layoutMap` becomes
/// dynamic after normalization. Dimensions that include dynamic dimensions in
/// the map output will become dynamic dimensions. Returns true if `dim` is a
/// dynamic dimension.
///
/// Example:
/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
///
/// If d1 is a dynamic dimension, the 2nd and 3rd dimensions of the map output
/// are dynamic.
/// memref<4x?xf32, #map0> ==> memref<4x?x?xf32>
static bool
isNormalizedMemRefDynamicDim(unsigned dim, AffineMap layoutMap,
SmallVectorImpl<unsigned> &inMemrefTypeDynDims,
MLIRContext *context) {
bool isDynamicDim = false;
AffineExpr expr = layoutMap.getResults()[dim];
// Check if affine expr of the dimension includes dynamic dimension of input
// memrefType.
expr.walk([&inMemrefTypeDynDims, &isDynamicDim, &context](AffineExpr e) {
if (e.isa<AffineDimExpr>()) {
for (unsigned dm : inMemrefTypeDynDims) {
if (e == getAffineDimExpr(dm, context)) {
isDynamicDim = true;
}
}
}
});
return isDynamicDim;
}
/// Create affine expr to calculate dimension size for a tiled-layout map.
static AffineExpr createDimSizeExprForTiledLayout(AffineExpr oldMapOutput,
TileExprPattern pat) {
// Create map output for the patterns.
// "floordiv <tile size>" ==> "ceildiv <tile size>"
// "mod <tile size>" ==> "<tile size>"
AffineExpr newMapOutput;
AffineBinaryOpExpr binaryExpr = nullptr;
switch (pat) {
case TileExprPattern::TileMod:
binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
newMapOutput = binaryExpr.getRHS();
break;
case TileExprPattern::TileFloorDiv:
binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
newMapOutput = getAffineBinaryOpExpr(
AffineExprKind::CeilDiv, binaryExpr.getLHS(), binaryExpr.getRHS());
break;
default:
newMapOutput = oldMapOutput;
}
return newMapOutput;
}
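// Example (from the patterns above): for
//   #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
// "d1 floordiv 32" (TileFloorDiv) becomes "d1 ceildiv 32", "d1 mod 32"
// (TileMod) becomes "32", and "d0" (TileNone) is returned unchanged.
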
/// Create new maps to calculate each dimension size of `newMemRefType`, and
/// create `newDynamicSizes` from them by using AffineApplyOp.
///
/// Steps for normalizing dynamic memrefs for a tiled layout map
/// Example:
/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
/// %0 = dim %arg0, %c1 :memref<4x?xf32>
/// %1 = alloc(%0) : memref<4x?xf32, #map0>
///
/// (Before this function)
/// 1. Check if `map`(#map0) is a tiled layout using `getTileSizePos()`. Only a
/// single layout map is supported.
///
/// 2. Create normalized memrefType using `isNormalizedMemRefDynamicDim()`. It
/// is memref<4x?x?xf32> in the above example.
///
/// (In this function)
/// 3. Create new maps to calculate each dimension of the normalized memrefType
/// using `createDimSizeExprForTiledLayout()`. In the tiled layout, the
/// dimension size can be calculated by replacing "floordiv <tile size>" with
/// "ceildiv <tile size>" and "mod <tile size>" with "<tile size>".
/// - New maps in the above example
/// #map0 = affine_map<(d0, d1) -> (d0)>
/// #map1 = affine_map<(d0, d1) -> (d1 ceildiv 32)>
/// #map2 = affine_map<(d0, d1) -> (32)>
///
/// 4. Create AffineApplyOp to apply the new maps. The output of AffineApplyOp
/// is used in dynamicSizes of new AllocOp.
/// %0 = dim %arg0, %c1 : memref<4x?xf32>
/// %c4 = arith.constant 4 : index
/// %1 = affine.apply #map1(%c4, %0)
/// %2 = affine.apply #map2(%c4, %0)
static void createNewDynamicSizes(MemRefType oldMemRefType,
MemRefType newMemRefType, AffineMap map,
memref::AllocOp *allocOp, OpBuilder b,
SmallVectorImpl<Value> &newDynamicSizes) {
// Create new input for AffineApplyOp.
SmallVector<Value, 4> inAffineApply;
ArrayRef<int64_t> oldMemRefShape = oldMemRefType.getShape();
unsigned dynIdx = 0;
for (unsigned d = 0; d < oldMemRefType.getRank(); ++d) {
if (oldMemRefShape[d] < 0) {
// Use dynamicSizes of allocOp for dynamic dimension.
inAffineApply.emplace_back(allocOp->dynamicSizes()[dynIdx]);
dynIdx++;
} else {
// Create ConstantOp for static dimension.
Attribute constantAttr =
b.getIntegerAttr(b.getIndexType(), oldMemRefShape[d]);
inAffineApply.emplace_back(
b.create<arith::ConstantOp>(allocOp->getLoc(), constantAttr));
}
}
// Create a new map to calculate each dimension size of the new memref for
// each original map output, but only for the dynamic dimensions of
// `newMemRefType`.
unsigned newDimIdx = 0;
ArrayRef<int64_t> newMemRefShape = newMemRefType.getShape();
SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
(void)getTileSizePos(map, tileSizePos);
for (AffineExpr expr : map.getResults()) {
if (newMemRefShape[newDimIdx] < 0) {
// Create new maps to calculate each dimension size of new memref.
enum TileExprPattern pat = TileExprPattern::TileNone;
for (auto pos : tileSizePos) {
if (newDimIdx == std::get<1>(pos))
pat = TileExprPattern::TileFloorDiv;
else if (newDimIdx == std::get<2>(pos))
pat = TileExprPattern::TileMod;
}
AffineExpr newMapOutput = createDimSizeExprForTiledLayout(expr, pat);
AffineMap newMap =
AffineMap::get(map.getNumInputs(), map.getNumSymbols(), newMapOutput);
Value affineApp =
b.create<AffineApplyOp>(allocOp->getLoc(), newMap, inAffineApply);
newDynamicSizes.emplace_back(affineApp);
}
newDimIdx++;
}
}
// TODO: Currently works only for static memrefs and dynamic memrefs with a
// tiled-layout map, each with a single layout map.
LogicalResult mlir::normalizeMemRef(memref::AllocOp *allocOp) {
MemRefType memrefType = allocOp->getType();
OpBuilder b(*allocOp);
// Fetch a new memref type after normalizing the old memref to have an
// identity map layout.
MemRefType newMemRefType =
normalizeMemRefType(memrefType, b, allocOp->symbolOperands().size());
if (newMemRefType == memrefType)
// Either memrefType already had an identity map or the map couldn't be
// transformed to an identity map.
return failure();
Value oldMemRef = allocOp->getResult();
SmallVector<Value, 4> symbolOperands(allocOp->symbolOperands());
AffineMap layoutMap = memrefType.getLayout().getAffineMap();
memref::AllocOp newAlloc;
// Check if `layoutMap` is a tiled layout. Only a single layout map is
// supported for normalizing dynamic memrefs.
SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
(void)getTileSizePos(layoutMap, tileSizePos);
if (newMemRefType.getNumDynamicDims() > 0 && !tileSizePos.empty()) {
MemRefType oldMemRefType = oldMemRef.getType().cast<MemRefType>();
SmallVector<Value, 4> newDynamicSizes;
createNewDynamicSizes(oldMemRefType, newMemRefType, layoutMap, allocOp, b,
newDynamicSizes);
// Add the new dynamic sizes in new AllocOp.
newAlloc =
b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
newDynamicSizes, allocOp->alignmentAttr());
} else {
newAlloc = b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
allocOp->alignmentAttr());
}
// Replace all uses of the old memref.
if (failed(replaceAllMemRefUsesWith(oldMemRef, /*newMemRef=*/newAlloc,
/*extraIndices=*/{},
/*indexRemap=*/layoutMap,
/*extraOperands=*/{},
/*symbolOperands=*/symbolOperands,
/*domOpFilter=*/nullptr,
/*postDomOpFilter=*/nullptr,
/*allowNonDereferencingOps=*/true))) {
// If it failed (due to escapes for example), bail out.
newAlloc.erase();
return failure();
}
// Replace any uses of the original alloc op and erase it. All remaining uses
// have to be deallocs; replaceAllMemRefUsesWith above would've failed
// otherwise.
assert(llvm::all_of(oldMemRef.getUsers(), [](Operation *op) {
return isa<memref::DeallocOp>(op);
}));
oldMemRef.replaceAllUsesWith(newAlloc);
allocOp->erase();
return success();
}
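// Example (illustrative): normalizing a static alloc with a tiled layout map
//
//   #tile = affine_map<(d0) -> (d0 floordiv 32, d0 mod 32)>
//   %0 = memref.alloc() : memref<64xf32, #tile>
//
// rewrites the alloc (and all dereferencing uses, via
// replaceAllMemRefUsesWith) to use an identity-layout memref:
//
//   %0 = memref.alloc() : memref<2x32xf32>
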
MemRefType mlir::normalizeMemRefType(MemRefType memrefType, OpBuilder b,
unsigned numSymbolicOperands) {
unsigned rank = memrefType.getRank();
if (rank == 0)
return memrefType;
if (memrefType.getLayout().isIdentity()) {
// Either no map is associated with this memref or this memref has
// a trivial (identity) map.
return memrefType;
}
AffineMap layoutMap = memrefType.getLayout().getAffineMap();
// We don't do any checks for one-to-one'ness; we assume that it is
// one-to-one.
// Normalize only static memrefs and dynamic memrefs with a tiled-layout map
// for now.
// TODO: Normalize the other types of dynamic memrefs.
SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
(void)getTileSizePos(layoutMap, tileSizePos);
if (memrefType.getNumDynamicDims() > 0 && tileSizePos.empty())
return memrefType;
// We have a single map that is not an identity map. Create a new memref
// with the right shape and an identity layout map.
ArrayRef<int64_t> shape = memrefType.getShape();
// FlatAffineConstraints may later on use the symbolic operands.
FlatAffineConstraints fac(rank, numSymbolicOperands);
SmallVector<unsigned, 4> memrefTypeDynDims;
for (unsigned d = 0; d < rank; ++d) {
// Use the constraint system only for static dimensions.
if (shape[d] > 0) {
fac.addBound(FlatAffineConstraints::LB, d, 0);
fac.addBound(FlatAffineConstraints::UB, d, shape[d] - 1);
} else {
memrefTypeDynDims.emplace_back(d);
}
}
// We compose this map with the original index (logical) space to derive
// the upper bounds for the new index space.
unsigned newRank = layoutMap.getNumResults();
if (failed(fac.composeMatchingMap(layoutMap)))
return memrefType;
// TODO: Handle semi-affine maps.
// Project out the old data dimensions.
fac.projectOut(newRank, fac.getNumIds() - newRank - fac.getNumLocalIds());
SmallVector<int64_t, 4> newShape(newRank);
for (unsigned d = 0; d < newRank; ++d) {
// Check if each dimension of normalized memrefType is dynamic.
bool isDynDim = isNormalizedMemRefDynamicDim(
d, layoutMap, memrefTypeDynDims, b.getContext());
if (isDynDim) {
// A dynamic dimension of the new memref type is encoded as -1.
newShape[d] = -1;
} else {
// The lower bound for the shape is always zero.
auto ubConst = fac.getConstantBound(FlatAffineConstraints::UB, d);
// For a static memref and an affine map with no symbols, this is
// always bounded.
assert(ubConst.hasValue() && "should always have an upper bound");
if (ubConst.getValue() < 0)
// This is due to an invalid map that maps to a negative space.
return memrefType;
newShape[d] = ubConst.getValue() + 1;
}
}
// Create the new memref type after trivializing the old layout map.
MemRefType newMemRefType =
MemRefType::Builder(memrefType)
.setShape(newShape)
.setLayout(AffineMapAttr::get(b.getMultiDimIdentityMap(newRank)));
return newMemRefType;
}
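// Example (illustrative): with
//   #map = affine_map<(d0, d1) -> (d0 floordiv 32, d1, d0 mod 32)>
// memref<64x128xf32, #map> normalizes to memref<2x128x32xf32>: for the
// "d0 floordiv 32" result, d0 in [0, 63] yields an upper bound of 1, so the
// new dimension size is ubConst + 1 = 2.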

View File

@@ -3,6 +3,8 @@ add_mlir_library(MLIRAffineTransformsTestPasses
TestAffineDataCopy.cpp
TestAffineLoopUnswitching.cpp
TestAffineLoopParametricTiling.cpp
TestLoopFusion.cpp
TestLoopMapping.cpp
TestLoopPermutation.cpp
TestVectorizationUtils.cpp

View File

@@ -13,10 +13,10 @@
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#define PASS_NAME "test-affine-data-copy"

View File

@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Transforms/LoopUtils.h"
using namespace mlir;

View File

@@ -12,11 +12,10 @@
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopFusionUtils.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopFusionUtils.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#define DEBUG_TYPE "test-loop-fusion"

View File

@@ -12,11 +12,10 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/ADT/SetVector.h"

View File

@@ -12,9 +12,8 @@
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#define PASS_NAME "test-loop-permutation"

View File

@@ -14,6 +14,7 @@
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
@@ -21,7 +22,6 @@
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/ADT/STLExtras.h"

View File

@@ -1,5 +1,7 @@
# Exclude tests from libMLIR.so
add_mlir_library(MLIRSCFTestPasses
TestLoopParametricTiling.cpp
TestLoopUnrolling.cpp
TestSCFUtils.cpp
EXCLUDE_FROM_LIBMLIR

View File

@@ -11,10 +11,9 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
using namespace mlir;
@@ -31,8 +30,7 @@ public:
}
StringRef getDescription() const final {
return "test application of parametric tiling to the outer loops so that "
"the "
"ranges of outer loops become static";
"the ranges of outer loops become static";
}
SimpleParametricLoopTilingPass() = default;
SimpleParametricLoopTilingPass(const SimpleParametricLoopTilingPass &) {}

View File

@@ -12,11 +12,10 @@
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
using namespace mlir;

View File

@@ -2,10 +2,6 @@
add_mlir_library(MLIRTestTransforms
TestConstantFold.cpp
TestInlining.cpp
TestLoopFusion.cpp
TestLoopMapping.cpp
TestLoopParametricTiling.cpp
TestLoopUnrolling.cpp
EXCLUDE_FROM_LIBMLIR

View File

@@ -9,7 +9,6 @@
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/FoldUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
using namespace mlir;

View File

@@ -4,4 +4,5 @@ add_mlir_unittest(MLIRTransformsTests
)
target_link_libraries(MLIRTransformsTests
PRIVATE
MLIRParser
MLIRTransforms)