Files
clang-p2996/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp
Michele Scuttari 67d0d7ac0a [MLIR] Update pass declarations to new autogenerated files
The patch introduces the required changes to update the pass declarations and definitions to use the new autogenerated files and allow dropping the old infrastructure.

Reviewed By: mehdi_amini, rriddle

Differential Review: https://reviews.llvm.org/D132838
2022-08-31 12:28:45 +02:00

215 lines
9.0 KiB
C++

//===- ParallelLoopTiling.cpp - Tiles scf.parallel ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements loop tiling on parallel loops.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/SCF/Transforms/Passes.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
namespace mlir {
#define GEN_PASS_DEF_SCFPARALLELLOOPTILING
#include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
} // namespace mlir
using namespace mlir;
using namespace mlir::scf;
/// Tile a parallel loop of the form
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
/// step (%arg4, %arg5)
///
/// into
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
/// step (%arg4*tileSize[0],
/// %arg5*tileSize[1])
/// scf.parallel (%j0, %j1) = (0, 0) to (min(%arg4*tileSize[0], %arg2-%i0)
/// min(%arg5*tileSize[1], %arg3-%i1))
/// step (%arg4, %arg5)
///
/// or, when no-min-max-bounds is true, into
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
/// step (%arg4*tileSize[0],
/// %arg5*tileSize[1])
/// scf.parallel (%j0, %j1) = (0, 0) to (%arg4*tileSize[0],
/// %arg5*tileSize[1])
/// step (%arg4, %arg5)
/// %inbound = (%j0 * %arg4 + %i0 < %arg2) &&
/// (%j1 * %arg5 + %i1 < %arg3)
/// scf.if (%inbound)
/// ....
///
/// where the uses of %i0 and %i1 in the loop body are replaced by
/// %i0 + j0 and %i1 + %j1.
///
/// The old loop is replaced with the new one.
std::pair<ParallelOp, ParallelOp>
mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef<int64_t> tileSizes,
bool noMinMaxBounds) {
OpBuilder b(op);
auto zero = b.create<arith::ConstantIndexOp>(op.getLoc(), 0);
SmallVector<Value, 2> tileSizeConstants;
tileSizeConstants.reserve(op.getUpperBound().size());
for (size_t i = 0, end = op.getUpperBound().size(); i != end; ++i) {
if (i < tileSizes.size())
tileSizeConstants.push_back(
b.create<arith::ConstantIndexOp>(op.getLoc(), tileSizes[i]));
else
// Just pick 1 for the remaining dimensions.
tileSizeConstants.push_back(
b.create<arith::ConstantIndexOp>(op.getLoc(), 1));
}
// Create the outer loop with adjusted steps.
SmallVector<Value, 2> newSteps;
newSteps.reserve(op.getStep().size());
for (auto step : llvm::zip(op.getStep(), tileSizeConstants)) {
newSteps.push_back(b.create<arith::MulIOp>(op.getLoc(), std::get<0>(step),
std::get<1>(step)));
}
auto outerLoop = b.create<ParallelOp>(op.getLoc(), op.getLowerBound(),
op.getUpperBound(), newSteps);
b.setInsertionPointToStart(outerLoop.getBody());
// Compute min(size, dim - offset) to avoid out-of-bounds accesses.
auto minMap = AffineMap::get(
/*dimCount=*/3, /*symbolCount=*/0,
{getAffineDimExpr(/*position=*/0, b.getContext()),
getAffineDimExpr(/*position=*/1, b.getContext()) -
getAffineDimExpr(/*position=*/2, b.getContext())},
b.getContext());
// Create the inner loop with adjusted bounds.
SmallVector<Value, 2> newBounds;
newBounds.reserve(op.getUpperBound().size());
bool needInboundCheck = false;
for (auto [lowerBound, upperBound, newStep, iv, step, tileSizeConstant] :
llvm::zip(outerLoop.getLowerBound(), outerLoop.getUpperBound(),
outerLoop.getStep(), outerLoop.getInductionVars(),
op.getStep(), tileSizeConstants)) {
// Collect the statically known loop bounds
auto lowerBoundConstant =
dyn_cast_or_null<arith::ConstantIndexOp>(lowerBound.getDefiningOp());
auto upperBoundConstant =
dyn_cast_or_null<arith::ConstantIndexOp>(upperBound.getDefiningOp());
auto stepConstant =
dyn_cast_or_null<arith::ConstantIndexOp>(step.getDefiningOp());
auto tileSize =
cast<arith::ConstantIndexOp>(tileSizeConstant.getDefiningOp()).value();
// If the loop bounds and the loop step are constant and if the number of
// loop iterations is an integer multiple of the tile size, we use a static
// bound for the inner loop.
if (lowerBoundConstant && upperBoundConstant && stepConstant) {
auto numIterations = llvm::divideCeil(upperBoundConstant.value() -
lowerBoundConstant.value(),
stepConstant.value());
if (numIterations % tileSize == 0) {
newBounds.push_back(newStep);
continue;
}
}
// For InboundCheck mode, just use the variable outer step
if (noMinMaxBounds) {
newBounds.push_back(newStep);
needInboundCheck = true;
continue;
}
// Otherwise, we dynamically compute the bound for
// each iteration of the outer loop.
newBounds.push_back(
b.create<AffineMinOp>(op.getLoc(), b.getIndexType(), minMap,
ValueRange{newStep, upperBound, iv}));
}
auto innerLoop = b.create<ParallelOp>(
op.getLoc(), SmallVector<Value, 2>(newBounds.size(), zero), newBounds,
op.getStep());
if (noMinMaxBounds && needInboundCheck) {
b.setInsertionPointToStart(innerLoop.getBody());
// Insert in-bound check
Value inbound =
b.create<arith::ConstantIntOp>(op.getLoc(), 1, b.getIntegerType(1));
for (auto [outerUpperBound, outerIV, innerIV, innerStep] :
llvm::zip(outerLoop.getUpperBound(), outerLoop.getInductionVars(),
innerLoop.getInductionVars(), innerLoop.getStep())) {
// %in_bound = %in_bound &&
// (%inner_iv * %inner_step + %outer_iv < %outer_upper_bound)
Value index = b.create<arith::AddIOp>(
op.getLoc(), b.create<arith::MulIOp>(op.getLoc(), innerIV, innerStep),
outerIV);
Value dimInbound = b.create<arith::CmpIOp>(
op.getLoc(), arith::CmpIPredicate::ult, index, outerUpperBound);
inbound = b.create<arith::AndIOp>(op.getLoc(), inbound, dimInbound);
}
auto ifInbound = b.create<IfOp>(op.getLoc(),
/*resultTypes*/ ArrayRef<Type>{}, inbound,
/*hasElseRegion*/ false);
ifInbound.getThenRegion().takeBody(op.getRegion());
Block &thenBlock = ifInbound.getThenRegion().front();
b.setInsertionPointToStart(innerLoop.getBody());
for (const auto &ivs : llvm::enumerate(llvm::zip(
innerLoop.getInductionVars(), outerLoop.getInductionVars()))) {
auto newIndex = b.create<arith::AddIOp>(
op.getLoc(), std::get<0>(ivs.value()), std::get<1>(ivs.value()));
thenBlock.getArgument(ivs.index())
.replaceAllUsesExcept(newIndex, newIndex);
}
thenBlock.eraseArguments(0, thenBlock.getNumArguments());
} else {
innerLoop.getRegion().takeBody(op.getRegion());
b.setInsertionPointToStart(innerLoop.getBody());
for (auto ivs : llvm::zip(innerLoop.getInductionVars(),
outerLoop.getInductionVars())) {
Value innerIndex = std::get<0>(ivs);
auto newIndex = b.create<arith::AddIOp>(op.getLoc(), std::get<0>(ivs),
std::get<1>(ivs));
innerIndex.replaceAllUsesExcept(newIndex, newIndex);
}
}
op.erase();
return std::make_pair(outerLoop, innerLoop);
}
namespace {
struct ParallelLoopTiling
: public impl::SCFParallelLoopTilingBase<ParallelLoopTiling> {
ParallelLoopTiling() = default;
explicit ParallelLoopTiling(ArrayRef<int64_t> tileSizes,
bool noMinMaxBounds = false) {
this->tileSizes = tileSizes;
this->noMinMaxBounds = noMinMaxBounds;
}
void runOnOperation() override {
auto *parentOp = getOperation();
SmallVector<ParallelOp, 2> innermostPloops;
getInnermostParallelLoops(parentOp, innermostPloops);
for (ParallelOp ploop : innermostPloops) {
// FIXME: Add reduction support.
if (ploop.getNumReductions() == 0)
tileParallelLoop(ploop, tileSizes, noMinMaxBounds);
}
}
};
} // namespace
std::unique_ptr<Pass>
mlir::createParallelLoopTilingPass(ArrayRef<int64_t> tileSizes,
bool noMinMaxBounds) {
return std::make_unique<ParallelLoopTiling>(tileSizes, noMinMaxBounds);
}