The only benefit of FunctionPass is that it filters out function declarations. This isn't enough to justify carrying it around, as we can simplify filter out declarations when necessary within the pass. We can also explore with better scheduling primitives to filter out declarations at the pipeline level in the future. The definition of FunctionPass is left intact for now to allow time for downstream users to migrate. Differential Revision: https://reviews.llvm.org/D117182
215 lines
9.0 KiB
C++
215 lines
9.0 KiB
C++
//===- ParallelLoopTiling.cpp - Tiles scf.parallel ---------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file implements loop tiling on parallel loops.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "PassDetail.h"
|
|
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
|
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
|
|
#include "mlir/Dialect/SCF/Passes.h"
|
|
#include "mlir/Dialect/SCF/SCF.h"
|
|
#include "mlir/Dialect/SCF/Transforms.h"
|
|
#include "mlir/Dialect/SCF/Utils.h"
|
|
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
|
|
|
using namespace mlir;
|
|
using namespace mlir::scf;
|
|
|
|
/// Tile a parallel loop of the form
|
|
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
|
|
/// step (%arg4, %arg5)
|
|
///
|
|
/// into
|
|
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
|
|
/// step (%arg4*tileSize[0],
|
|
/// %arg5*tileSize[1])
|
|
/// scf.parallel (%j0, %j1) = (0, 0) to (min(%arg4*tileSize[0], %arg2-%i0)
|
|
/// min(%arg5*tileSize[1], %arg3-%i1))
|
|
/// step (%arg4, %arg5)
|
|
///
|
|
/// or, when no-min-max-bounds is true, into
|
|
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
|
|
/// step (%arg4*tileSize[0],
|
|
/// %arg5*tileSize[1])
|
|
/// scf.parallel (%j0, %j1) = (0, 0) to (%arg4*tileSize[0],
|
|
/// %arg5*tileSize[1])
|
|
/// step (%arg4, %arg5)
|
|
/// %inbound = (%j0 * %arg4 + %i0 < %arg2) &&
|
|
/// (%j1 * %arg5 + %i1 < %arg3)
|
|
/// scf.if (%inbound)
|
|
/// ....
|
|
///
|
|
/// where the uses of %i0 and %i1 in the loop body are replaced by
|
|
/// %i0 + j0 and %i1 + %j1.
|
|
//
|
|
/// The old loop is replaced with the new one.
|
|
std::pair<ParallelOp, ParallelOp>
|
|
mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef<int64_t> tileSizes,
|
|
bool noMinMaxBounds) {
|
|
OpBuilder b(op);
|
|
auto zero = b.create<arith::ConstantIndexOp>(op.getLoc(), 0);
|
|
SmallVector<Value, 2> tileSizeConstants;
|
|
tileSizeConstants.reserve(op.getUpperBound().size());
|
|
for (size_t i = 0, end = op.getUpperBound().size(); i != end; ++i) {
|
|
if (i < tileSizes.size())
|
|
tileSizeConstants.push_back(
|
|
b.create<arith::ConstantIndexOp>(op.getLoc(), tileSizes[i]));
|
|
else
|
|
// Just pick 1 for the remaining dimensions.
|
|
tileSizeConstants.push_back(
|
|
b.create<arith::ConstantIndexOp>(op.getLoc(), 1));
|
|
}
|
|
|
|
// Create the outer loop with adjusted steps.
|
|
SmallVector<Value, 2> newSteps;
|
|
newSteps.reserve(op.getStep().size());
|
|
for (auto step : llvm::zip(op.getStep(), tileSizeConstants)) {
|
|
newSteps.push_back(b.create<arith::MulIOp>(op.getLoc(), std::get<0>(step),
|
|
std::get<1>(step)));
|
|
}
|
|
auto outerLoop = b.create<ParallelOp>(op.getLoc(), op.getLowerBound(),
|
|
op.getUpperBound(), newSteps);
|
|
b.setInsertionPointToStart(outerLoop.getBody());
|
|
|
|
// Compute min(size, dim - offset) to avoid out-of-bounds accesses.
|
|
auto minMap = AffineMap::get(
|
|
/*dimCount=*/3, /*symbolCount=*/0,
|
|
{getAffineDimExpr(/*position=*/0, b.getContext()),
|
|
getAffineDimExpr(/*position=*/1, b.getContext()) -
|
|
getAffineDimExpr(/*position=*/2, b.getContext())},
|
|
b.getContext());
|
|
|
|
// Create the inner loop with adjusted bounds.
|
|
SmallVector<Value, 2> newBounds;
|
|
newBounds.reserve(op.getUpperBound().size());
|
|
bool needInboundCheck = false;
|
|
for (auto dim :
|
|
llvm::zip(outerLoop.getLowerBound(), outerLoop.getUpperBound(),
|
|
outerLoop.getStep(), outerLoop.getInductionVars(),
|
|
op.getStep(), tileSizeConstants)) {
|
|
Value lowerBound, upperBound, newStep, iv, step, tileSizeConstant;
|
|
std::tie(lowerBound, upperBound, newStep, iv, step, tileSizeConstant) = dim;
|
|
// Collect the statically known loop bounds
|
|
auto lowerBoundConstant =
|
|
dyn_cast_or_null<arith::ConstantIndexOp>(lowerBound.getDefiningOp());
|
|
auto upperBoundConstant =
|
|
dyn_cast_or_null<arith::ConstantIndexOp>(upperBound.getDefiningOp());
|
|
auto stepConstant =
|
|
dyn_cast_or_null<arith::ConstantIndexOp>(step.getDefiningOp());
|
|
auto tileSize =
|
|
cast<arith::ConstantIndexOp>(tileSizeConstant.getDefiningOp()).value();
|
|
// If the loop bounds and the loop step are constant and if the number of
|
|
// loop iterations is an integer multiple of the tile size, we use a static
|
|
// bound for the inner loop.
|
|
if (lowerBoundConstant && upperBoundConstant && stepConstant) {
|
|
auto numIterations = llvm::divideCeil(upperBoundConstant.value() -
|
|
lowerBoundConstant.value(),
|
|
stepConstant.value());
|
|
if (numIterations % tileSize == 0) {
|
|
newBounds.push_back(newStep);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// For InboundCheck mode, just use the variable outer step
|
|
if (noMinMaxBounds) {
|
|
newBounds.push_back(newStep);
|
|
needInboundCheck = true;
|
|
continue;
|
|
}
|
|
|
|
// Otherwise, we dynamically compute the bound for
|
|
// each iteration of the outer loop.
|
|
newBounds.push_back(
|
|
b.create<AffineMinOp>(op.getLoc(), b.getIndexType(), minMap,
|
|
ValueRange{newStep, upperBound, iv}));
|
|
}
|
|
auto innerLoop = b.create<ParallelOp>(
|
|
op.getLoc(), SmallVector<Value, 2>(newBounds.size(), zero), newBounds,
|
|
op.getStep());
|
|
|
|
if (noMinMaxBounds && needInboundCheck) {
|
|
b.setInsertionPointToStart(innerLoop.getBody());
|
|
// Insert in-bound check
|
|
Value inbound =
|
|
b.create<arith::ConstantIntOp>(op.getLoc(), 1, b.getIntegerType(1));
|
|
for (auto dim :
|
|
llvm::zip(outerLoop.getUpperBound(), outerLoop.getInductionVars(),
|
|
innerLoop.getInductionVars(), innerLoop.getStep())) {
|
|
Value outerUpperBound, outerIV, innerIV, innerStep;
|
|
std::tie(outerUpperBound, outerIV, innerIV, innerStep) = dim;
|
|
// %in_bound = %in_bound &&
|
|
// (%inner_iv * %inner_step + %outer_iv < %outer_upper_bound)
|
|
Value index = b.create<arith::AddIOp>(
|
|
op.getLoc(), b.create<arith::MulIOp>(op.getLoc(), innerIV, innerStep),
|
|
outerIV);
|
|
Value dimInbound = b.create<arith::CmpIOp>(
|
|
op.getLoc(), arith::CmpIPredicate::ult, index, outerUpperBound);
|
|
inbound = b.create<arith::AndIOp>(op.getLoc(), inbound, dimInbound);
|
|
}
|
|
auto ifInbound = b.create<IfOp>(op.getLoc(),
|
|
/*resultTypes*/ ArrayRef<Type>{}, inbound,
|
|
/*hasElseRegion*/ false);
|
|
ifInbound.getThenRegion().takeBody(op.getRegion());
|
|
Block &thenBlock = ifInbound.getThenRegion().front();
|
|
b.setInsertionPointToStart(innerLoop.getBody());
|
|
for (const auto &ivs : llvm::enumerate(llvm::zip(
|
|
innerLoop.getInductionVars(), outerLoop.getInductionVars()))) {
|
|
auto newIndex = b.create<arith::AddIOp>(
|
|
op.getLoc(), std::get<0>(ivs.value()), std::get<1>(ivs.value()));
|
|
thenBlock.getArgument(ivs.index())
|
|
.replaceAllUsesExcept(newIndex, newIndex);
|
|
}
|
|
thenBlock.eraseArguments(llvm::to_vector<4>(
|
|
llvm::seq((unsigned)0, thenBlock.getNumArguments())));
|
|
} else {
|
|
innerLoop.getRegion().takeBody(op.getRegion());
|
|
b.setInsertionPointToStart(innerLoop.getBody());
|
|
for (auto ivs : llvm::zip(innerLoop.getInductionVars(),
|
|
outerLoop.getInductionVars())) {
|
|
Value innerIndex = std::get<0>(ivs);
|
|
auto newIndex = b.create<arith::AddIOp>(op.getLoc(), std::get<0>(ivs),
|
|
std::get<1>(ivs));
|
|
innerIndex.replaceAllUsesExcept(newIndex, newIndex);
|
|
}
|
|
}
|
|
|
|
op.erase();
|
|
return std::make_pair(outerLoop, innerLoop);
|
|
}
|
|
|
|
namespace {
|
|
struct ParallelLoopTiling
|
|
: public SCFParallelLoopTilingBase<ParallelLoopTiling> {
|
|
ParallelLoopTiling() = default;
|
|
explicit ParallelLoopTiling(ArrayRef<int64_t> tileSizes,
|
|
bool noMinMaxBounds = false) {
|
|
this->tileSizes = tileSizes;
|
|
this->noMinMaxBounds = noMinMaxBounds;
|
|
}
|
|
|
|
void runOnOperation() override {
|
|
SmallVector<ParallelOp, 2> innermostPloops;
|
|
getInnermostParallelLoops(getOperation().getOperation(), innermostPloops);
|
|
for (ParallelOp ploop : innermostPloops) {
|
|
// FIXME: Add reduction support.
|
|
if (ploop.getNumReductions() == 0)
|
|
tileParallelLoop(ploop, tileSizes, noMinMaxBounds);
|
|
}
|
|
}
|
|
};
|
|
} // namespace
|
|
|
|
std::unique_ptr<Pass>
|
|
mlir::createParallelLoopTilingPass(ArrayRef<int64_t> tileSizes,
|
|
bool noMinMaxBounds) {
|
|
return std::make_unique<ParallelLoopTiling>(tileSizes, noMinMaxBounds);
|
|
}
|