Extends `do concurrent` mapping to handle "loop-local values". A loop-local value is one that is used exclusively inside the loop but allocated outside of it. This usually corresponds to temporary values that are used inside the loop body for initializing other variables, for example. After collecting these values, the pass localizes them to the loop nest by moving their allocations. PR stack: - https://github.com/llvm/llvm-project/pull/126026 - https://github.com/llvm/llvm-project/pull/127595 - https://github.com/llvm/llvm-project/pull/127633 - https://github.com/llvm/llvm-project/pull/127634 - https://github.com/llvm/llvm-project/pull/127635 (this PR)
583 lines
22 KiB
C++
583 lines
22 KiB
C++
//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "flang/Optimizer/Dialect/FIROps.h"
|
|
#include "flang/Optimizer/OpenMP/Passes.h"
|
|
#include "flang/Optimizer/OpenMP/Utils.h"
|
|
#include "mlir/Analysis/SliceAnalysis.h"
|
|
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
|
|
#include "mlir/IR/IRMapping.h"
|
|
#include "mlir/Transforms/DialectConversion.h"
|
|
#include "mlir/Transforms/RegionUtils.h"
|
|
|
|
namespace flangomp {
|
|
#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
|
|
#include "flang/Optimizer/OpenMP/Passes.h.inc"
|
|
} // namespace flangomp
|
|
|
|
#define DEBUG_TYPE "do-concurrent-conversion"
|
|
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")
|
|
|
|
namespace {
|
|
namespace looputils {
|
|
/// Stores info needed about the induction/iteration variable for each `do
|
|
/// concurrent` in a loop nest.
|
|
struct InductionVariableInfo {
|
|
InductionVariableInfo(fir::DoLoopOp doLoop) { populateInfo(doLoop); }
|
|
|
|
/// The operation allocating memory for iteration variable.
|
|
mlir::Operation *iterVarMemDef;
|
|
/// the operation(s) updating the iteration variable with the current
|
|
/// iteration number.
|
|
llvm::SmallVector<mlir::Operation *, 2> indVarUpdateOps;
|
|
|
|
private:
|
|
/// For the \p doLoop parameter, find the following:
|
|
///
|
|
/// 1. The operation that declares its iteration variable or allocates memory
|
|
/// for it. For example, give the following loop:
|
|
/// ```
|
|
/// ...
|
|
/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
|
|
/// ...
|
|
/// fir.do_loop %ind_var = %lb to %ub step %s unordered {
|
|
/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
|
|
/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
|
|
/// ...
|
|
/// }
|
|
/// ```
|
|
///
|
|
/// This function sets the `iterVarMemDef` member to the `hlfir.declare` op
|
|
/// for `%i`.
|
|
///
|
|
/// 2. The operation(s) that update the loop's iteration variable from its
|
|
/// induction variable. For the above example, the `indVarUpdateOps` is
|
|
/// populated with the first 2 ops in the loop's body.
|
|
///
|
|
/// Note: The current implementation is dependent on how flang emits loop
|
|
/// bodies; which is sufficient for the current simple test/use cases. If this
|
|
/// proves to be insufficient, this should be made more generic.
|
|
void populateInfo(fir::DoLoopOp doLoop) {
|
|
mlir::Value result = nullptr;
|
|
|
|
// Checks if a StoreOp is updating the memref of the loop's iteration
|
|
// variable.
|
|
auto isStoringIV = [&](fir::StoreOp storeOp) {
|
|
// Direct store into the IV memref.
|
|
if (storeOp.getValue() == doLoop.getInductionVar()) {
|
|
indVarUpdateOps.push_back(storeOp);
|
|
return true;
|
|
}
|
|
|
|
// Indirect store into the IV memref.
|
|
if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
|
|
storeOp.getValue().getDefiningOp())) {
|
|
if (convertOp.getOperand() == doLoop.getInductionVar()) {
|
|
indVarUpdateOps.push_back(convertOp);
|
|
indVarUpdateOps.push_back(storeOp);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
};
|
|
|
|
for (mlir::Operation &op : doLoop) {
|
|
if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
|
|
if (isStoringIV(storeOp)) {
|
|
result = storeOp.getMemref();
|
|
break;
|
|
}
|
|
}
|
|
|
|
assert(result != nullptr && result.getDefiningOp() != nullptr);
|
|
iterVarMemDef = result.getDefiningOp();
|
|
}
|
|
};
|
|
|
|
/// Maps each loop in a loop nest (from outermost to innermost, insertion
/// order preserved by MapVector) to the info about its induction/iteration
/// variable.
using LoopNestToIndVarMap =
    llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
|
|
|
|
/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
|
|
/// there are no operations in \p outerloop's body other than:
|
|
///
|
|
/// 1. the operations needed to assign/update \p outerLoop's induction variable.
|
|
/// 2. \p innerLoop itself.
|
|
///
|
|
/// \p return true if \p innerLoop is perfectly nested inside \p outerLoop
|
|
/// according to the above definition.
|
|
bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
|
|
mlir::ForwardSliceOptions forwardSliceOptions;
|
|
forwardSliceOptions.inclusive = true;
|
|
// The following will be used as an example to clarify the internals of this
|
|
// function:
|
|
// ```
|
|
// 1. fir.do_loop %i_idx = %34 to %36 step %c1 unordered {
|
|
// 2. %i_idx_2 = fir.convert %i_idx : (index) -> i32
|
|
// 3. fir.store %i_idx_2 to %i_iv#1 : !fir.ref<i32>
|
|
//
|
|
// 4. fir.do_loop %j_idx = %37 to %39 step %c1_3 unordered {
|
|
// 5. %j_idx_2 = fir.convert %j_idx : (index) -> i32
|
|
// 6. fir.store %j_idx_2 to %j_iv#1 : !fir.ref<i32>
|
|
// ... loop nest body, possible uses %i_idx ...
|
|
// }
|
|
// }
|
|
// ```
|
|
// In this example, the `j` loop is perfectly nested inside the `i` loop and
|
|
// below is how we find that.
|
|
|
|
// We don't care about the outer-loop's induction variable's uses within the
|
|
// inner-loop, so we filter out these uses.
|
|
//
|
|
// This filter tells `getForwardSlice` (below) to only collect operations
|
|
// which produce results defined above (i.e. outside) the inner-loop's body.
|
|
//
|
|
// Since `outerLoop.getInductionVar()` is a block argument (to the
|
|
// outer-loop's body), the filter effectively collects uses of
|
|
// `outerLoop.getInductionVar()` inside the outer-loop but outside the
|
|
// inner-loop.
|
|
forwardSliceOptions.filter = [&](mlir::Operation *op) {
|
|
return mlir::areValuesDefinedAbove(op->getResults(), innerLoop.getRegion());
|
|
};
|
|
|
|
llvm::SetVector<mlir::Operation *> indVarSlice;
|
|
// The forward slice of the `i` loop's IV will be the 2 ops in line 1 & 2
|
|
// above. Uses of `%i_idx` inside the `j` loop are not collected because of
|
|
// the filter.
|
|
mlir::getForwardSlice(outerLoop.getInductionVar(), &indVarSlice,
|
|
forwardSliceOptions);
|
|
llvm::DenseSet<mlir::Operation *> indVarSet(indVarSlice.begin(),
|
|
indVarSlice.end());
|
|
|
|
llvm::DenseSet<mlir::Operation *> outerLoopBodySet;
|
|
// The following walk collects ops inside `outerLoop` that are **not**:
|
|
// * the outer-loop itself,
|
|
// * or the inner-loop,
|
|
// * or the `fir.result` op (the outer-loop's terminator).
|
|
//
|
|
// For the above example, this will also populate `outerLoopBodySet` with ops
|
|
// in line 1 & 2 since we skip the `i` loop, the `j` loop, and the terminator.
|
|
outerLoop.walk<mlir::WalkOrder::PreOrder>([&](mlir::Operation *op) {
|
|
if (op == outerLoop)
|
|
return mlir::WalkResult::advance();
|
|
|
|
if (op == innerLoop)
|
|
return mlir::WalkResult::skip();
|
|
|
|
if (mlir::isa<fir::ResultOp>(op))
|
|
return mlir::WalkResult::advance();
|
|
|
|
outerLoopBodySet.insert(op);
|
|
return mlir::WalkResult::advance();
|
|
});
|
|
|
|
// If `outerLoopBodySet` ends up having the same ops as `indVarSet`, then
|
|
// `outerLoop` only contains ops that setup its induction variable +
|
|
// `innerLoop` + the `fir.result` terminator. In other words, `innerLoop` is
|
|
// perfectly nested inside `outerLoop`.
|
|
bool result = (outerLoopBodySet == indVarSet);
|
|
mlir::Location loc = outerLoop.getLoc();
|
|
LLVM_DEBUG(DBGS() << "Loop pair starting at location " << loc << " is"
|
|
<< (result ? "" : " not") << " perfectly nested\n");
|
|
|
|
return result;
|
|
}
|
|
|
|
/// Starting with `currentLoop` collect a perfectly nested loop nest, if any.
|
|
/// This function collects as much as possible loops in the nest; it case it
|
|
/// fails to recognize a certain nested loop as part of the nest it just returns
|
|
/// the parent loops it discovered before.
|
|
mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
|
|
LoopNestToIndVarMap &loopNest) {
|
|
assert(currentLoop.getUnordered());
|
|
|
|
while (true) {
|
|
loopNest.insert({currentLoop, InductionVariableInfo(currentLoop)});
|
|
llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
|
|
|
|
for (auto nestedLoop : currentLoop.getRegion().getOps<fir::DoLoopOp>())
|
|
if (nestedLoop.getUnordered())
|
|
unorderedLoops.push_back(nestedLoop);
|
|
|
|
if (unorderedLoops.empty())
|
|
break;
|
|
|
|
// Having more than one unordered loop means that we are not dealing with a
|
|
// perfect loop nest (i.e. a mulit-range `do concurrent` loop); which is the
|
|
// case we are after here.
|
|
if (unorderedLoops.size() > 1)
|
|
return mlir::failure();
|
|
|
|
fir::DoLoopOp nestedUnorderedLoop = unorderedLoops.front();
|
|
|
|
if (!isPerfectlyNested(currentLoop, nestedUnorderedLoop))
|
|
return mlir::failure();
|
|
|
|
currentLoop = nestedUnorderedLoop;
|
|
}
|
|
|
|
return mlir::success();
|
|
}
|
|
|
|
/// Prepares the `fir.do_loop` nest to be easily mapped to OpenMP. In
|
|
/// particular, this function would take this input IR:
|
|
/// ```
|
|
/// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered {
|
|
/// fir.store %i_iv to %i#1 : !fir.ref<i32>
|
|
/// %j_lb = arith.constant 1 : i32
|
|
/// %j_ub = arith.constant 10 : i32
|
|
/// %j_step = arith.constant 1 : index
|
|
///
|
|
/// fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered {
|
|
/// fir.store %j_iv to %j#1 : !fir.ref<i32>
|
|
/// ...
|
|
/// }
|
|
/// }
|
|
/// ```
|
|
///
|
|
/// into the following form (using generic op form since the result is
|
|
/// technically an invalid `fir.do_loop` op):
|
|
///
|
|
/// ```
|
|
/// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({
|
|
/// ^bb0(%i_iv: index):
|
|
/// %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32
|
|
/// %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32
|
|
/// %j_step = "arith.constant"() <{value = 1 : index}> : () -> index
|
|
///
|
|
/// "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({
|
|
/// ^bb0(%new_i_iv: index, %new_j_iv: index):
|
|
/// "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref<i32>) -> ()
|
|
/// "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref<i32>) -> ()
|
|
/// ...
|
|
/// })
|
|
/// ```
|
|
///
|
|
/// What happened to the loop nest is the following:
|
|
///
|
|
/// * the innermost loop's entry block was updated from having one operand to
|
|
/// having `n` operands where `n` is the number of loops in the nest,
|
|
///
|
|
/// * the outer loop(s)' ops that update the IVs were sunk inside the innermost
|
|
/// loop (see the `"fir.store"(%new_i_iv, %i#1)` op above),
|
|
///
|
|
/// * the innermost loop's entry block's arguments were mapped in order from the
|
|
/// outermost to the innermost IV.
|
|
///
|
|
/// With this IR change, we can directly inline the innermost loop's region into
|
|
/// the newly generated `omp.loop_nest` op.
|
|
///
|
|
/// Note that this function has a pre-condition that \p loopNest consists of
|
|
/// perfectly nested loops; i.e. there are no in-between ops between 2 nested
|
|
/// loops except for the ops to setup the inner loop's LB, UB, and step. These
|
|
/// ops are handled/cloned by `genLoopNestClauseOps(..)`.
|
|
void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
                    looputils::LoopNestToIndVarMap &loopNest) {
  // A single loop (or an empty nest) requires no sinking at all.
  if (loopNest.size() <= 1)
    return;

  fir::DoLoopOp innermostLoop = loopNest.back().first;
  mlir::Region &innermostRegion = innermostLoop.getRegion();
  mlir::Operation &innermostFirstOp = innermostRegion.front().front();

  llvm::SmallVector<mlir::Type> outerIVTypes;
  llvm::SmallVector<mlir::Location> outerIVLocs;

  // Sink the IV update ops of every loop except the innermost one (hence the
  // `drop_end` usage) to the start of the innermost loop's body.
  for (auto &[doLoop, indVarInfo] : llvm::drop_end(loopNest)) {
    for (mlir::Operation *updateOp : indVarInfo.indVarUpdateOps)
      updateOp->moveBefore(&innermostFirstOp);

    mlir::Value outerIV = doLoop.getInductionVar();
    outerIVTypes.push_back(outerIV.getType());
    outerIVLocs.push_back(outerIV.getLoc());
  }

  // Extend the innermost entry block with arguments to represent the outer
  // IVs.
  innermostRegion.addArguments(outerIVTypes, outerIVLocs);

  // In reverse, remap the IVs of the loop nest from the old values to the new
  // ones. We do that in reverse since the first argument before this loop is
  // the old IV for the innermost loop. Therefore, we want to replace it first
  // before the old value (1st argument in the block) is remapped to be the IV
  // of the outermost loop in the nest.
  unsigned argOffset = 1;
  for (auto &[doLoop, _] : llvm::reverse(loopNest)) {
    mlir::BlockArgument newIV = innermostRegion.getArgument(
        innermostRegion.getNumArguments() - argOffset);
    doLoop.getInductionVar().replaceAllUsesWith(newIV);
    ++argOffset;
  }
}
|
|
|
|
/// Collects values that are local to a loop: "loop-local values". A loop-local
|
|
/// value is one that is used exclusively inside the loop but allocated outside
|
|
/// of it. This usually corresponds to temporary values that are used inside the
|
|
/// loop body for initializing other variables for example.
|
|
///
|
|
/// See `flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90` for an
|
|
/// example of why we need this.
|
|
///
|
|
/// \param [in] doLoop - the loop within which the function searches for values
|
|
/// used exclusively inside.
|
|
///
|
|
/// \param [out] locals - the list of loop-local values detected for \p doLoop.
|
|
void collectLoopLocalValues(fir::DoLoopOp doLoop,
                            llvm::SetVector<mlir::Value> &locals) {
  doLoop.walk([&](mlir::Operation *op) {
    for (mlir::Value operand : op->getOperands()) {
      // Already known to be loop-local; nothing more to do.
      if (locals.contains(operand))
        continue;

      // Only values backed by a `fir.alloca` are candidates.
      if (!mlir::isa_and_present<fir::AllocaOp>(operand.getDefiningOp()))
        continue;

      // Values defined inside the loop are not interesting since they do not
      // need to be localized.
      if (doLoop->isAncestor(operand.getDefiningOp()))
        continue;

      // A value is loop-local iff every single one of its users is inside the
      // loop.
      bool usedOnlyInsideLoop =
          llvm::all_of(operand.getUsers(), [&](mlir::Operation *user) {
            return doLoop->isAncestor(user);
          });

      if (usedOnlyInsideLoop)
        locals.insert(operand);
    }
  });
}
|
|
|
|
/// For a "loop-local" value \p local within a loop's scope, localizes that
|
|
/// value within the scope of the parallel region the loop maps to. Towards that
|
|
/// end, this function moves the allocation of \p local within \p allocRegion.
|
|
///
|
|
/// \param local - the value used exclusively within a loop's scope (see
|
|
/// collectLoopLocalValues).
|
|
///
|
|
/// \param allocRegion - the parallel region where \p local's allocation will be
|
|
/// privatized.
|
|
///
|
|
/// \param rewriter - builder used for updating \p allocRegion.
|
|
static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion,
                                   mlir::ConversionPatternRewriter &rewriter) {
  // Hoist the op that defines (allocates) `local` so it sits at the very
  // beginning of `allocRegion`, effectively privatizing the allocation.
  mlir::Operation *definingOp = local.getDefiningOp();
  mlir::Operation &regionFirstOp = allocRegion.front().front();
  rewriter.moveOpBefore(definingOp, &regionFirstOp);
}
|
|
} // namespace looputils
|
|
|
|
/// Conversion pattern that maps an `unordered` `fir.do_loop` (i.e. a `do
/// concurrent` loop) nest to an `omp.parallel` + `omp.wsloop` +
/// `omp.loop_nest` construct.
class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
public:
  using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;

  /// \param context - MLIR context the pattern is registered in.
  /// \param mapToDevice - if true, map loops to the device (not implemented
  /// yet); otherwise, map to the host.
  /// \param concurrentLoopsToSkip - [out] collects nested `unordered` loops
  /// that were not mapped so the `ConversionTarget` can treat them as legal.
  DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice,
                         llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip)
      : OpConversionPattern(context), mapToDevice(mapToDevice),
        concurrentLoopsToSkip(concurrentLoopsToSkip) {}

  mlir::LogicalResult
  matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    if (mapToDevice)
      return doLoop.emitError(
          "not yet implemented: Mapping `do concurrent` loops to device");

    looputils::LoopNestToIndVarMap loopNest;
    bool hasRemainingNestedLoops =
        failed(looputils::collectLoopNest(doLoop, loopNest));
    if (hasRemainingNestedLoops)
      mlir::emitWarning(doLoop.getLoc(),
                        "Some `do concurrent` loops are not perfectly-nested. "
                        "These will be serialized.");

    // Collect the values allocated outside the nest but used only inside it;
    // their allocations are moved into the parallel region further below.
    llvm::SetVector<mlir::Value> locals;
    looputils::collectLoopLocalValues(loopNest.back().first, locals);
    looputils::sinkLoopIVArgs(rewriter, loopNest);

    mlir::IRMapping mapper;
    mlir::omp::ParallelOp parallelOp =
        genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
    mlir::omp::LoopNestOperands loopNestClauseOps;
    genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper,
                         loopNestClauseOps);

    for (mlir::Value local : locals)
      looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
                                        rewriter);

    mlir::omp::LoopNestOp ompLoopNest =
        genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps,
                    /*isComposite=*/mapToDevice);

    rewriter.eraseOp(doLoop);

    // Mark `unordered` loops that are not perfectly nested to be skipped from
    // the legality check of the `ConversionTarget` since we are not interested
    // in mapping them to OpenMP.
    ompLoopNest->walk([&](fir::DoLoopOp doLoop) {
      if (doLoop.getUnordered()) {
        concurrentLoopsToSkip.insert(doLoop);
      }
    });

    return mlir::success();
  }

private:
  /// Creates an empty `omp.parallel` op (entry block + terminator) at the
  /// current insertion point, clones the nest's IV allocations into it, and
  /// leaves the insertion point right before the terminator.
  mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
                                      mlir::ConversionPatternRewriter &rewriter,
                                      looputils::LoopNestToIndVarMap &loopNest,
                                      mlir::IRMapping &mapper) const {
    auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
    rewriter.createBlock(&parallelOp.getRegion());
    rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));

    genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
    return parallelOp;
  }

  /// Clones the iteration-variable alloc/declare ops for every loop in the
  /// nest, recording old-to-new value mappings in \p mapper.
  void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
                               looputils::LoopNestToIndVarMap &loopNest,
                               mlir::IRMapping &mapper) const {

    for (auto &[_, indVarInfo] : loopNest)
      genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper);
  }

  /// Clones \p indVarMemDef together with the ops defining its operands
  /// (e.g. the backing allocation) at the current insertion point.
  ///
  /// \returns the clone of \p indVarMemDef itself (it is cloned last).
  mlir::Operation *
  genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
                            mlir::Operation *indVarMemDef,
                            mlir::IRMapping &mapper) const {
    assert(
        indVarMemDef != nullptr &&
        "Induction variable memdef is expected to have a defining operation.");

    llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc;
    for (auto operand : indVarMemDef->getOperands())
      indVarDeclareAndAlloc.insert(operand.getDefiningOp());
    indVarDeclareAndAlloc.insert(indVarMemDef);

    // Initialize to nullptr so `result` is well-defined in all paths; the set
    // always contains at least `indVarMemDef`, so the loop runs at least once.
    mlir::Operation *result = nullptr;
    for (mlir::Operation *opToClone : indVarDeclareAndAlloc)
      result = rewriter.clone(*opToClone, mapper);

    return result;
  }

  /// Populates \p loopNestClauseOps with the LB/UB/step values of every loop
  /// in \p loopNest (outermost to innermost) and marks the bounds inclusive.
  void genLoopNestClauseOps(
      mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
      looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
      mlir::omp::LoopNestOperands &loopNestClauseOps) const {
    assert(loopNestClauseOps.loopLowerBounds.empty() &&
           "Loop nest bounds were already emitted!");

    auto populateBounds = [](mlir::Value var,
                             llvm::SmallVectorImpl<mlir::Value> &bounds) {
      bounds.push_back(var.getDefiningOp()->getResult(0));
    };

    for (auto &[doLoop, _] : loopNest) {
      populateBounds(doLoop.getLowerBound(), loopNestClauseOps.loopLowerBounds);
      populateBounds(doLoop.getUpperBound(), loopNestClauseOps.loopUpperBounds);
      populateBounds(doLoop.getStep(), loopNestClauseOps.loopSteps);
    }

    loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
  }

  /// Creates an `omp.wsloop` wrapping an `omp.loop_nest`, clones \p doLoop's
  /// body into the loop nest, and replaces the cloned `fir.result` terminator
  /// with an `omp.yield`.
  mlir::omp::LoopNestOp
  genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop,
              mlir::IRMapping &mapper,
              const mlir::omp::LoopNestOperands &clauseOps,
              bool isComposite) const {

    auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(doLoop.getLoc());
    wsloopOp.setComposite(isComposite);
    rewriter.createBlock(&wsloopOp.getRegion());

    auto loopNestOp =
        rewriter.create<mlir::omp::LoopNestOp>(doLoop.getLoc(), clauseOps);

    // Clone the loop's body inside the loop nest construct using the
    // mapped values.
    rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(),
                               loopNestOp.getRegion().begin(), mapper);

    mlir::Operation *terminator = loopNestOp.getRegion().back().getTerminator();
    rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
    rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
    rewriter.eraseOp(terminator);

    return loopNestOp;
  }

  /// If true, loops are mapped to the device (unsupported for now); otherwise,
  /// they are mapped to the host.
  bool mapToDevice;

  /// Nested `unordered` loops exempted from the conversion target's legality
  /// check; shared with the pass driving the conversion.
  llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip;
};
|
|
|
|
class DoConcurrentConversionPass
|
|
: public flangomp::impl::DoConcurrentConversionPassBase<
|
|
DoConcurrentConversionPass> {
|
|
public:
|
|
DoConcurrentConversionPass() = default;
|
|
|
|
DoConcurrentConversionPass(
|
|
const flangomp::DoConcurrentConversionPassOptions &options)
|
|
: DoConcurrentConversionPassBase(options) {}
|
|
|
|
void runOnOperation() override {
|
|
mlir::func::FuncOp func = getOperation();
|
|
|
|
if (func.isDeclaration())
|
|
return;
|
|
|
|
mlir::MLIRContext *context = &getContext();
|
|
|
|
if (mapTo != flangomp::DoConcurrentMappingKind::DCMK_Host &&
|
|
mapTo != flangomp::DoConcurrentMappingKind::DCMK_Device) {
|
|
mlir::emitWarning(mlir::UnknownLoc::get(context),
|
|
"DoConcurrentConversionPass: invalid `map-to` value. "
|
|
"Valid values are: `host` or `device`");
|
|
return;
|
|
}
|
|
|
|
llvm::DenseSet<fir::DoLoopOp> concurrentLoopsToSkip;
|
|
mlir::RewritePatternSet patterns(context);
|
|
patterns.insert<DoConcurrentConversion>(
|
|
context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
|
|
concurrentLoopsToSkip);
|
|
mlir::ConversionTarget target(*context);
|
|
target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
|
|
// The goal is to handle constructs that eventually get lowered to
|
|
// `fir.do_loop` with the `unordered` attribute (e.g. array expressions).
|
|
// Currently, this is only enabled for the `do concurrent` construct since
|
|
// the pass runs early in the pipeline.
|
|
return !op.getUnordered() || concurrentLoopsToSkip.contains(op);
|
|
});
|
|
target.markUnknownOpDynamicallyLegal(
|
|
[](mlir::Operation *) { return true; });
|
|
|
|
if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
|
|
std::move(patterns)))) {
|
|
signalPassFailure();
|
|
}
|
|
}
|
|
};
|
|
} // namespace
|
|
|
|
std::unique_ptr<mlir::Pass>
|
|
flangomp::createDoConcurrentConversionPass(bool mapToDevice) {
|
|
DoConcurrentConversionPassOptions options;
|
|
options.mapTo = mapToDevice ? flangomp::DoConcurrentMappingKind::DCMK_Device
|
|
: flangomp::DoConcurrentMappingKind::DCMK_Host;
|
|
|
|
return std::make_unique<DoConcurrentConversionPass>(options);
|
|
}
|