clang-p2996/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
//===- LoopEmitter.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LoopEmitter.h"
#include "CodegenUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
using namespace mlir;
using namespace mlir::sparse_tensor;
//===----------------------------------------------------------------------===//
// File local helper functions.
//===----------------------------------------------------------------------===//
/// Generates a pointer/index load from the sparse storage scheme. Narrower
/// data types need to be zero extended before casting the value into the
/// index type used for looping and indexing.
static Value genIndexLoad(OpBuilder &builder, Location loc, Value ptr,
Value s) {
// For the scalar case, we simply zero extend narrower indices into 64-bit
// values before casting to index without a performance penalty. Note,
// however, that indices which already are 64-bit cannot, in theory, express
// the full range.
Value load = builder.create<memref::LoadOp>(loc, ptr, s);
if (!load.getType().isa<IndexType>()) {
if (load.getType().getIntOrFloatBitWidth() < 64)
load = builder.create<arith::ExtUIOp>(loc, builder.getI64Type(), load);
load =
builder.create<arith::IndexCastOp>(loc, builder.getIndexType(), load);
}
return load;
}
//===----------------------------------------------------------------------===//
// Sparse tensor loop emitter class implementations
//===----------------------------------------------------------------------===//
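// Computes the linearized address for the given tensor at dimension `dim`:
// the parent dimension's position (or 0 at the outermost dimension) times
// the size of the current dimension, plus the induction value `iv`.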
Value LoopEmitter::genAddress(OpBuilder &builder, Location loc, size_t tid,
size_t dim, Value iv) {
Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1];
Value mul = builder.create<arith::MulIOp>(loc, highs[tid][dim], p);
Value add = builder.create<arith::AddIOp>(loc, mul, iv);
return add;
}
LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput,
bool isSparseOut, ArrayRef<unsigned> topSort) {
initialize(tensors, loopTag, hasOutput, isSparseOut, topSort);
}
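// Sets up all per-tensor bookkeeping (dimension level types; position,
// coordinate, and high arrays; pointer/index/value buffers) and records the
// inverse of the topological loop order used by the sparsifier.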
void LoopEmitter::initialize(ValueRange tensors, StringAttr loopTag,
bool hasOutput, bool isSparseOut,
ArrayRef<unsigned> topSort) {
// First initializes fields.
this->loopTag = loopTag;
this->hasOutput = hasOutput;
this->isSparseOut = isSparseOut;
this->tensors.assign(tensors.begin(), tensors.end());
this->dimTypes.assign(tensors.size(), std::vector<DimLevelType>());
this->pidxs.assign(tensors.size(), std::vector<Value>());
this->coord.assign(tensors.size(), std::vector<Value>());
this->highs.assign(tensors.size(), std::vector<Value>());
this->ptrBuffer.assign(tensors.size(), std::vector<Value>());
this->idxBuffer.assign(tensors.size(), std::vector<Value>());
this->valBuffer.assign(tensors.size(), nullptr);
this->loopStack.reserve(topSort.size());
this->sparsiferLoopLvlMap.assign(topSort.size(), 0);
for (size_t tid = 0, e = tensors.size(); tid < e; tid++) {
auto t = tensors[tid];
// A scalar or zero-ranked tensor; nothing to initialize.
if (isZeroRankedTensorOrScalar(t.getType()))
continue;
auto rtp = t.getType().cast<RankedTensorType>();
auto rank = static_cast<size_t>(rtp.getRank());
auto enc = getSparseTensorEncoding(rtp);
// We always treat a sparse output tensor as dense so that it is always
// iterated based on its dimension sizes.
if (enc && !(isOutputTensor(tid) && isSparseOut))
for (auto dimTp : enc.getDimLevelType())
dimTypes[tid].push_back(dimTp);
else
dimTypes[tid].assign(rank, DimLevelType::Dense);
// Initialize using empty value.
pidxs[tid].assign(rank, Value());
coord[tid].assign(rank, Value());
highs[tid].assign(rank, Value());
ptrBuffer[tid].assign(rank, Value());
idxBuffer[tid].assign(rank, Value());
}
// FIXME: This map should be maintained outside loop emitter.
for (unsigned i = 0, e = topSort.size(); i < e; i++) {
// This is an inverse map of the topologically sorted loop index from the
// sparsifier. It is needed to map an AffineDimExpr back to the loopStack
// index used in the loop emitter.
sparsiferLoopLvlMap[topSort[i]] = i;
}
}
void LoopEmitter::initializeLoopEmit(OpBuilder &builder, Location loc,
LoopEmitter::OutputUpdater updater) {
// For every tensor, find lower and upper bound on dimensions, set the
// same bounds on loop indices, and obtain dense or sparse buffer(s).
for (size_t t = 0, e = tensors.size(); t < e; t++) {
auto tensor = tensors[t];
auto rtp = tensor.getType().dyn_cast<RankedTensorType>();
if (!rtp)
// Skip only scalars; zero-ranked tensors still need to be bufferized and
// (probably) filled with zeros by users.
continue;
auto rank = rtp.getRank();
auto shape = rtp.getShape();
auto enc = getSparseTensorEncoding(rtp);
uint64_t cooStart = enc ? getCOOStart(enc) : rank;
// Scan all dimensions of current tensor.
for (int64_t d = 0; d < rank; d++) {
// This should be called only once at the beginning.
assert(!ptrBuffer[t][d] && !idxBuffer[t][d] && !highs[t][d]);
// Handle sparse storage schemes.
if (isCompressedDLT(dimTypes[t][d])) {
// Generate sparse primitives to obtain pointers and indices.
ptrBuffer[t][d] = genToPointers(builder, loc, tensor, d);
idxBuffer[t][d] = genToIndices(builder, loc, tensor, d, cooStart);
} else if (isSingletonDLT(dimTypes[t][d])) {
// Singleton dimension, fetch indices.
idxBuffer[t][d] = genToIndices(builder, loc, tensor, d, cooStart);
} else {
// Dense dimension, nothing to fetch.
assert(isDenseDLT(dimTypes[t][d]));
}
// Find upper bound in current dimension.
unsigned p = toOrigDim(enc, d);
Value up = mlir::linalg::createOrFoldDimOp(builder, loc, tensor, p);
highs[t][d] = up;
}
// Perform the required bufferization. Dense inputs materialize
// from the input tensors. Sparse inputs use sparse primitives to obtain the
// values.
// Delegates extra output initialization to clients.
bool isOutput = isOutputTensor(t);
Type elementType = rtp.getElementType();
if (!enc) {
// Non-annotated dense tensors.
auto denseTp = MemRefType::get(shape, elementType);
Value denseVal =
builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
// Dense outputs need special handling.
if (isOutput && updater)
denseVal = updater(builder, loc, denseVal, tensor);
valBuffer[t] = denseVal;
} else {
// Annotated sparse tensors.
// We also need the value buffer for an annotated all-dense `sparse` tensor.
auto dynShape = {ShapedType::kDynamic};
auto sparseTp = MemRefType::get(dynShape, elementType);
valBuffer[t] = builder.create<ToValuesOp>(loc, sparseTp, tensor);
}
// NOTE: we could also prepare for dimension 0 here in advance; this would
// hoist some loop preparation out of the tensor iteration, but would also
// (undesirably) hoist the code outside the if conditions.
}
}
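// Starts a new loop sequence: pushes a fresh universal index (starting at 0)
// and prepares every requested (tensor, dimension) pair for iteration.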
void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
ArrayRef<size_t> tids,
ArrayRef<size_t> dims) {
assert(loopSeqStack.size() == loopStack.size());
// The universal index starts from 0.
loopSeqStack.emplace_back(constantIndex(builder, loc, 0));
// Prepares for all the tensors used in the current loop sequence.
for (auto [tid, dim] : llvm::zip(tids, dims))
prepareLoopOverTensorAtDim(builder, loc, tid, dim);
}
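// Lowers an affine expression to arithmetic ops. Dimension ids resolve to
// the induction variables of the enclosing loops via sparsiferLoopLvlMap.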
Value LoopEmitter::genAffine(OpBuilder &builder, AffineExpr a, Location loc) {
switch (a.getKind()) {
case AffineExprKind::DimId: {
unsigned idx = a.cast<AffineDimExpr>().getPosition();
return loopStack[sparsiferLoopLvlMap[idx]].iv;
}
case AffineExprKind::Add: {
auto binOp = a.cast<AffineBinaryOpExpr>();
return builder.create<arith::AddIOp>(
loc, genAffine(builder, binOp.getLHS(), loc),
genAffine(builder, binOp.getRHS(), loc));
}
case AffineExprKind::Mul: {
auto binOp = a.cast<AffineBinaryOpExpr>();
return builder.create<arith::MulIOp>(
loc, genAffine(builder, binOp.getLHS(), loc),
genAffine(builder, binOp.getRHS(), loc));
}
case AffineExprKind::Constant: {
int64_t c = a.cast<AffineConstantExpr>().getValue();
return constantIndex(builder, loc, c);
}
default:
llvm_unreachable("unexpected affine subscript");
}
}
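// Emits an scf.for (or scf.parallel) loop over the given levels. At most one
// of the levels may be sparse; for a sparse level the loop iterates over the
// position range and loads the coordinate, while for dense levels the
// induction variable is the coordinate itself.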
Operation *LoopEmitter::enterLoopOverTensorAtDim(
OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
ArrayRef<size_t> dims, MutableArrayRef<Value> reduc, bool isParallel) {
// TODO: support multiple return on parallel for?
assert(!isParallel || reduc.size() <= 1);
bool isSparseInput = false;
size_t tid = tids.front(), dim = dims.front();
for (auto [t, d] : llvm::zip(tids, dims)) {
assert(dimTypes[t].size() > d); // Must be a valid tid, dim pair
assert(!coord[t][d]); // We cannot re-enter the same level
auto dimType = dimTypes[t][d];
// Must be a recognizable DLT.
assert(isDenseDLT(dimType) || isCompressedDLT(dimType) ||
isSingletonDLT(dimType));
bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType);
// We can have at most one sparse input; otherwise, a while loop is required
// to co-iterate multiple sparse tensors.
assert(!isSparseInput || !isSparse);
if (isSparse) {
tid = t;
dim = d;
}
isSparseInput = isSparseInput || isSparse;
}
Value step = constantIndex(builder, loc, 1);
Value lo = isSparseInput ? pidxs[tid][dim] // current offset
: loopSeqStack.back(); // universal index
Value hi = highs[tid][dim];
Operation *loop = nullptr;
Value iv;
if (isParallel) {
scf::ParallelOp parOp =
builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
builder.setInsertionPointToStart(parOp.getBody());
assert(parOp.getNumReductions() == reduc.size());
iv = parOp.getInductionVars()[0];
// In-place update on the reduction variable vector.
// Note that the init vals are not the actual reduction variables but are
// instead used as `special handles` to (temporarily) represent them. The
// expression on the init vals will be moved into scf.reduce and replaced
// with the block arguments when exiting the loop (see exitForLoop). This is
// needed because we cannot build the actual reduction block and get the
// actual reduction variable before users fill the parallel loop body.
for (int i = 0, e = reduc.size(); i < e; i++)
reduc[i] = parOp.getInitVals()[i];
loop = parOp;
} else {
scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
builder.setInsertionPointToStart(forOp.getBody());
iv = forOp.getInductionVar();
// In-place update on the reduction variable vector.
assert(forOp.getNumRegionIterArgs() == reduc.size());
for (int i = 0, e = reduc.size(); i < e; i++)
reduc[i] = forOp.getRegionIterArg(i);
loop = forOp;
}
assert(loop && iv);
if (isSparseInput) {
pidxs[tid][dim] = iv;
// Generating a load on the indices array yields the coordinate.
Value ptr = idxBuffer[tid][dim];
coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv);
} else {
// Dense tensor, the coordinate is the induction variable.
coord[tid][dim] = iv;
}
// NOTE: we could also prepare for the next dim here in advance.
// Push the loop onto the stack.
loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), loop,
coord[tid][dim], loopTag);
// Emit extra locals.
emitExtraLocalsForTensorsAtDenseDims(builder, loc, tids, dims);
return loop;
}
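// Emits a for loop over a sparse level that only accepts coordinates equal
// to the value of the given affine expression; mismatching iterations are
// skipped through an scf.if around the user code.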
Operation *LoopEmitter::enterFilterLoopOverTensorAtDim(
OpBuilder &builder, Location loc, size_t tid, size_t dim, AffineExpr affine,
MutableArrayRef<Value> reduc) {
assert(!affine.isa<AffineDimExpr>() && !isDenseDLT(dimTypes[tid][dim]));
assert(dimTypes[tid].size() > dim);
// We cannot re-enter the same level.
assert(!coord[tid][dim]);
Value step = constantIndex(builder, loc, 1);
Value lo = pidxs[tid][dim];
Value hi = highs[tid][dim];
// TODO: We should instead use a whileOp for the filter loop to allow an
// early break when exceeding (for ordered dimensions).
// TODO: There are many other potential optimizations that we might apply in
// the future. E.g., we could use binary search to locate the pointer index.
scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
// In-place update on the reduction variable vector.
assert(forOp.getNumRegionIterArgs() == reduc.size());
for (int i = 0, e = reduc.size(); i < e; i++)
reduc[i] = forOp.getRegionIterArg(i);
builder.setInsertionPointToStart(forOp.getBody());
Value iv = forOp.getInductionVar();
pidxs[tid][dim] = iv;
// Generating a load on the indices array yields the coordinate.
Value ptr = idxBuffer[tid][dim];
coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv);
// Generate an if condition to filter out indices that are not equal to the
// result of the affine expression.
Value expected = genAffine(builder, affine, loc);
auto pred = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
coord[tid][dim], expected);
SmallVector<Type> types;
for (Value red : reduc) {
types.push_back(red.getType());
}
bool hasReduc = !types.empty();
scf::IfOp ifOp =
builder.create<scf::IfOp>(loc, types, pred, /*else*/ hasReduc);
if (hasReduc) {
// scf.for (a) -> v
// %s = scf.if (a) -> v
// user-generated code.
// else
// yield a
// yield %s
builder.create<scf::YieldOp>(loc, ifOp.getResults());
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
// On mismatch.
builder.create<scf::YieldOp>(loc, reduc);
}
// Set the insert point to matched branch.
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
// NOTE: we could also prepare for the next dim here in advance.
// Push the loop onto the stack.
loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), forOp,
coord[tid][dim], nullptr);
return forOp;
}
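// Evaluates the affine expression and materializes the corresponding
// linearized position for a dense level at the current loop depth.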
void LoopEmitter::genDenseAffineAddressAtCurLevel(OpBuilder &builder,
Location loc, size_t tid,
size_t dim,
AffineExpr affine) {
Value affineV = genAffine(builder, affine, loc);
pidxs[tid][dim] = genAddress(builder, loc, tid, dim, affineV);
}
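// Emits an scf.while loop that co-iterates over several sparse levels (plus
// an optional universal index). The "before" region tests that every sparse
// position is still within bounds; the "after" region loads the coordinates
// and, unless a universal index is used, selects their minimum as the loop
// induction value.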
Operation *LoopEmitter::enterCoIterationOverTensorsAtDims(
OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
ArrayRef<size_t> dims, bool needsUniv, MutableArrayRef<Value> reduc) {
assert(tids.size() == dims.size());
SmallVector<Type> types;
SmallVector<Value> operands;
// Construct the while-loop with a parameter for each index.
Type indexType = builder.getIndexType();
for (auto [tid, dim] : llvm::zip(tids, dims)) {
if (isCompressedDLT(dimTypes[tid][dim]) ||
isSingletonDLT(dimTypes[tid][dim])) {
assert(pidxs[tid][dim]);
types.push_back(indexType);
operands.push_back(pidxs[tid][dim]);
}
}
// The position where the user-supplied reduction variables start.
for (Value rec : reduc) {
types.push_back(rec.getType());
operands.push_back(rec);
}
if (needsUniv) {
types.push_back(indexType);
// Update universal index.
operands.push_back(loopSeqStack.back());
}
assert(types.size() == operands.size());
scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);
SmallVector<Location> locs(types.size(), loc);
Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);
// Build the "before" region, which effectively consists
// of a conjunction of "i < upper" tests on all induction variables.
builder.setInsertionPointToStart(&whileOp.getBefore().front());
Value cond;
unsigned o = 0;
for (auto [tid, dim] : llvm::zip(tids, dims)) {
if (isCompressedDLT(dimTypes[tid][dim]) ||
isSingletonDLT(dimTypes[tid][dim])) {
Value op1 = before->getArgument(o);
Value op2 = highs[tid][dim];
Value opc = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
op1, op2);
cond = cond ? builder.create<arith::AndIOp>(loc, cond, opc) : opc;
// Update the position with the corresponding "after" block argument.
pidxs[tid][dim] = after->getArgument(o++);
}
}
builder.create<scf::ConditionOp>(loc, cond, before->getArguments());
// Generates while body.
builder.setInsertionPointToStart(&whileOp.getAfter().front());
Value min;
for (auto [tid, dim] : llvm::zip(tids, dims)) {
// Prepares for next level.
if (isCompressedDLT(dimTypes[tid][dim]) ||
isSingletonDLT(dimTypes[tid][dim])) {
Value ptr = idxBuffer[tid][dim];
Value s = pidxs[tid][dim];
Value load = genIndexLoad(builder, loc, ptr, s);
coord[tid][dim] = load;
if (!needsUniv) {
if (min) {
Value cmp = builder.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::ult, load, min);
min = builder.create<arith::SelectOp>(loc, cmp, load, min);
} else {
min = load;
}
}
}
}
if (needsUniv) {
assert(!min);
// Otherwise, universal index is the minimal pidx.
min = after->getArguments().back();
}
// Sets up the loop stack.
loopStack.emplace_back(tids, dims, whileOp, min, loopTag);
assert(loopStack.size() == loopSeqStack.size());
// Emits extra locals
emitExtraLocalsForTensorsAtDenseDims(builder, loc, tids, dims);
// Updates reduction variables
assert(after->getNumArguments() == o + reduc.size() + (needsUniv ? 1 : 0));
// In-place update on reduction variable.
for (unsigned i = 0, e = reduc.size(); i < e; i++)
reduc[i] = after->getArgument(o + i);
return whileOp;
}
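// Prepares the position range for one level: for a compressed level the
// bounds are loaded from the pointers buffer, for a singleton level the
// range is a single position, and dense levels need no preparation.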
void LoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc,
size_t tid, size_t dim) {
assert(dimTypes[tid].size() > dim);
auto dimType = dimTypes[tid][dim];
if (isDenseDLT(dimType))
return;
// Either this is the first dimension, or the previous dimension's position
// has been set.
assert(dim == 0 || pidxs[tid][dim - 1]);
Value c0 = constantIndex(builder, loc, 0);
Value c1 = constantIndex(builder, loc, 1);
if (isCompressedDLT(dimType)) {
Value ptr = ptrBuffer[tid][dim];
Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1];
pidxs[tid][dim] = genIndexLoad(builder, loc, ptr, pLo);
Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
highs[tid][dim] = genIndexLoad(builder, loc, ptr, pHi);
return;
}
if (isSingletonDLT(dimType)) {
Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1];
Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
pidxs[tid][dim] = pLo;
highs[tid][dim] = pHi;
return;
}
llvm_unreachable("Unrecognizable dimension type!");
}
void LoopEmitter::emitExtraLocalsForTensorsAtDenseDims(OpBuilder &builder,
Location loc,
ArrayRef<size_t> tids,
ArrayRef<size_t> dims) {
// Initialize dense positions. Note that we generate dense indices of the
// output tensor unconditionally, since they may not appear in the lattice,
// but may be needed for linearized codegen.
for (auto [tid, dim] : llvm::zip(tids, dims)) {
if (isDenseDLT(dimTypes[tid][dim])) {
auto enc = getSparseTensorEncoding(tensors[tid].getType());
if (enc && !isSparseOutput(tid)) {
bool validPidx = dim == 0 || pidxs[tid][dim - 1];
if (!validPidx) {
// We might not find the pidx for the sparse output tensor, since its dense
// indices are generated unconditionally by sparsification even when they do
// not appear in the lattice.
assert(isOutputTensor(tid));
continue;
}
pidxs[tid][dim] =
genAddress(builder, loc, tid, dim, loopStack.back().iv);
// NOTE: we can also prepare for next dim here in advance
}
}
}
}
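// Finalizes an scf.for or scf.parallel loop: yields the reduction values
// (moving the reduction expression into an scf.reduce block for parallel
// loops), updates the reduction variables in place with the loop results,
// and resets the per-level coordinate/position bookkeeping.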
void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
MutableArrayRef<Value> reduc) {
LoopLevelInfo &loopInfo = loopStack.back();
auto &dims = loopStack.back().dims;
auto &tids = loopStack.back().tids;
auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop);
if (forOp) {
if (!reduc.empty()) {
assert(reduc.size() == forOp.getNumResults());
rewriter.create<scf::YieldOp>(loc, reduc);
}
// Exit the loop.
rewriter.setInsertionPointAfter(forOp);
// In-place update reduction variables.
for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
reduc[i] = forOp.getResult(i);
} else {
auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
if (!reduc.empty()) {
assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
Operation *redExp = reduc.front().getDefiningOp();
// Reduction expression should have no use.
assert(redExp->getUses().empty());
// This must be a binary operation.
// NOTE: It is the user's responsibility to ensure the operation is
// commutative.
assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);
Value redVal = parOp.getInitVals().front();
Value curVal;
if (redExp->getOperand(0) == redVal)
curVal = redExp->getOperand(1);
else if (redExp->getOperand(1) == redVal)
curVal = redExp->getOperand(0);
// One of the operands must be the init value (which is also the
// previous reduction value).
assert(curVal);
// The reduction expression should be the only user of the reduction val
// inside the parallel for.
unsigned numUsers = 0;
for (Operation *op : redVal.getUsers()) {
if (op->getParentOp() == parOp)
numUsers++;
}
assert(numUsers == 1);
(void)numUsers; // to silence unused variable warning in release build
rewriter.setInsertionPointAfter(redExp);
auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
// Attach to the reduction op.
Block *redBlock = &redOp.getRegion().getBlocks().front();
rewriter.setInsertionPointToEnd(redBlock);
Operation *newRed = rewriter.clone(*redExp);
// Replaces arguments of the reduction expression by using the block
// arguments from scf.reduce.
rewriter.updateRootInPlace(
newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
// Erases the out-dated reduction expression.
rewriter.eraseOp(redExp);
rewriter.setInsertionPointToEnd(redBlock);
rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
}
rewriter.setInsertionPointAfter(parOp);
// In-place update reduction variables.
for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
reduc[i] = parOp.getResult(i);
}
// Finished iterating a tensor; clean up.
// We only do the clean-up on for loops, as while loops do not necessarily
// finish iterating over a sparse tensor.
for (auto [tid, dim] : llvm::zip(tids, dims)) {
// Reset to null.
coord[tid][dim] = Value();
pidxs[tid][dim] = Value();
// The high of a dense dimension is fixed; only reset it for sparse ones.
if (!isDenseDLT(dimTypes[tid][dim]))
highs[tid][dim] = Value();
}
}
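// Finalizes an scf.while co-iteration loop: advances the position of every
// level whose coordinate matched the selected induction value, forwards the
// reduction values, and increments the universal index if one is in use.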
void LoopEmitter::exitCoIterationLoop(OpBuilder &builder, Location loc,
MutableArrayRef<Value> reduc) {
auto whileOp = llvm::cast<scf::WhileOp>(loopStack.back().loop);
auto &dims = loopStack.back().dims;
auto &tids = loopStack.back().tids;
Value iv = loopStack.back().iv;
// Generate the while-loop induction at the end.
builder.setInsertionPointToEnd(&whileOp.getAfter().front());
// Finalize the induction. Note that the induction could be performed
// in the individual if-branches to avoid re-evaluating the conditions.
// However, that would result in a rather elaborate forest of yield
// instructions during code generation. Moreover, performing the induction
// after the if-statements more closely resembles code generated by TACO.
unsigned o = 0;
SmallVector<Value> operands;
Value one = constantIndex(builder, loc, 1);
for (auto [tid, dim] : llvm::zip(tids, dims)) {
if (isCompressedDLT(dimTypes[tid][dim]) ||
isSingletonDLT(dimTypes[tid][dim])) {
Value op1 = coord[tid][dim];
Value op3 = pidxs[tid][dim];
Value cmp =
builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, op1, iv);
Value add = builder.create<arith::AddIOp>(loc, op3, one);
operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, op3));
// Following loops continue iteration from the break point of the
// current while loop.
pidxs[tid][dim] = whileOp->getResult(o++);
// The coordinates are invalid now.
coord[tid][dim] = nullptr;
// highs remains unchanged.
}
}
// Reduction value from users.
for (auto &i : reduc) {
operands.push_back(i);
// In place update reduction variable.
i = whileOp->getResult(o++);
}
// An (optional) universal index.
if (operands.size() < whileOp.getNumResults()) {
assert(operands.size() + 1 == whileOp.getNumResults());
// The last one is the universal index.
operands.push_back(builder.create<arith::AddIOp>(loc, iv, one));
// Update the loop starting point of the current loop sequence.
loopSeqStack.back() = whileOp->getResult(o++);
}
assert(o == operands.size());
builder.create<scf::YieldOp>(loc, operands);
builder.setInsertionPointAfter(whileOp);
}
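// Pops the innermost loop off the loop stack, dispatching to the while-loop
// or for-loop exit routine as appropriate.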
void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc,
MutableArrayRef<Value> reduc) {
// Clean up the values; this helps us discover potential bugs at an earlier
// stage (instead of silently using a wrong value).
LoopLevelInfo &loopInfo = loopStack.back();
assert(loopInfo.tids.size() == loopInfo.dims.size());
if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
exitCoIterationLoop(rewriter, loc, reduc);
} else {
exitForLoop(rewriter, loc, reduc);
}
assert(loopStack.size() == loopSeqStack.size());
loopStack.pop_back();
}