The MLIR classes Type/Attribute/Operation/Op/Value support cast/dyn_cast/isa/dyn_cast_or_null functionality through llvm's doCast functionality in addition to defining methods with the same name. This change begins the migration of uses of the method to the corresponding function call, which has been decided on as the more consistent style. Note that there still exist classes that only define methods directly, such as AffineExpr, and this change does not currently include work to support a functional cast/isa call for them.

Caveats include:
- This clang-tidy script probably has more problems.
- This only touches C++ code, so nothing that is being generated.

Context:
- https://mlir.llvm.org/deprecation/ at "Use the free function variants for dyn_cast/cast/isa/…"
- Original discussion at https://discourse.llvm.org/t/preferred-casting-style-going-forward/68443

Implementation: This first patch was created with the following steps. The intention is to only do automated changes at first, so I waste less time if it's reverted, and so the first mass change is a clearer example to other teams that will need to follow similar steps.

Steps are described per line, as comments are removed by git:
0. Retrieve the change from the following to build clang-tidy with an additional check:
   https://github.com/llvm/llvm-project/compare/main...tpopp:llvm-project:tidy-cast-check
1. Build clang-tidy.
2. Run clang-tidy over your entire codebase while disabling all checks and enabling the one relevant one. Run on all header files also.
3. Delete .inc files that were also modified, so the next build rebuilds them to a pure state.
4. Some changes have been deleted for the following reasons:
   - Some files had a variable also named cast.
   - Some files had not included a header file that defines the cast functions.
   - Some files are definitions of the classes that have the casting methods, so the code still refers to the method instead of the function without adding a prefix or removing the method declaration at the same time.

```
ninja -C $BUILD_DIR clang-tidy

run-clang-tidy -clang-tidy-binary=$BUILD_DIR/bin/clang-tidy \
  -checks='-*,misc-cast-functions' \
  -header-filter=mlir/ mlir/* -fix

rm -rf $BUILD_DIR/tools/mlir/**/*.inc

git restore mlir/lib/IR mlir/lib/Dialect/DLTI/DLTI.cpp \
  mlir/lib/Dialect/Complex/IR/ComplexDialect.cpp \
  mlir/lib/**/IR/ \
  mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp \
  mlir/lib/Dialect/Vector/Transforms/LowerVectorMultiReduction.cpp \
  mlir/test/lib/Dialect/Test/TestTypes.cpp \
  mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp \
  mlir/test/lib/Dialect/Test/TestAttributes.cpp \
  mlir/unittests/TableGen/EnumsGenTest.cpp \
  mlir/test/python/lib/PythonTestCAPI.cpp \
  mlir/include/mlir/IR/
```

Differential Revision: https://reviews.llvm.org/D150123
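For illustration, the kind of rewrite this check performs looks like the following (hypothetical snippet, not taken from the patch itself):

```
// Before: member-function style (deprecated for Type/Attribute/Value/...).
auto memref = value.getType().cast<MemRefType>();
if (value.getType().isa<TensorType>()) { /*...*/ }

// After: free-function style.
auto memref = cast<MemRefType>(value.getType());
if (isa<TensorType>(value.getType())) { /*...*/ }
```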
//===- LoopEmitter.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "LoopEmitter.h"

#include "CodegenUtils.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"

using namespace mlir;
using namespace mlir::sparse_tensor;

//===----------------------------------------------------------------------===//
// File local shorthand macros
//===----------------------------------------------------------------------===//

#define CMPI(p, l, r)                                                          \
  (builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::p, (l), (r))       \
       .getResult())

#define C_IDX(v) (constantIndex(builder, loc, (v)))
#define YIELD(vs) (builder.create<scf::YieldOp>(loc, (vs)))
#define ADDI(lhs, rhs) (builder.create<arith::AddIOp>(loc, (lhs), (rhs)))
#define ANDI(lhs, rhs) (builder.create<arith::AndIOp>(loc, (lhs), (rhs)))
#define SUBI(lhs, rhs) (builder.create<arith::SubIOp>(loc, (lhs), (rhs)))
#define MULI(lhs, rhs) (builder.create<arith::MulIOp>(loc, (lhs), (rhs)))
#define SELECT(c, l, r) (builder.create<arith::SelectOp>(loc, (c), (l), (r)))

//===----------------------------------------------------------------------===//
// File local helper functions.
//===----------------------------------------------------------------------===//

static Value genSliceOffset(OpBuilder &builder, Location loc, Value tensor,
                            Level lvl) {
  auto enc = getSparseTensorEncoding(tensor.getType());
  // FIXME: `toOrigDim` is deprecated.
  return createOrFoldSliceOffsetOp(builder, loc, tensor, toOrigDim(enc, lvl));
}

static Value genSliceStride(OpBuilder &builder, Location loc, Value tensor,
                            Level lvl) {
  auto enc = getSparseTensorEncoding(tensor.getType());
  // FIXME: `toOrigDim` is deprecated.
  return createOrFoldSliceStrideOp(builder, loc, tensor, toOrigDim(enc, lvl));
}

/// Converts a coordinate relative to the slice to the coordinate relative
/// to the underlying tensor.
// FIXME: that description says "sliceCrd -> tensorCrd"; but the function
// name suggests it should be "tensorCrd -> sliceCrd".
static Value toSliceCrd(OpBuilder &builder, Location loc, Value crd,
                        Value offset, Value stride, Value tensor, Level lvl) {
  // tensorCrd = sliceCrd * stride + offset
  return ADDI(MULI(crd, stride), offset);
}

/// Generates code to compute the *absolute* offset of the slice based on the
/// provided minimum coordinates in the slice.
/// E.g., when reducing d0 + d1 + d2, we need two slices to fully reduce the
/// expression, i.e., s1 = slice(T, d0), s2 = slice(s1, d1). The *absolute*
/// offset is the offset computed relative to the initial tensor T.
///
/// When isNonEmpty == false, the computed offset is meaningless and should
/// not be used during runtime; the method currently generates code to return
/// 0 in that case.
///
/// offset = isNonEmpty && minCrd >= size ? minCrd - size + 1 : 0;
static Value offsetFromMinCoord(OpBuilder &builder, Location loc, Value minCrd,
                                Value size, Value isNonEmpty) {
  Value geSize = CMPI(uge, minCrd, size);
  Value pred = ANDI(isNonEmpty, geSize);
  // Computes minCrd - size + 1.
  Value mms = SUBI(ADDI(minCrd, C_IDX(1)), size);
  // This is the absolute offset related to the underlying tensor.
  return SELECT(pred, mms, C_IDX(0));
}
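
// For example (illustrative, for offsetFromMinCoord above): with minCrd = 5,
// size = 3, and isNonEmpty = true, the predicate 5 >= 3 holds, so the emitted
// select yields the absolute offset 5 - 3 + 1 = 3; with isNonEmpty = false it
// instead yields the (meaningless) constant 0.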

/// Converts a coordinate relative to the underlying tensor to the coordinate
/// relative to the slice, returning an extra remainder value.
// FIXME: that description says "tensorCrd -> sliceCrd"; but the function
// name suggests it should be "sliceCrd -> tensorCrd".
static std::pair<Value, Value> fromSliceCrd(OpBuilder &builder, Location loc,
                                            Value crd, Value offset,
                                            Value stride, Value tensor,
                                            Level lvl) {
  // sliceCrd = (tensorCrd - offset) / stride
  crd = SUBI(crd, offset);
  Value rem = builder.create<arith::RemUIOp>(loc, crd, stride);
  crd = builder.create<arith::DivUIOp>(loc, crd, stride);
  return std::make_pair(crd, rem);
}
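
// For example (illustrative, for the toSliceCrd/fromSliceCrd pair above):
// with offset = 1 and stride = 2, sliceCrd = 2 maps to tensorCrd
// 2 * 2 + 1 = 5, and mapping tensorCrd = 5 back yields (5 - 1) / 2 = 2 with
// remainder 0; tensorCrd = 6 would instead yield remainder 1, i.e., a
// coordinate that is not part of the slice.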

std::pair<Value, Value>
LoopEmitter::genSliceLegitPredicate(OpBuilder &builder, Location loc, Value crd,
                                    TensorId tid, Level lvl) {
  assert(isSparseSlices[tid]);
  Value slice = tensors[tid];
  Value offset = sliceOffsets[tid][lvl];
  Value stride = sliceStrides[tid][lvl];
  auto enc = getSparseTensorEncoding(slice.getType());

  const auto [newCrd, crdRem] =
      fromSliceCrd(builder, loc, crd, offset, stride, slice, lvl);

  SmallVector<Value, 3> conds; // at most 3 conditions

  // First, coord >= offset (skip the check if offset is known to be 0).
  if (auto staticOffset = enc.getStaticLvlSliceOffset(lvl);
      !(staticOffset.has_value() && *staticOffset == 0)) {
    auto geOffset = CMPI(uge, crd, offset);
    conds.push_back(geOffset);
  }

  // Second, coord_in_slice < length.
  auto ltLength = CMPI(ult, newCrd, lvlSizes[tid][lvl]);
  conds.push_back(ltLength);

  // Third, rem == 0 (skip the check if stride is known to be 1).
  if (auto staticStride = enc.getStaticLvlSliceStride(lvl);
      !(staticStride.has_value() && *staticStride == 1)) {
    auto fitStride = CMPI(eq, crdRem, C_IDX(0));
    conds.push_back(fitStride);
  }

  // Must meet all conditions to be a valid coordinate in the slice.
  auto pred = conds.front();
  for (auto cond : ValueRange(conds).drop_front())
    pred = ANDI(pred, cond);

  return {newCrd, pred};
}
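
// For example (illustrative, for genSliceLegitPredicate above): for a slice
// with offset = 1, stride = 2, and length 4, the tensor coordinate crd = 5
// translates to newCrd = 2 with crdRem = 0, and the emitted predicate is
// (5 >= 1) && (2 < 4) && (0 == 0), i.e., the coordinate is in the slice.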

//===----------------------------------------------------------------------===//
// Sparse tensor loop emitter class implementations
//===----------------------------------------------------------------------===//

Value LoopEmitter::genAddress(OpBuilder &builder, Location loc, TensorId tid,
                              Level lvl, Value crd) {
  Value pos = lvl == 0 ? C_IDX(0) : posits[tid][lvl - 1];
  Value mul = MULI(highs[tid][lvl], pos);
  if (isSparseSlices[tid])
    crd = toSliceCrd(builder, loc, crd, sliceOffsets[tid][lvl],
                     sliceStrides[tid][lvl], tensors[tid], lvl);
  Value add = ADDI(mul, crd);
  return add;
}
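
// For example (illustrative, for genAddress above): on a dense level of size
// highs[tid][lvl] = N, the emitted address is pos * N + crd, i.e., the usual
// row-major linearization where pos is the position of the parent level.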

Value LoopEmitter::genSegmentHigh(OpBuilder &builder, Location loc,
                                  TensorId tid, Level lvl, Value pLo,
                                  Value pHi) {
  const auto coordinates = coordinatesBuffers[tid][lvl];
  const auto sameCrd = genIndexLoad(builder, loc, coordinates, pLo);
  auto whileOp = builder.create<scf::WhileOp>(
      loc, builder.getIndexType(), pLo,
      /*beforeBuilder=*/
      [pHi, coordinates, sameCrd](OpBuilder &builder, Location loc,
                                  ValueRange ivs) {
        const auto pos = ivs[0];
        Value inBound = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::ult, pos, pHi);
        auto ifInBound =
            builder.create<scf::IfOp>(loc, builder.getI1Type(), inBound, true);
        {
          OpBuilder::InsertionGuard guard(builder);
          // Load the next coordinates only when inbound (to avoid OOB
          // accesses).
          builder.setInsertionPointToStart(ifInBound.thenBlock());
          Value crd = genIndexLoad(builder, loc, coordinates, pos);
          Value isSameCrd = builder.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::eq, crd, sameCrd);
          YIELD(isSameCrd);
          // Else, the position is out of bound, yield false to terminate the
          // loop.
          builder.setInsertionPointToStart(ifInBound.elseBlock());
          YIELD(constantI1(builder, loc, false));
        }
        builder.create<scf::ConditionOp>(loc, ifInBound.getResults()[0], ivs);
      },
      /*afterBuilder=*/
      [](OpBuilder &builder, Location loc, ValueRange ivs) {
        // pos ++
        Value nextPos = ADDI(ivs[0], C_IDX(1));
        YIELD(nextPos);
      });
  // Return the segment high.
  return whileOp.getResult(0);
}
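
// For example (illustrative, for genSegmentHigh above): given the coordinate
// buffer [2, 3, 3, 3, 7] with pLo pointing at the first 3 (position 1) and
// pHi = 5, the emitted while loop advances until the coordinate changes and
// returns position 4, i.e., the exclusive upper bound of the segment of 3s.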

Value LoopEmitter::genSparseCrd(OpBuilder &builder, Location loc, TensorId tid,
                                Level dstLvl) {
  Value crd = C_IDX(0);
  const auto reassoc = getCollapseReassociation(tid, dstLvl);
  const unsigned reassocSize = reassoc.size();
  for (unsigned i = 0; i < reassocSize; i++) {
    const Level srcLvl = reassoc[i];
    // A load on the coordinates array yields the coordinate.
    const Value mem = coordinatesBuffers[tid][srcLvl];
    /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
    const Value pos = posits[tid][dstLvl];
    const Value off = genIndexLoad(builder, loc, mem, pos);
    // Linearize the coordinates within the same collapse reassociation.
    crd = ADDI(crd, off);
    if (i != reassocSize - 1) {
      crd = MULI(crd, this->lvlSizes[tid][reassoc[i + 1]]);
    }
  }
  return crd;
}
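
// For example (illustrative, for genSparseCrd above): when two source levels
// are collapsed into dstLvl and the second level has size N, the loop emits
// crd = crd0 * N + crd1, i.e., the coordinates are linearized exactly like a
// row-major index into an N-wide inner dimension.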

LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput,
                         bool isSparseOut, ArrayRef<LoopId> topSort,
                         DependentLvlGetter dimGetter) {
  initialize(tensors, loopTag, hasOutput, isSparseOut, topSort, dimGetter);
}

void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
                             bool isSparseOut, ArrayRef<LoopId> topSort,
                             DependentLvlGetter dimGetter) {
  // First initialize the top-level type of the fields.
  this->loopTag = loopTag;
  this->hasOutput = hasOutput;
  this->isSparseOut = isSparseOut;

  const unsigned numTensors = ts.size();
  this->tensors.assign(ts.begin(), ts.end());
  this->lvlTypes.assign(numTensors, std::vector<DimLevelType>());
  this->lvlSizes.assign(numTensors, std::vector<Value>());
  this->highs.assign(numTensors, std::vector<Value>());
  this->segHi.assign(numTensors, std::vector<Value>());
  this->posits.assign(numTensors, std::vector<Value>());
  this->coords.assign(numTensors, std::vector<Value>());
  this->positionsBuffers.assign(numTensors, std::vector<Value>());
  this->coordinatesBuffers.assign(numTensors, std::vector<Value>());
  this->valBuffer.assign(numTensors, nullptr);
  this->collapseReassoc.assign(numTensors, nullptr);
  this->isSparseSlices.assign(numTensors, false);
  this->sliceOffsets.assign(numTensors, std::vector<Value>());
  this->sliceStrides.assign(numTensors, std::vector<Value>());

  const LoopOrd numLoops = topSort.size();
  // These zeros will be overwritten below, but we need to initialize
  // them to something since we'll need random-access assignment.
  this->loopIdToOrd.assign(numLoops, 0);
  this->loopStack.reserve(numLoops);
  this->loopSeqStack.reserve(numLoops);

  // Index-reduction related fields.
  this->dependentLvlMap.assign(
      numTensors, std::vector<std::vector<std::pair<TensorId, Level>>>());
  this->slicePosBuffer.assign(numTensors, std::vector<std::vector<Value>>());
  this->sliceSizes.assign(numTensors, std::vector<std::vector<Value>>());
  this->sliceStack.assign(numTensors, std::vector<SliceInfo>());
  this->levelReducedDep.assign(numTensors, std::vector<unsigned>());

  // Initialize nested types of `TensorId`-indexed fields.
  for (TensorId tid = 0; tid < numTensors; tid++) {
    const Value t = tensors[tid];
    // A scalar or 0-dimension tensor.
    if (isZeroRankedTensorOrScalar(t.getType()))
      continue;

    auto rtp = getRankedTensorType(t);
    if (auto reshape = t.getDefiningOp<tensor::CollapseShapeOp>();
        isUniqueCOOType(rtp) && reshape) {
      // TODO: Support more kinds of sparse tensors.
      // FIXME: We should instead lower reshape operations on sparse tensors to
      // view change.
      collapseReassoc[tid] = reshape.getReassociation();
      rtp = reshape.getSrcType();
      // Overwrites the tensor to the source tensor of reshape operations.
      tensors[tid] = reshape.getSrc();
    }
    const SparseTensorType stt(rtp);
    const Level lvlRank = stt.getLvlRank();
    // We always treat a sparse output tensor as dense so that we always
    // iterate it based on the lvl size.
    if (stt.hasEncoding() && !(isOutputTensor(tid) && isSparseOut)) {
      const auto enc = stt.getEncoding();
      isSparseSlices[tid] = enc.isSlice();
      for (auto lvlTp : enc.getDimLevelType())
        lvlTypes[tid].push_back(lvlTp);
    } else {
      lvlTypes[tid].assign(lvlRank, DimLevelType::Dense);
    }

    // Initialize using empty value.
    lvlSizes[tid].assign(lvlRank, Value());
    highs[tid].assign(lvlRank, Value());
    segHi[tid].assign(lvlRank, Value());
    posits[tid].assign(lvlRank, Value());
    coords[tid].assign(lvlRank, Value());
    positionsBuffers[tid].assign(lvlRank, Value());
    coordinatesBuffers[tid].assign(lvlRank, Value());
    sliceOffsets[tid].assign(lvlRank, Value());
    sliceStrides[tid].assign(lvlRank, Value());

    // Slice-driven loops related initialization.
    levelReducedDep[tid].assign(lvlRank, 0);
    dependentLvlMap[tid].assign(lvlRank,
                                std::vector<std::pair<TensorId, Level>>());
    slicePosBuffer[tid].assign(lvlRank, std::vector<Value>());
    sliceSizes[tid].assign(lvlRank, std::vector<Value>());
    sliceStack[tid].emplace_back(/*minCrd=*/Value(),
                                 /*offset=*/Value(), /*isNonEmpty*/ Value(),
                                 std::nullopt, 0);
    if (dimGetter) {
      auto reassoc = collapseReassoc[tid];
      Level dstRank = reassoc ? reassoc.size() : lvlRank;
      for (Level l = 0; l < dstRank; l++) {
        dependentLvlMap[tid][l] = dimGetter(tid, l);
        unsigned depends = dependentLvlMap[tid][l].size();
        if (depends == 0)
          continue;
        // TODO: View-based collapse and dependent index reduction are not
        // compatible right now.
        assert(!reassoc);
        // We need `depends - 1` slices to fully reduce the affine expression.
        sliceSizes[tid][l].assign(depends - 1, nullptr);
        slicePosBuffer[tid][l].assign(depends - 1, nullptr);
      }
    }
  }

  // Construct the inverse of the `topSort` from the sparsifier.
  // This is needed to map `AffineDimExpr`s back to the `LoopOrd`
  // used in the loop emitter.
  // FIXME: This map should be maintained outside the loop emitter.
  for (LoopOrd n = 0; n < numLoops; n++)
    loopIdToOrd[topSort[n]] = n;
}

void LoopEmitter::initializeLoopEmit(OpBuilder &builder, Location loc,
                                     LoopEmitter::OutputUpdater updater) {
  // For every tensor:
  // * get the values buffer.
  // * For every level:
  //   * get the positions and coordinates buffers
  //   * get/compute the level-size, which is also used as the upper-bound
  //     on positions.
  for (TensorId t = 0, numTensors = getNumTensors(); t < numTensors; t++) {
    const Value tensor = tensors[t];
    const auto rtp = dyn_cast<RankedTensorType>(tensor.getType());
    if (!rtp)
      // Skips only scalars; zero-ranked tensors still need to be bufferized
      // and (probably) filled with zeros by users.
      continue;
    // FIXME: the definition of `lvlRank` looks more like a dim-rank;
    // but the variable is used as a level everywhere below, which
    // suggests there may be some dim/lvl confusion going on here.
    const Level lvlRank = rtp.getRank();
    const auto shape = rtp.getShape();
    const auto enc = getSparseTensorEncoding(rtp);
    const Level cooStart = enc ? getCOOStart(enc) : lvlRank;
    // Scan all levels of the current tensor.
    for (Level l = 0; l < lvlRank; l++) {
      // This should be called only once at the beginning.
      assert(!positionsBuffers[t][l] && !coordinatesBuffers[t][l] &&
             !highs[t][l]);
      const auto lvlTp = lvlTypes[t][l];
      // Handle sparse storage schemes.
      if (isCompressedDLT(lvlTp) || isCompressedWithHiDLT(lvlTp)) {
        // Generate sparse primitives to obtain positions and coordinates.
        positionsBuffers[t][l] = genToPositions(builder, loc, tensor, l);
        coordinatesBuffers[t][l] =
            genToCoordinates(builder, loc, tensor, l, cooStart);
      } else if (isSingletonDLT(lvlTp)) {
        // Singleton level, fetch coordinates.
        coordinatesBuffers[t][l] =
            genToCoordinates(builder, loc, tensor, l, cooStart);
      } else {
        // Dense level, nothing to fetch.
        assert(isDenseDLT(lvlTp));
      }

      // FIXME: `toOrigDim` is deprecated. For now this relies on the
      // 1:1 mapping between levels and dimensions, since nowhere else
      // in the code supports HigherOrdering yet either.
      Value lvlSz = mlir::linalg::createOrFoldDimOp(builder, loc, tensor,
                                                    toOrigDim(enc, l));
      // Find the upper bound in the current dimension.
      highs[t][l] = lvlSizes[t][l] = lvlSz;
      if (isSparseSlices[t]) {
        sliceOffsets[t][l] = genSliceOffset(builder, loc, tensors[t], l);
        sliceStrides[t][l] = genSliceStride(builder, loc, tensors[t], l);
      }
    }

    // Perform the required bufferization. Dense inputs materialize from the
    // input tensors. Sparse inputs use sparse primitives to obtain the
    // values.
    // Delegates extra output initialization to clients.
    bool isOutput = isOutputTensor(t);
    Type elementType = rtp.getElementType();
    if (!enc) {
      // Non-annotated dense tensors.
      BaseMemRefType denseTp = MemRefType::get(shape, elementType);

      // TODO: if we unconditionally use a fully dynamic layout here, it
      // breaks some vectorization passes which require static stride = 1.
      // Is it possible to call the vectorization pass after bufferization?
      if (llvm::isa_and_nonnull<tensor::ExtractSliceOp>(tensor.getDefiningOp()))
        denseTp = bufferization::getMemRefTypeWithFullyDynamicLayout(rtp);

      Value denseVal =
          builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
      // Dense outputs need special handling.
      if (isOutput && updater)
        denseVal = updater(builder, loc, denseVal, tensor);

      valBuffer[t] = denseVal;
    } else {
      // Annotated sparse tensors.
      // We also need the value buffer for all-dense annotated "sparse"
      // tensors.
      valBuffer[t] = genToValues(builder, loc, tensor);
    }
    // NOTE: we can also prepare for 0 lvl here in advance. This will hoist
    // some loop preparation from tensor iteration, but will also
    // (undesirably) hoist the code outside if-conditions.
  }

  Type indexType = builder.getIndexType();
  Value c0 = constantZero(builder, loc, indexType);
  for (TensorId t = 0, e = tensors.size(); t < e; t++) {
    auto rtp = dyn_cast<RankedTensorType>(tensors[t].getType());
    if (!rtp)
      continue;

    Level lvlRank = SparseTensorType(rtp).getLvlRank();
    for (Level lvl = 0; lvl < lvlRank; lvl++) {
      if (!dependentLvlMap[t][lvl].empty()) {
        ArrayRef<std::pair<TensorId, Level>> depLvls = dependentLvlMap[t][lvl];
        // Needs at least two operands to form a non-trivial affine expression.
        assert(depLvls.size() > 1);

        Value size = c0;
        for (unsigned e = depLvls.size() - 1; e >= 1; e--) {
          auto [dt, dd] = depLvls[e];
          size = ADDI(size, lvlSizes[dt][dd]);
          sliceSizes[t][lvl][e - 1] = size;
        }
      }
    }
  }
  localInsertPos = builder.getInsertionPoint()->getPrevNode();
}

void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
                                  ArrayRef<TensorLevel> tidLvls) {
  // TODO: sort
  assert(loopSeqStack.size() == loopStack.size());
  // Prepares for all the tensors used in the current loop sequence.
  std::vector<std::tuple<TensorId, Level, bool>> slicedTids;
  for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
    if (!dependentLvlMap[tid][lvl].empty()) {
      bool fullyRed = genSliceBegin(builder, loc, tid, lvl);
      slicedTids.emplace_back(tid, lvl, fullyRed);
    } else {
      prepareLoopOverTensorAtLvl(builder, loc, tid, lvl);
    }
  }

  // Universal Index starts from 0.
  loopSeqStack.emplace_back(C_IDX(0), std::move(slicedTids));
}

void LoopEmitter::exitCurrentLoopSeq(OpBuilder &builder, Location loc) {
  assert(loopSeqStack.size() == loopStack.size() + 1);

  const auto &slicedTids = loopSeqStack.back().second;

  // Depending on whether the slice is resolved or not at the current loop
  // sequence, end them in different ways.
  for (auto [tid, lvl, res] : slicedTids) {
    if (!res) {
      // If this is an unresolved-slice-driven loop, pops out the slice.
      assert(sliceStack[tid].back().slicedOnLvl == lvl);
      sliceStack[tid].pop_back();
    } else {
      if (!isDenseDLT(lvlTypes[tid][lvl])) {
        // Else this is a resolved-slice, and advance posit similar to TACO.
        Value c1 = C_IDX(1), c2 = C_IDX(2);
        // pIdx += 2: we finished the current lvl, advance the pointer index of
        // the previous level by two to skip the [pLo, pHi] for current level.
        Value sPtrBuf = slicePosBuffer[tid][lvl].back();
        Value curP = genIndexLoad(builder, loc, sPtrBuf, c1);
        // TODO: we could probably use an SSA value for it.
        Value nexP = ADDI(curP, c2);
        builder.create<memref::StoreOp>(loc, nexP, sPtrBuf, c1);
      }
    }
  }
  loopSeqStack.pop_back();
}

Value LoopEmitter::genAffine(OpBuilder &builder, Location loc, AffineExpr a) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    // FIXME: since the one callsite in Sparsification passes in a
    // level-expression, the `getPosition` must in fact be a `Dimension`.
    // However, elsewhere we have been led to expect that `loopIdToOrd`
    // should be indexed by `LoopId`...
    const auto loopId = a.cast<AffineDimExpr>().getPosition();
    assert(loopId < loopIdToOrd.size());
    return loopStack[loopIdToOrd[loopId]].iv;
  }
  case AffineExprKind::Add: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return ADDI(genAffine(builder, loc, binOp.getLHS()),
                genAffine(builder, loc, binOp.getRHS()));
  }
  case AffineExprKind::Mul: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return MULI(genAffine(builder, loc, binOp.getLHS()),
                genAffine(builder, loc, binOp.getRHS()));
  }
  case AffineExprKind::Constant: {
    int64_t c = a.cast<AffineConstantExpr>().getValue();
    return C_IDX(c);
  }
  default:
    llvm_unreachable("unexpected affine subscript");
  }
}
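
// For example (illustrative, for genAffine above): the affine expression
// d0 * 2 + d1 is lowered recursively into
//   %0 = arith.muli %iv_d0, %c2 : index
//   %1 = arith.addi %0, %iv_d1 : index
// where each dim expression resolves to the induction variable of the
// corresponding loop on the loop stack.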

Operation *LoopEmitter::emitForLoopOverTensorAtLvl(
    OpBuilder &builder, Location loc, TensorId tid, Level dstLvl, Value lo,
    Value hi, MutableArrayRef<Value> reduc, bool isParallel) {
  bool isSparseCond = isCompressedDLT(lvlTypes[tid][dstLvl]) ||
                      isCompressedWithHiDLT(lvlTypes[tid][dstLvl]) ||
                      isSingletonDLT(lvlTypes[tid][dstLvl]);

  const auto reassoc = getCollapseReassociation(tid, dstLvl);
  // TODO: support dynamic slices.
  // Uses the first dimension here to build the loop bound (which is also the
  // biggest range).
  const Level srcLvl = reassoc.front();
  Value step = C_IDX(1);

  Operation *loop = nullptr;
  Value iv;
  if (isParallel) {
    assert(collapseReassoc[tid] == nullptr);
    scf::ParallelOp parOp =
        builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(parOp.getBody());
    assert(parOp.getNumReductions() == reduc.size());
    iv = parOp.getInductionVars()[0];

    // In-place update on the reduction variable vector.
    // Note that the init vals are not the actual reduction variables but are
    // instead used as a "special handle" to (temporarily) represent them. The
    // expression on init vals will be moved into scf.reduce and replaced with
    // the block arguments when exiting the loop (see exitForLoop). This is
    // needed as we can not build the actual reduction block and get the
    // actual reduction variable before users fill the parallel loop body.
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = parOp.getInitVals()[i];
    loop = parOp;
  } else {
    scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(forOp.getBody());
    iv = forOp.getInductionVar();

    // In-place update on the reduction variable vector.
    assert(forOp.getNumRegionIterArgs() == reduc.size());
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = forOp.getRegionIterArg(i);
    loop = forOp;
  }
  assert(loop && iv);

  Value crd;
  if (isSparseCond) {
    assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
    // For COO, the position is the same across consecutive levels.
    /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
    llvm::for_each(reassoc,
                   [this, tid, iv](Level srcLvl) { posits[tid][srcLvl] = iv; });
    crd = genSparseCrd(builder, loc, tid, dstLvl);
  } else {
    // Dense tensor, the coordinate is the induction variable.
    crd = iv;
  }

  if (isSparseSlices[tid] && isSparseCond) {
    // For sparse level slices, we need to filter out invalid coordinates that
    // are not included in the slice.
    SmallVector<Type> types;
    for (Value red : reduc)
      types.push_back(red.getType());

    auto [trans, pred] = genSliceLegitPredicate(builder, loc, crd, tid, srcLvl);
    bool hasReduc = !types.empty();
    scf::IfOp ifOp = builder.create<scf::IfOp>(loc, types, pred,
                                               /*else*/ hasReduc);
    if (hasReduc) {
      // scf.for (a) -> v
      //  %s = scf.if (a) -> v
      //    user-generated code.
      //  else
      //    yield a
      //  yield %s
      YIELD(ifOp.getResults());
      builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
      // On mismatch.
      YIELD(reduc);
    }
    // Set the insertion point to the matched branch.
    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
    crd = trans;
  }

  assert(crd);
  coords[tid][dstLvl] = crd;
  return loop;
}

Operation *LoopEmitter::emitWhileLoopOverSliceAtSparseLvl(
    OpBuilder &builder, Location loc, Value pLo, Value pHi, Value offset,
    Value sliceSize, TensorId tid, Level lvl, MutableArrayRef<Value> reduc) {
  // TODO: we should generalize the method to support iteration over normal
  // slices as well, to allow early breaks.
  Operation *insertPoint = nullptr;
  Operation *loop =
      genSliceLvlTraverseLoop(
          builder, loc, pLo, pHi, offset, sliceSize, tid, lvl, reduc,
          /*genYield=*/false, // unaware of the yield values from user yet
          [this, tid, lvl, reduc, offset,
           &insertPoint](OpBuilder &builder, Location loc, Value iv,
                         MutableArrayRef<Value> innerReduc) {
            assert(innerReduc.size() == reduc.size());
            // Updates users' reduction variables inplace.
            for (unsigned i = 0, e = reduc.size(); i < e; i++)
              reduc[i] = innerReduc[i];
            // Loads the coordinates.
            Value absC =
                genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], iv);

            // We need to subtract the offset to get relative coordinates.
            // TODO: how to assert relC >= 0 during runtime?
            insertPoint = builder.create<arith::SubIOp>(loc, absC, offset);
            posits[tid][lvl] = iv;
            coords[tid][lvl] = insertPoint->getResult(0);
          })
          .first;
  // Sets the insertion point inside the loop body.
  builder.setInsertionPointAfter(insertPoint);
  return loop;
}

Operation *LoopEmitter::enterLoopOverTensorAtLvl(OpBuilder &builder,
                                                 Location loc,
                                                 ArrayRef<TensorLevel> tidLvls,
                                                 MutableArrayRef<Value> reduc,
                                                 bool isParallel) {
  // TODO: support multiple return on parallel for?
  assert(!isParallel || reduc.size() <= 1);
  bool isSparseCond = false, isSparseSliceCond = false;
  auto [tid, lvl] = unpackTensorLevel(tidLvls.front());

  // Finds out the tensor level that we should use to generate loops. Among
  // all the tensor levels, there is at most one sparse tensor level.
  for (auto [t, l] : unpackTensorLevelRange(tidLvls)) {
    assert(lvlTypes[t].size() > l);         // Must be a valid tid, dim pair
    assert(!coords[t][l] ||                 // We cannot re-enter the same level
           !dependentLvlMap[t][l].empty()); // unless it is a slice-driven loop
    auto lvlType = lvlTypes[t][l];
    // Must be a recognizable DLT.
    assert(isDenseDLT(lvlType) || isCompressedDLT(lvlType) ||
           isCompressedWithHiDLT(lvlType) || isSingletonDLT(lvlType));

    // This is a slice-driven loop on a sparse level.
    if (!dependentLvlMap[t][l].empty() && !isDenseDLT(lvlType)) {
      assert(!isSparseSliceCond && !isSparseCond);
      isSparseSliceCond = true;
      tid = t;
      lvl = l;
      continue;
    }

    bool isSparse = isCompressedDLT(lvlType) || isSingletonDLT(lvlType) ||
                    isCompressedWithHiDLT(lvlType);
    // We can at most have one sparse input; otherwise a while loop is
    // required to co-iterate multiple sparse tensors.
    assert(!isSparseCond || !isSparse);
    assert(!isSparseSliceCond || !isSparseCond);
    if (isSparse) {
      tid = t;
      lvl = l;
    }
    isSparseCond = isSparseCond || isSparse;
  }

  DimLevelType lvlType = lvlTypes[tid][lvl];
  // TODO: A dense slice-driven loop can be generated using a for loop as well.
  assert(!isSparseSliceCond || !isDenseDLT(lvlType));
  bool isDenseSliceCond =
      isDenseDLT(lvlType) && !dependentLvlMap[tid][lvl].empty();
  // If the slice is fully reduced, we can now use the TACO-based algorithm to
  // iterate it.

  Operation *l = nullptr;

  // At most one tensor used as condition in the for loop.
  SmallVector<TensorLevel, 1> condTidLvl;
  // There might be multiple dense slice-driven tensors.
  SmallVector<SliceLoopInfo> sliceDrivenInfo;

  // Generates loops differently depending on whether we need a slice-driven
  // loop or a simple level traversal loop.
  if (isSparseSliceCond) {
    bool fullyReduced = depFullyReduced(tid, lvl);
    if (!fullyReduced) {
      l = emitSliceDrivenLoopOverTensorAtLvl(builder, loc, tid, lvl, reduc);
    } else {
      // If the slice is fully reduced, we can now use the TACO-based
      // algorithm to iterate it.
      l = emitWhileLoopOverSliceAtSparseLvl(
          builder, loc, posits[tid][lvl], highs[tid][lvl],
          getFinalSliceOnLvl(tid, lvl).offset, sliceSizes[tid][lvl].back(), tid,
          lvl, reduc);
    }
    levelReducedDep[tid][lvl]++;
    sliceDrivenInfo.emplace_back(tid, lvl, fullyReduced);
  } else {
    Value lo = isSparseCond ? posits[tid][lvl]           // current offset
                            : loopSeqStack.back().first; // universal index
    Value hi = highs[tid][lvl];
    if (isDenseSliceCond) {
      bool fullyReduced = depFullyReduced(tid, lvl);
      Value sliceSz = sliceSizes[tid][lvl][sliceStack[tid].back().depth - 1];
      // Adjust the for-loop upper bound (hi) for the dense slice-driven loop.
      if (fullyReduced) {
        hi = sliceSz;
        condTidLvl.push_back(makeTensorLevel(tid, lvl));
      } else {
        hi = SUBI(lvlSizes[tid][lvl], sliceSz);
        hi = ADDI(hi, C_IDX(1));
      }
    } else {
      condTidLvl.push_back(makeTensorLevel(tid, lvl));
    }
    l = emitForLoopOverTensorAtLvl(builder, loc, tid, lvl, lo, hi, reduc,
                                   isParallel);
  }
  Value iv = coords[tid][lvl];
  for (auto [t, l] : unpackTensorLevelRange(tidLvls)) {
    // We only need to handle slice-driven loops on dense levels here.
    // If it is a slice-driven loop on a sparse level, it needs a while loop
    // to insert break statements, and it must have been handled correctly in
    // L692.
    if (!dependentLvlMap[t][l].empty() && isDenseDLT(lvlTypes[t][l])) {
      // Pushes sliced levels to build correct LoopInfo.
      bool fullyReduc = depFullyReduced(t, l);
      SliceInfo &info = sliceStack[t].back();
      if (fullyReduc) {
        posits[t][l] = genAddress(builder, loc, t, l, ADDI(info.offset, iv));
      } else {
        // Puts the sliced dense loop into LoopInfo so that LoopEmitter knows
        // how to exit it.
        sliceDrivenInfo.emplace_back(t, l, fullyReduc);
        // Update the slice information as we enter the new loop.
        assert(*info.slicedOnLvl == l);
        info.minCrd = info.offset = iv;
        info.isNonEmpty = constantI1(builder, loc, true);
        levelReducedDep[t][l]++;
      }
    }
  }
  // NOTE: we can also prepare for the next dim here in advance.
  // Pushes the loop into the stack.
  loopStack.emplace_back(condTidLvl, sliceDrivenInfo, l,
                         builder.getInsertionBlock(), iv, loopTag);
  // Emit extra locals.
  emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tidLvls);
  return l;
}

Operation *LoopEmitter::enterFilterLoopOverTensorAtLvl(
    OpBuilder &builder, Location loc, TensorId tid, Level lvl,
    AffineExpr affine, MutableArrayRef<Value> reduc) {
  assert(isValidLevel(tid, lvl));
  assert(!affine.isa<AffineDimExpr>() && !isDenseDLT(lvlTypes[tid][lvl]));
  // We can not re-enter the same level.
  assert(!coords[tid][lvl]);

  // TODO: We should instead use a whileOp for the filter loop to allow early
  // break when exceeding (for ordered levels).
  // TODO: There are many other potential opportunities that we might apply in
  // the future. E.g., we could use binary search to locate positions.
  const Value step = C_IDX(1);
  const Value pLo = posits[tid][lvl];
  const Value pHi = highs[tid][lvl];
  scf::ForOp forOp = builder.create<scf::ForOp>(loc, pLo, pHi, step, reduc);

  // In-place update on the reduction variable vector.
  assert(forOp.getNumRegionIterArgs() == reduc.size());
  for (int i = 0, e = reduc.size(); i < e; i++)
    reduc[i] = forOp.getRegionIterArg(i);

  builder.setInsertionPointToStart(forOp.getBody());
  // The induction variable gives the position.
  const Value pos = forOp.getInductionVar();
  posits[tid][lvl] = pos;
  // Generating a load on the coordinates array yields the crd.
  const Value mem = coordinatesBuffers[tid][lvl];
  const Value crd = genIndexLoad(builder, loc, mem, pos);
  coords[tid][lvl] = crd;

  // Generate an if-condition to filter out coordinates that are not
  // equal to the result of the affine expression.
  Value expected = genAffine(builder, loc, affine);
  auto pred = CMPI(eq, crd, expected);
  SmallVector<Type> types;
  for (Value red : reduc) {
    types.push_back(red.getType());
  }

  bool hasReduc = !types.empty();
  scf::IfOp ifOp =
      builder.create<scf::IfOp>(loc, types, pred, /*else*/ hasReduc);
  if (hasReduc) {
    // scf.for (a) -> v
    //  %s = scf.if (a) -> v
    //    user-generated code.
    //  else
    //    yield a
    //  yield %s
    YIELD(ifOp.getResults());
    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
    // On mismatch.
    YIELD(reduc);
  }
  // Set the insertion point to the matched branch.
  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());

  // NOTE: we can also prepare for the next lvl here in advance.
  // Push the loop into the stack.
  loopStack.emplace_back(ArrayRef<TensorLevel>(makeTensorLevel(tid, lvl)),
                         ArrayRef<SliceLoopInfo>(), forOp,
                         builder.getInsertionBlock(), coords[tid][lvl],
                         nullptr);
  return forOp;
}

void LoopEmitter::genDenseAffineAddress(OpBuilder &builder, Location loc,
                                        TensorLevel tidLvl,
                                        AffineExpr lvlExpr) {
  auto [tid, lvl] = unpackTensorLevel(tidLvl);
  assert(isDenseDLT(lvlTypes[tid][lvl]));
  // For dense levels, the level-coordinate also serves as the position.
  Value lvlCrd = genAffine(builder, loc, lvlExpr);
  posits[tid][lvl] = genAddress(builder, loc, tid, lvl, lvlCrd);
}

Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls(
    OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls,
    bool needsUniv, MutableArrayRef<Value> reduc) {
  // NOTE: the slice-driven tensor-related reduction variables must
  // appear before normal tensors.
  SmallVector<Type> types;
  SmallVector<Value> operands;
  // Construct the while-loop with a parameter for each coordinate.
  const Type indexType = builder.getIndexType();
  for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
    // TODO: support coiteration with slice-driven tensors.
    const auto lvlTp = lvlTypes[tid][lvl];
    assert(dependentLvlMap[tid][lvl].empty() && "TODO: not yet implemented");
    if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
        isCompressedWithHiDLT(lvlTp)) {
      const auto reassoc = getCollapseReassociation(tid, lvl);
      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
        if (!isUniqueDLT(lvlTypes[tid][reassoc[i]])) {
          // This is the segment high for each non-unique level.
          types.push_back(indexType);
          operands.push_back(C_IDX(0));
        }
      }
      const auto pos = posits[tid][reassoc.front()];
      assert(pos);
      types.push_back(indexType);
      operands.push_back(pos);
    }
  }
  // The position where the user-supplied reduction variables start.
  for (Value rec : reduc) {
    types.push_back(rec.getType());
    operands.push_back(rec);
  }
  if (needsUniv) {
    types.push_back(indexType);
    // Update universal index.
    operands.push_back(loopSeqStack.back().first);
  }
  assert(types.size() == operands.size());
  scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);

  SmallVector<Location> locs(types.size(), loc);
  Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
  Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);

  // Build the "before" region, which effectively consists
  // of a conjunction of "i < upper" tests on all inductions.
  builder.setInsertionPointToStart(&whileOp.getBefore().front());
  Value cond;
  unsigned o = 0;
  for (auto [t, lvl] : unpackTensorLevelRange(tidLvls)) {
    const TensorId tid = t; // Why `t` can not be captured by lambda?
    const auto lvlTp = lvlTypes[tid][lvl];
    if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
        isCompressedWithHiDLT(lvlTp)) {
      const auto reassoc = getCollapseReassociation(tid, lvl);
      assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
        if (!isUniqueDLT(lvlTypes[tid][reassoc[i]])) {
          // Links the SSA chain for segHi.
          segHi[tid][reassoc[i]] = after->getArgument(o++);
        }
      }
      Value op1 = before->getArgument(o);
      // We use the first level bound as the bound for the collapsed set of
      // levels.
      Value op2 = highs[tid][reassoc.front()];
      Value opc = CMPI(ult, op1, op2);
      cond = cond ? ANDI(cond, opc) : opc;
      // Update positions.
      Value pos = after->getArgument(o++);
      // For COO, the position is the same across consecutive levels.
      /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
      llvm::for_each(reassoc, [this, tid, pos](Level srcLvl) {
        posits[tid][srcLvl] = pos;
      });
    }
  }
  builder.create<scf::ConditionOp>(loc, cond, before->getArguments());

  // Generates the while body.
  builder.setInsertionPointToStart(&whileOp.getAfter().front());

  SmallVector<std::pair<Value, unsigned>> slicesPreds;
  unsigned i = 0;
  for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
    // Prepares for the next level.
    const auto lvlTp = lvlTypes[tid][lvl];
    if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
        isCompressedWithHiDLT(lvlTp)) {
      coords[tid][lvl] = genSparseCrd(builder, loc, tid, lvl);
      if (isSparseSlices[tid]) {
        auto [trans, pred] =
            genSliceLegitPredicate(builder, loc, coords[tid][lvl], tid, lvl);
        slicesPreds.emplace_back(pred, i);
        // Update to the coordinate relative to the slice.
        coords[tid][lvl] = trans;
      }
      i++;
    }
  }

  if (!slicesPreds.empty()) {
    // Skips invalid loop iterations when the slice coordinate is
    // inapplicable.
    SmallVector<Value> yields(after->getArguments());
    // Generates a list of if statements
    //   pos = in_slice ? pos : pos + 1
    // TODO: instead of always picking pos + 1, we should set pos = high to
    // break the loop if the coordinates are larger than the slice size.
    //
    // This "idx" is the index into `llvm::zip(tids, lvls)`.
    for (auto [pred, idx] : slicesPreds) {
      Value nextPos = ADDI(yields[idx], C_IDX(1));
      yields[idx] = SELECT(pred, yields[idx], nextPos);
    }

    Value pred = slicesPreds.front().first;
    for (int i = 1, e = slicesPreds.size(); i < e; i++) {
      pred = ANDI(pred, slicesPreds[i].first);
    }
    auto ifOp = builder.create<scf::IfOp>(loc, types, pred, /*else*/ true);
    ifOp->setAttr(getLoopEmitterLoopAttrName(),
                  StringAttr::get(builder.getContext(), "slice"));
    YIELD(ifOp->getResults());
    assert(types.size() == yields.size());
    // If not all slices are legit.
    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
    YIELD(yields);

    // If all slices are legit, start the user-generated code.
    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
  }

  Value min;
  // Finds the minimum coordinate.
  if (!needsUniv) {
    for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
      const auto lvlTp = lvlTypes[tid][lvl];
      if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
          isCompressedWithHiDLT(lvlTp)) {
        const auto crd = coords[tid][lvl];
        if (min) {
          Value cmp = CMPI(ult, coords[tid][lvl], min);
          min = SELECT(cmp, coords[tid][lvl], min);
        } else {
          min = crd;
        }
      }
    }
  } else {
    assert(!min);
    // Otherwise, the universal index is the minimal pos.
    min = after->getArguments().back();
  }

  // Sets up the loop stack.
  loopStack.emplace_back(tidLvls, ArrayRef<SliceLoopInfo>(), whileOp,
                         builder.getInsertionBlock(), min, loopTag);
  assert(loopStack.size() == loopSeqStack.size());

  for (auto [tid, dstLvl] : unpackTensorLevelRange(tidLvls)) {
    const auto reassoc = getCollapseReassociation(tid, dstLvl);
    assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
    // TODO: Refactor this into smaller functions.
    // NOTE: For all the collapsed levels (except for the last one, which is
    // why the loop ends with `reassoc.size() - 1`), each iteration is
    // advanced by the segment size of the last level, which does not always
    // invalidate the segment size for the previous levels; thus we need to
    // propagate the segment sizes across loop iterations and only forward
    // them if needed.
    //
    // E.g., for a COO tensor with the following coordinates array:
    //   (0, 0, 1),
    //   (0, 0, 2),
    //   (1, 1, 1),
    //   segHi[lvl=0] = segHi[lvl=1] = 2
    //   segHi[lvl=2] = 1,
    // the first iteration does not invalidate segHi[0] and segHi[1].
    for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
      const Level srcLvl = reassoc[i];
      if (!isUniqueDLT(lvlTypes[tid][srcLvl])) {
        const Value pos = posits[tid][srcLvl];
        const auto oldSegHi = segHi[tid][srcLvl];
        assert(oldSegHi);
        Value newSegHi = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::uge, pos, oldSegHi);
        auto ifNewSegHi = builder.create<scf::IfOp>(loc, builder.getIndexType(),
                                                    newSegHi, true);
        {
          OpBuilder::InsertionGuard guard(builder);
          builder.setInsertionPointToStart(ifNewSegHi.thenBlock());
          YIELD(genSegmentHigh(builder, loc, tid, srcLvl, pos,
                               highs[tid][srcLvl]));
          // Else, reuses the same segment high.
          builder.setInsertionPointToStart(ifNewSegHi.elseBlock());
          YIELD(oldSegHi);
        }
        highs[tid][srcLvl + 1] = segHi[tid][srcLvl] = ifNewSegHi.getResult(0);
      }
    }
    const auto srcLvl = reassoc.back();
    if (!isUniqueDLT(lvlTypes[tid][srcLvl])) {
      segHi[tid][srcLvl] = genSegmentHigh(
          builder, loc, tid, srcLvl, posits[tid][srcLvl], highs[tid][srcLvl]);
    }
  }

  // Emits extra locals.
  emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tidLvls);

  // Updates reduction variables.
  assert(after->getNumArguments() == o + reduc.size() + (needsUniv ? 1 : 0));
  // In-place update on reduction variables.
  for (unsigned i = 0, e = reduc.size(); i < e; i++)
    reduc[i] = after->getArgument(o + i);

  return whileOp;
}

void LoopEmitter::prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
                                             TensorId tid, Level dstLvl) {
  assert(isValidLevel(tid, dstLvl));
  const auto lvlTp = lvlTypes[tid][dstLvl];

  if (isDenseDLT(lvlTp))
    return;

  const Value c0 = C_IDX(0);
  const Value c1 = C_IDX(1);
  for (const Level srcLvl : getCollapseReassociation(tid, dstLvl)) {
    // Either the first level, or the previous level has been set.
    /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
    assert(srcLvl == 0 || posits[tid][srcLvl - 1]);
    if (isDenseDLT(lvlTp))
      continue;
    if (isCompressedDLT(lvlTp) || isCompressedWithHiDLT(lvlTp)) {
      const Value mem = positionsBuffers[tid][srcLvl];

      Value pLo = srcLvl == 0 ? c0 : posits[tid][srcLvl - 1];
      if (isCompressedWithHiDLT(lvlTp))
        pLo = builder.create<arith::MulIOp>(loc, pLo, C_IDX(2));
      posits[tid][srcLvl] = genIndexLoad(builder, loc, mem, pLo);

      const Value pHi = ADDI(pLo, c1);
      highs[tid][srcLvl] = genIndexLoad(builder, loc, mem, pHi);
      return;
    }
    if (isSingletonDLT(lvlTp)) {
      const Value pLo = srcLvl == 0 ? c0 : posits[tid][srcLvl - 1];
      posits[tid][srcLvl] = pLo;

      // If we are coiterating non-unique levels, then use pHi=segHi;
      // otherwise use pHi=pLo+1.
      // NOTE: Just because the level is non-unique, that does not
      // guarantee that segHi is defined: because we only generate segHi
      // whenever coiterating, in order to improve code quality for the
      // non-coiterating cases.
      const auto parentSegHi = segHi[tid][srcLvl - 1];
      highs[tid][srcLvl] =
          (!isUniqueDLT(lvlTypes[tid][srcLvl - 1]) && parentSegHi)
              ? parentSegHi
              : ADDI(pLo, c1);
      return;
    }
  }

  llvm_unreachable("Unrecognized level-type!");
}
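
// For example (illustrative, for prepareLoopOverTensorAtLvl above): on a
// compressed (CSR-like) level, the emitted loads fetch positions[p] and
// positions[p + 1], where p is the parent level's position, giving the
// classic half-open range [pLo, pHi) of stored entries below that parent.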

void LoopEmitter::emitExtraLocalsForTensorsAtDenseLvls(
    OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls) {
  // Initialize dense positions. Note that we generate dense coordinates of
  // the output tensor unconditionally, since they may not appear in the
  // lattice, but may be needed for linearized codegen.
  for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
    if (isDenseDLT(lvlTypes[tid][lvl])) {
      // Slice-driven dense levels should have been handled already.
      if (!dependentLvlMap[tid][lvl].empty())
        continue;

      auto enc = getSparseTensorEncoding(tensors[tid].getType());
      if (enc && !isSparseOutput(tid)) {
        bool validPos = lvl == 0 || posits[tid][lvl - 1];
        if (!validPos) {
          // We might not find the pos for the sparse output tensor as it is
          // unconditionally required by the sparsification.
          assert(isOutputTensor(tid));
          continue;
        }
        posits[tid][lvl] =
            genAddress(builder, loc, tid, lvl, loopStack.back().iv);
        // NOTE: we can also prepare for the next lvl here in advance.
      }
    }
  }
}

void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
                              MutableArrayRef<Value> reduc) {
  const LoopInfo &loopInfo = loopStack.back();
  rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
  for (auto [tid, lvl, reduced] : loopInfo.sliceDrivenInfo) {
    SliceInfo &info = sliceStack[tid].back();
    assert(isDenseDLT(lvlTypes[tid][lvl]));
    assert(*info.slicedOnLvl == lvl && !reduced);
    (void)reduced;
    // Resets the slice pointers, as the resolved slices are invalidated after
    // we move forward to the next slice.
    invalidateSliceIterIdx(rewriter, loc, tid, lvl);
    info.minCrd = info.offset = info.isNonEmpty = Value();
    levelReducedDep[tid][lvl]--;
  }
  if (auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop)) {
    if (!reduc.empty()) {
      assert(reduc.size() == forOp.getNumResults());
      rewriter.create<scf::YieldOp>(loc, reduc);
    }
    // Exit the loop.
    rewriter.setInsertionPointAfter(forOp);
    // In-place update reduction variables.
    for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
      reduc[i] = forOp.getResult(i);
  } else {
    auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
    if (!reduc.empty()) {
      assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
      Operation *redExp = reduc.front().getDefiningOp();
      // The reduction expression should have no use.
      assert(redExp->getUses().empty());
      // This must be a binary operation.
      // NOTE: It is the user's responsibility to ensure the operation is
      // commutative.
      assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);

      Value redVal = parOp.getInitVals().front();
      Value curVal;
      if (redExp->getOperand(0) == redVal)
        curVal = redExp->getOperand(1);
      else if (redExp->getOperand(1) == redVal)
        curVal = redExp->getOperand(0);
      // One of the operands must be the init value (which is also the
      // previous reduction value).
      assert(curVal);
#ifndef NDEBUG
      // The reduction expression should be the only user of the reduction val
      // inside the parallel for.
      unsigned numUsers = 0;
      for (Operation *op : redVal.getUsers()) {
        if (op->getParentOp() == parOp)
          numUsers++;
      }
      assert(numUsers == 1);
#endif // NDEBUG

      rewriter.setInsertionPointAfter(redExp);
      auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
      // Attach to the reduction op.
      Block *redBlock = &redOp.getRegion().getBlocks().front();
      rewriter.setInsertionPointToEnd(redBlock);
      Operation *newRed = rewriter.clone(*redExp);
      // Replaces arguments of the reduction expression by using the block
      // arguments from scf.reduce.
      rewriter.updateRootInPlace(
          newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
      // Erases the out-dated reduction expression.
      rewriter.eraseOp(redExp);
      rewriter.setInsertionPointToEnd(redBlock);
      rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
    }
    rewriter.setInsertionPointAfter(parOp);
    // In-place update reduction variables.
    for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
      reduc[i] = parOp.getResult(i);
  }

  // Finished iterating a tensor, clean up.
  // We only do the clean-up on for loops, as while loops do not necessarily
  // finish iterating over a sparse tensor.
  for (auto [tid, lvl] : unpackTensorLevelRange(loopInfo.tidLvls)) {
    // Reset to null.
    coords[tid][lvl] = Value();
    posits[tid][lvl] = Value();
    // Dense level, high is fixed.
    if (!isDenseDLT(lvlTypes[tid][lvl]))
      highs[tid][lvl] = Value();
  }
}

void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc,
                                MutableArrayRef<Value> reduc) {
  const LoopInfo &loopInfo = loopStack.back();
  auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop);
  builder.setInsertionPointToEnd(loopInfo.userCodeBlock);
  Value iv = loopInfo.iv;

  // Finalize the induction. Note that the induction could be performed
  // in the individual if-branches to avoid re-evaluating the conditions.
  // However, that would result in a rather elaborate forest of yield
  // instructions during code generation. Moreover, performing the induction
  // after the if-statements more closely resembles code generated by TACO.
  unsigned o = 0;
  SmallVector<Value> operands;
  unsigned delta = 0;
  for (auto [tid, lvl, resolved] : loopInfo.sliceDrivenInfo) {
    // TODO: handle dense.
    assert(isCompressedDLT(lvlTypes[tid][lvl]));
    levelReducedDep[tid][lvl]--;
    if (!resolved) {
      genSliceNextInduction(builder, loc, whileOp, tid, lvl, operands, o);
      continue;
    }
    // TODO: We need to distinguish coiterate loops with slice-driven loops
    // and fully reduced while ops for iterating one slice.
    // FIXME: since we didn't implement coiteration, this must be iteration
    // just on a fully resolved slice.
    assert(loopInfo.sliceDrivenInfo.size() == 1 && loopInfo.tidLvls.empty());
    // The if guard to filter out out-of-range coordinates.
    assert(llvm::isa<scf::IfOp>(builder.getInsertionBlock()->getParentOp()));
    posits[tid][lvl] = whileOp->getResult(o++);
    // FIXME: we are not using continue here since we do not support
    // coiteration on slices. But it needs to be treated similarly to the
    // universal index.
    o++; // skip continue flag.
    // Since we did not push two results from the whileOp, the size of the
    // operands vector is smaller than the actual number of return values from
    // the whileOp.
    // This is because we are actually generating yields in the IfOp inside
    // the whileOp to only iterate over inbound coordinates within the slices.
    delta += 2;
  }

  Value one = C_IDX(1);
  for (auto [tid, dstLvl] : unpackTensorLevelRange(loopInfo.tidLvls)) {
    const auto lvlTp = lvlTypes[tid][dstLvl];
    if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
        isCompressedWithHiDLT(lvlTp)) {
      const auto reassoc = getCollapseReassociation(tid, dstLvl);
      assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
        const Level srcLvl = reassoc[i];
        if (!isUniqueDLT(lvlTypes[tid][srcLvl])) {
          operands.push_back(segHi[tid][srcLvl]);
          o++;
        }
      }
      const Value crd = coords[tid][dstLvl];
      const Value pos = posits[tid][dstLvl];
      Value cmp = CMPI(eq, crd, iv);
      // If the loop contains a coiteration with a non-unique level, we fast
      // forward all the duplicated coords by setting the position to the
      // segment high.
      Value add = !isUniqueDLT(lvlTypes[tid][reassoc.back()])
                      ? segHi[tid][reassoc.back()]
                      : ADDI(pos, one);

      operands.push_back(SELECT(cmp, add, pos));
      // Following loops continue iteration from the break point of the
      // current while loop.
      const Value newPos = whileOp->getResult(o++);
      // We need to define a new local variable for `tid` to avoid
      // warnings about "captured structured bindings are a C++20 extension".
      // FIXME(wrengr): define a helper function to capture this idiom!
      const TensorId newTid = tid;
      llvm::for_each(reassoc, [this, newTid, newPos](Level srcLvl) {
        posits[newTid][srcLvl] = newPos;
      });
      // The coordinate is invalid now.
      coords[tid][dstLvl] = nullptr;
      // The segment high is invalid now.
      segHi[tid][dstLvl] = nullptr;
      // highs remains unchanged.
    }
  }

  // Reduction values from users.
  for (auto &i : reduc) {
    operands.push_back(i);
    // In-place update reduction variable.
    i = whileOp->getResult(o++);
  }

  // An (optional) universal index.
  if (operands.size() + delta < whileOp.getNumResults()) {
    assert(operands.size() + delta + 1 == whileOp.getNumResults());
    // The last one is the universal index.
    operands.push_back(ADDI(iv, one));
    // Update the loop starting point of the current loop sequence.
    loopSeqStack.back().first = whileOp->getResult(o++);
  }

  assert(o == operands.size() + delta);
  YIELD(operands);
  builder.setInsertionPointAfter(whileOp);
}

void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc,
                                  MutableArrayRef<Value> reduc) {
  // Clean up the values; this helps us discover potential bugs at an earlier
  // stage (instead of silently using a wrong value).
  const LoopInfo &loopInfo = loopStack.back();
  SmallVector<Value> red;
  if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
    exitWhileLoop(rewriter, loc, reduc);
  } else {
    exitForLoop(rewriter, loc, reduc);
  }

  assert(loopStack.size() == loopSeqStack.size());
  loopStack.pop_back();
}

//===----------------------------------------------------------------------===//
// Slice-driven loop related methods.
//===----------------------------------------------------------------------===//
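
// Returns the number of index-expression dependencies on this level that
// remain to be reduced. For example, a level indexed by the (hypothetical)
// expression d0 + d1 starts with two dependencies; after an outer
// slice-driven loop reduces d0, one dependency remains.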
unsigned LoopEmitter::remDepOnLevel(TensorId tid, Level lvl) const {
  unsigned totalDependencies = dependentLvlMap[tid][lvl].size();
  if (totalDependencies != 0) {
    assert(totalDependencies >= 2);
    return totalDependencies - levelReducedDep[tid][lvl];
  }
  return totalDependencies;
}

const LoopEmitter::SliceInfo &LoopEmitter::getMostRecentSliceOnLvl(TensorId tid,
                                                                   Level lvl) {
  // Finds the most-recent slice using a reverse iteration.
  for (auto it = sliceStack[tid].rbegin(), ie = sliceStack[tid].rend(); it < ie;
       it++) {
    if (it->slicedOnLvl == lvl) { // the level matched
      return *it;
    }
  }
  llvm_unreachable("Failed to find sliceInfo");
}

// Generates a while loop to iterate over a sliced sparse level as follows.
//
// while (loopLo < loopHi) {
//   if (coords[loopLo] < offset + size) {
//     body_builder
//   } else {
//     break;
//   }
//   loopLo ++;
// }
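//
// For example, with a (hypothetical) offset of 2 and a slice size of 3, the
// loop visits the positions whose coordinates fall in [2, 5) and stops at
// the first coordinate >= 5.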
std::pair<Operation *, ValueRange> LoopEmitter::genSliceLvlTraverseLoop(
    OpBuilder &builder, Location loc, Value loopLo, Value loopHi, Value offset,
    Value size, TensorId tid, Level lvl, ValueRange userReduc, bool genYield,
    LoopBodyBuilder bodyBuilder) {
  Value c1 = C_IDX(1);
  Value sliceHi = ADDI(offset, sliceSizes[tid][lvl].back());

  SmallVector<Value> reduc = {
      loopLo,                         // loop lower bounds
      constantI1(builder, loc, true), // continue
  };
  // Append user-required reduction values.
  reduc.append(userReduc.begin(), userReduc.end());
  scf::WhileOp whileOp = builder.create<scf::WhileOp>(
      loc, ValueRange(reduc).getTypes(), reduc,
      /*beforeBuilder=*/
      [loopHi](OpBuilder &builder, Location loc, ValueRange args) {
        Value lo = args[0];
        Value cont = args[1];
        Value inBound = CMPI(ult, lo, loopHi);
        Value cond = ANDI(cont, inBound);
        // Continue if we have not yet broken out and are still in bounds.
        builder.create<scf::ConditionOp>(loc, cond, args);
      },
      /*afterBuilder=*/
      [this, c1, tid, lvl, sliceHi, genYield,
       bodyBuilder](OpBuilder &builder, Location loc, ValueRange args) {
        Value iv = args[0];
        Value coord =
            genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], iv);
        Value cont = CMPI(ult, coord, sliceHi);
        TypeRange types = args.drop_front(2).getTypes();

        auto ifOp = builder.create<scf::IfOp>(loc, types, cont, true);
        {
          // Two reduction variables are maintained by us.
          SmallVector<Value> ifRet = args.drop_front(2);
          assert(ifRet.size() == args.size() - 2);

          OpBuilder::InsertionGuard guard(builder);
          // If coord >= sliceHi.
          builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
          YIELD(ifRet);

          // If coord < sliceHi.
          builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Delegates to the user's callback.
          bodyBuilder(builder, loc, iv, ifRet);
          if (genYield) {
            builder.setInsertionPointToEnd(&ifOp.getThenRegion().front());
            YIELD(ifRet);
          }
        }
        // Marks this special ifOp so that sparsification does not finalize
        // it.
        ifOp->setAttr(getLoopEmitterLoopAttrName(),
                      StringAttr::get(builder.getContext(), "slice"));
        // Insertion point restored to after ifOp.
        SmallVector<Value> yields;
        // Increment the induction variable.
        yields.push_back(ADDI(iv, c1));
        yields.push_back(cont);
        yields.append(ifOp.getResults().begin(), ifOp.getResults().end());
        YIELD(yields);
      });

  builder.setInsertionPointAfter(whileOp);
  return std::make_pair(whileOp, whileOp.getResults().drop_front(2));
}

// Generates a loop nest that traverses all the unresolved levels in between.
// TODO: it can only handle tensors where all levels are compressed.
//
// for (int i = 0; i < slicePos.size(); i += 2) {
//   loopLo = slicePos[i];
//   loopHi = slicePos[i + 1];
//
//   // Then the same loop generated by genSliceLvlTraverse above.
//   while (loopLo < loopHi) {
//     if (pos[loopLo] < sliceHi) {
//       bodyBuilder();
//     } else {
//       break;
//     }
//     loopLo ++;
//   }
// }
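//
// Note that the slice position buffer itself starts with two metadata
// entries, [memSize, idx], so the generated traversal actually starts at
// index 2 and steps by 2 up to memSize (the pseudocode above elides the
// metadata for readability).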
ValueRange LoopEmitter::genUnResolvedSliceTreeTraverse(
    OpBuilder &builder, Location loc, TensorId tid,
    ArrayRef<const SliceInfo *> unResLvls,
    std::optional<std::pair<TensorId, Level>> firstResLvl, ValueRange userReduc,
    LoopBodyBuilder bodyBuilder) {

  Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
  Value pos = c0;
  OpBuilder::InsertPoint ip;
  SmallVector<Value> innerArgs(userReduc.begin(), userReduc.end());
  scf::ForOp outerMost = nullptr; // the outermost loop.
  if (firstResLvl.has_value()) {
    // Overwrite the position when the first level is fully resolved.
    pos = posits[firstResLvl->first][firstResLvl->second];
    ip = builder.saveInsertionPoint();
  } else {
    const SliceInfo &frontSlice = *unResLvls.back();
    Level firstLvl = *frontSlice.slicedOnLvl;
    if (!lvlFullyResolved(tid, firstLvl)) {
      if (isCompressedDLT(lvlTypes[tid][firstLvl])) {
        unsigned depth = frontSlice.depth - 1;
        Value offset = frontSlice.offset;
        Value sPtrBuf = slicePosBuffer[tid][firstLvl][depth];
        Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize
        outerMost = builder.create<scf::ForOp>(
            loc, c2, mSz, c2, innerArgs,
            [this, c1, tid, firstLvl, offset, sPtrBuf, &ip, &pos,
             &innerArgs](OpBuilder &builder, Location loc, Value iv,
                         ValueRange iterArgs) {
              // Generate the traversal for each level.
              Value loopLo = genIndexLoad(builder, loc, sPtrBuf, iv);
              Value loopHi = genIndexLoad(builder, loc, sPtrBuf, ADDI(iv, c1));
              ValueRange itArgs =
                  genSliceLvlTraverseLoop(
                      builder, loc, loopLo, loopHi, offset,
                      sliceSizes[tid][firstLvl].back(), tid, firstLvl, iterArgs,
                      false,
                      [&](OpBuilder &builder, Location, Value iv,
                          MutableArrayRef<Value> reduc) {
                        ip = builder.saveInsertionPoint();
                        pos = iv;
                        innerArgs.assign(reduc.begin(), reduc.end());
                      })
                      .second;
              YIELD(itArgs);
            });
      } else if (isDenseDLT(lvlTypes[tid][firstLvl])) {
        assert(firstLvl == 0); // This must be the first level.
        Value lb = frontSlice.offset;
        Value sliceSz =
            sliceSizes[tid][*frontSlice.slicedOnLvl][frontSlice.depth - 1];
        Value ub = ADDI(lb, sliceSz);
        outerMost = builder.create<scf::ForOp>(
            loc, lb, ub, c1, innerArgs,
            [&](OpBuilder &builder, Location loc, Value iv,
                ValueRange iterArgs) {
              ip = builder.saveInsertionPoint();
              pos = iv;
              innerArgs.assign(iterArgs.begin(), iterArgs.end());
            });
      }
      // We generated the loop for the first slice above; now remove it.
      unResLvls = unResLvls.drop_back();
    }
  }
  // Reset the insertion point into the loop body.
  builder.restoreInsertionPoint(ip);
  if (!unResLvls.empty()) {
    // Fills in the dense slice levels in between.
    SmallVector<Value> lbs, ubs, steps, lvlSzs;
    for (const SliceInfo *slice : llvm::reverse(unResLvls)) {
      Level sliceLvl = *slice->slicedOnLvl;
      assert(isDenseDLT(lvlTypes[tid][sliceLvl]));
      Value offset = slice->offset;
      Value sliceSz = sliceSizes[tid][sliceLvl][slice->depth - 1];
      lbs.push_back(offset);
      ubs.push_back(ADDI(offset, sliceSz));
      steps.push_back(c1);
      lvlSzs.push_back(lvlSizes[tid][sliceLvl]);
    }
    auto denseNest =
        scf::buildLoopNest(builder, loc, lbs, ubs, steps, innerArgs,
                           [&innerArgs, &lvlSzs, &pos, bodyBuilder](
                               OpBuilder &builder, Location loc, ValueRange ivs,
                               ValueRange iterArgs) -> scf::ValueVector {
                             for (auto em : llvm::enumerate(ivs)) {
                               // Linearizes the position:
                               //   pos = (pos * lvlSize) + iv;
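                               // E.g., with a (hypothetical) incoming pos p
                               // and two dense levels of sizes s0 and s1, the
                               // final position is ((p * s0 + i0) * s1) + i1.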
                               pos = MULI(pos, lvlSzs[em.index()]);
                               pos = ADDI(pos, em.value());
                             }
                             innerArgs.assign(iterArgs.begin(), iterArgs.end());
                             // Generates the user-requested loop body.
                             bodyBuilder(builder, loc, pos, innerArgs);
                             return innerArgs;
                           });

    if (!outerMost) {
      // If the outermost loop has not been set, this is the outermost loop.
      outerMost = denseNest.loops.front();
    } else {
      // Otherwise we need to generate yield operations to link the SSA chain.
      YIELD(denseNest.results);
    }
  } else {
    assert(outerMost);
    // Generates the user-requested loop body.
    bodyBuilder(builder, loc, pos, innerArgs);
    YIELD(innerArgs);
  }
  assert(outerMost);
  // Insert after the current while operation.
  builder.setInsertionPointAfter(outerMost);
  return outerMost.getResults();
}

void LoopEmitter::genResolvedSliceBegin(OpBuilder &builder, Location loc,
                                        TensorId tid, Level lvl) {
  Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2), c3 = C_IDX(3),
        c4 = C_IDX(4);
  if (isDenseDLT(lvlTypes[tid][lvl])) {
    // Dense slice begin is trivial.
    sliceStack[tid].emplace_back(/*minCoord=*/c0, /*offset=*/c0,
                                 /*nonEmpty=*/constantI1(builder, loc, true),
                                 lvl, /*depth=*/1);
    return;
  }
  Value size = sliceSizes[tid][lvl][0];
  Value sPtrBuf = slicePosBuffer[tid][lvl][0];
  Value pHi, pLo;
  if (lvl == 0) {
    pLo = c0;
    pHi = genIndexLoad(builder, loc, positionsBuffers[tid][0], c1);
  } else {
    pLo = genIndexLoad(builder, loc, positionsBuffers[tid][lvl],
                       posits[tid][lvl - 1]);
    pHi = genIndexLoad(builder, loc, positionsBuffers[tid][lvl],
                       ADDI(posits[tid][lvl - 1], c1));
  }
  // Fills out pIdxBuffer[tid][lvl][0] with [/*memSize=*/4, /*index=*/0, pLo,
  // pHi].
  builder.create<memref::StoreOp>(loc, c4, sPtrBuf, c0);  // memSize = 4
  builder.create<memref::StoreOp>(loc, c0, sPtrBuf, c1);  // index = 0
  builder.create<memref::StoreOp>(loc, pLo, sPtrBuf, c2); // pLo
  builder.create<memref::StoreOp>(loc, pHi, sPtrBuf, c3); // pHi

  // This is a non-empty tensor if 0 < pHi.
  Value isNonEmpty = CMPI(ult, c0, pHi);
  // On an ordered level, the minimal coordinate must be the first one.
  // FIXME: Technically we should load the coordinate only when the slice is
  // non-empty. However, we assume that even for empty sparse tensors, a
  // non-empty ptr/idx buffer is allocated for each level, so the load stays
  // in bounds and we avoid generating an ifOp here.
  Value minCrd = genIndexLoad(builder, loc, coordinatesBuffers[tid][0], c0);

  // FIXME: We need the relative offset related to the base slice.
  Value absOffset = offsetFromMinCoord(builder, loc, minCrd, size, isNonEmpty);
  sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, lvl, /*depth=*/1);
}

// Fills in the slicePosBuffer before the slice-driven loop begins.
// TODO: it can only handle tensors where all levels are compressed.
//
// // Loop generated by `genUnResolvedSliceTreeTraverse`:
// for (int i = 0; i < slicePos.size(); i += 2) {
//   loopLo = slicePos[i];
//   loopHi = slicePos[i + 1];
//   minCrd = max;
//   while (loopLo < loopHi) {
//     if (pos[loopLo] < sliceHi) {
//       // bodyBuilder
//       slicePos[tid].push_back(pos[loopLo]);
//       slicePos[tid].push_back(pos[loopLo + 1]);
//       minCrd = min(minCrd, crd[pos[loopLo]]);
//     } else {
//       break;
//     }
//     loopLo ++;
//   }
// }
void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
                                          TensorId tid, Level lvl) {
  Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
  unsigned depth = levelReducedDep[tid][lvl];
  Value size = sliceSizes[tid][lvl][depth];
  // Dense slice begin is trivial.
  if (isDenseDLT(lvlTypes[tid][lvl])) {
    sliceStack[tid].emplace_back(c0, c0, constantI1(builder, loc, false), lvl,
                                 depth + 1);
    return;
  }

  assert(isCompressedDLT(lvlTypes[tid][lvl]));
  // Unhandled cases:
  //
  // 1st, lvl == prevSlicedLvl, i.e., t[d0 + d1 + d2, ...] (more than one
  // variable needs to be reduced on the same level).
  //
  // 2nd, lvl > prevSlicedLvl + 1, i.e., t[..., d2, d3 + d4] (having a
  // simple dim expression in between).
  assert(lvl == *sliceStack[tid].back().slicedOnLvl + 1);

  // Check slice stack integrity.
  assert(slicePosBuffer[tid][lvl - 1].size() == sliceStack[tid].back().depth);

  SmallVector<const SliceInfo *> unResSlices;
  std::optional<std::pair<TensorId, Level>> firstResLvl;
  for (Level curLvl = lvl; curLvl >= 1; curLvl--) {
    Level prevLvl = curLvl - 1;
    if (lvlFullyResolved(tid, prevLvl)) {
      firstResLvl = std::make_pair(tid, prevLvl);
      break;
    }
    unResSlices.push_back(&getMostRecentSliceOnLvl(tid, prevLvl));
    if (!isDenseDLT(lvlTypes[tid][prevLvl])) {
      break;
    }
  }

  assert(!unResSlices.empty() &&
         !lvlFullyResolved(tid, *unResSlices.front()->slicedOnLvl));

  Value sPtrBuf = slicePosBuffer[tid][lvl].back();
  SmallVector<Value, 3> reduc = {
      constantI1(builder, loc, false), // isNonEmpty
      lvlSizes[tid][lvl],              // minCoord
      c2,                              // memSize
  };

  ValueRange result = genUnResolvedSliceTreeTraverse(
      builder, loc, tid, unResSlices, firstResLvl, reduc,
      [this, c1, c2, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc,
                                        Value iv,
                                        MutableArrayRef<Value> reduc) {
        Value &nonEmpty = reduc[0];
        Value &minCrd = reduc[1];
        Value &curMemSz = reduc[2];

        Value pHi = ADDI(iv, c1);
        Value sPLo = genIndexLoad(builder, loc, positionsBuffers[tid][lvl], iv);
        Value sPHi =
            genIndexLoad(builder, loc, positionsBuffers[tid][lvl], pHi);

        // isNonEmpty = isNonEmpty || lvlNonEmpty, i.e., as long as there is
        // one non-empty lvl, the slice is non-empty.
        Value lvlNonEmpty = CMPI(ult, sPLo, sPHi);
        nonEmpty = builder.create<arith::OrIOp>(loc, lvlNonEmpty, nonEmpty);

        // Update the minimum coordinate.
        auto ifNonEmpty = builder.create<scf::IfOp>(loc, builder.getIndexType(),
                                                    lvlNonEmpty, true);
        {
          // Generate code as follows.
          //
          // if (nonEmpty) {
          //   minCrd = min(minCrd, crd[pos[pLo]]);
          // }
          OpBuilder::InsertionGuard guard(builder);
          builder.setInsertionPointToStart(ifNonEmpty.thenBlock());
          Value curC =
              genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], sPLo);
          Value isSmaller = CMPI(ult, curC, minCrd);
          Value newMin = SELECT(isSmaller, curC, minCrd);
          YIELD(newMin);
          builder.setInsertionPointToStart(ifNonEmpty.elseBlock());
          YIELD(minCrd);
        }
        minCrd = ifNonEmpty.getResult(0);
        builder.create<memref::StoreOp>(loc, sPLo, sPtrBuf, curMemSz);
        Value nxtMemSize = ADDI(curMemSz, c1);
        builder.create<memref::StoreOp>(loc, sPHi, sPtrBuf, nxtMemSize);
        // curMemSize += 2
        curMemSz = ADDI(curMemSz, c2);
      });

  Value isNonEmpty = result[0];
  Value minCrd = result[1];
  // Two metadata [memSize, idx].
  // TODO: we could use SSA values for these two metadata.
  builder.create<memref::StoreOp>(loc, result[2], sPtrBuf, c0);
  builder.create<memref::StoreOp>(loc, c0, sPtrBuf, c1);
  // FIXME: we need the relative offset related to the base slice.
  Value absOffset = offsetFromMinCoord(builder, loc, minCrd, size, isNonEmpty);
  sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, lvl, depth + 1);
}
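
// Generates code to prepare iteration over a slice at `lvl` of tensor `tid`.
// Returns true when the dependencies on the level are fully reduced (so the
// caller can fall back to regular TACO-style iteration using the cached
// positions), and false when a slice begin had to be generated.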
bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
                                Level lvl) {
  Value c1 = C_IDX(1), c2 = C_IDX(2);

  if (depFullyReduced(tid, lvl)) {
    // There is no need to prepare for a slice-driven loop on a dense level
    // after it is fully reduced.
    if (isDenseDLT(lvlTypes[tid][lvl]))
      return true;
    // If the constraints on the tensor are fully resolved, we no longer need
    // to generate the slice begin; instead, we fall back to the TACO-based
    // algorithm to (co)iterate over the slice.
    Value pLoPtr =
        genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), c1);
    pLoPtr = ADDI(pLoPtr, c2);
    Value pHiPtr = ADDI(pLoPtr, c1);
    posits[tid][lvl] =
        genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), pLoPtr);
    highs[tid][lvl] =
        genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), pHiPtr);
    return true;
  }

  // Only when the level is sorted can the next non-empty slice be computed
  // efficiently.
  const DimLevelType lvlType = lvlTypes[tid][lvl];
  assert(isOrderedDLT(lvlType));
  if (isSingletonDLT(lvlType)) {
    llvm_unreachable("TODO: dense level should be easy to support, while "
                     "singleton level requires more effort");
  }

  assert(!dependentLvlMap[tid][lvl].empty());
  assert(!sliceStack[tid].empty());

  const SliceInfo &sliceInfo = sliceStack[tid].back();
  auto baseEnc = getSparseTensorEncoding(tensors[tid].getType());
  if (baseEnc.isSlice())
    llvm_unreachable("TODO: not yet implemented");

  // Generate the caches required to quickly compute the next non-empty
  // slice with increasing offset for the slice-based loop.
  // We do not need a cache for dense levels.
  if (slicePosBuffer[tid][lvl][0] == nullptr && !isDenseDLT(lvlType)) {
    OpBuilder::InsertionGuard guard(builder);
    // The buffer can be reused, and the size is loop invariant: it only
    // depends on the iteration graph's toposort.
    builder.setInsertionPointAfter(localInsertPos);
    Value bufSize = C_IDX(1);
    Value c2 = C_IDX(2);
    // Accumulates the size required to cache the pLo for the slice.
    // E.g., if we want to cache the pIdx for slice<d0xd1xf64> on the second
    // level, we need at most a memref<d0xindex>.
    // NOTE: this is apparently an over-approximation when the previous
    // level is compressed, and we could compute a precise memory size
    // inside the loops. But that would also require us to allocate/free
    // memory in loops.
    // TODO: Maybe use an allocaScopeOp inside the loop to resolve the issue?
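    // Concretely, for the slice<d0xd1xf64> example above, the buffer holds
    // at most d0 [pLo, pHi] pairs plus the two metadata entries below, i.e.,
    // 2 * d0 + 2 index values.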
    for (Level curLevel = lvl;
         curLevel >= 1 && !lvlFullyResolved(tid, curLevel - 1); curLevel--) {
      auto depth = remDepOnLevel(tid, curLevel - 1);
      assert(sliceSizes[tid][lvl].size() >= depth);
      Value sz = *(sliceSizes[tid][lvl].rbegin() + depth - 1);
      bufSize = MULI(bufSize, sz);
    }
    // Each entry is a pair of [pLo, pHi]. Note that we cannot compress out
    // pHi, because slicing creates segments in the index buffer: the pHi of
    // the current level is no longer the pLo of the next level.
    bufSize = MULI(bufSize, c2);
    // Two additional metadata {memSize, idx} at the head.
    bufSize = ADDI(bufSize, c2);
    llvm::for_each(
        slicePosBuffer[tid][lvl], [bufSize, loc, &builder](Value &cache) {
          cache = genAlloca(builder, loc, bufSize, builder.getIndexType());
        });
  }

  if (sliceInfo.isInitialTensor() ||
      (lvl >= 1 && lvlFullyResolved(tid, lvl - 1))) {
    // First level, or the previous level has been fully resolved.
    genResolvedSliceBegin(builder, loc, tid, lvl);
  } else {
    // The previous level has not been fully resolved.
    genUnResolvedSliceBegin(builder, loc, tid, lvl);
  }
  return false;
}
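
// Resets the `idx` metadata slot (position 1) of the slice position buffer
// of each non-dense level at or below `lvl` that has a non-trivial index
// dependency, so that stale iteration indices from a previous slice are not
// reused.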
void LoopEmitter::invalidateSliceIterIdx(OpBuilder &builder, Location loc,
                                         TensorId tid, Level lvl) {
  for (unsigned i = 0; i <= lvl; i++) {
    if (!isDenseDLT(lvlTypes[tid][i]) && !dependentLvlMap[tid][i].empty()) {
      builder.create<memref::StoreOp>(loc, C_IDX(0),
                                      slicePosBuffer[tid][i].back(), C_IDX(1));
    }
  }
}
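
// Generates the induction code that forwards `tid` at `lvl` to its next
// non-empty slice: it appends the new (isNonEmpty, minCrd, absOffset) values
// to `operands` and rebinds the slice metadata to the corresponding results
// of the enclosing while op via `retIdx`.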
void LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc,
                                        const Operation *op, TensorId tid,
                                        Level lvl,
                                        SmallVectorImpl<Value> &operands,
                                        unsigned &retIdx) {
  if (!isCompressedDLT(lvlTypes[tid][lvl]))
    llvm_unreachable("TODO");

  // Otherwise, generate code to compute the next non-empty slice.
  Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);

  auto whileOp = llvm::cast<scf::WhileOp>(op);
  SliceInfo &info = sliceStack[tid].back();
  assert(info.slicedOnLvl == lvl);
  //
  // We forward to the next non-empty slice by
  // if (minCrd > offset) {
  //   offset += 1
  // } else {
  //   minCrd = nextMinInSlice();
  //   offset = minCrd - size + 1;
  // }
  //
  // if (offset + size > parents.size)
  //   isNonEmpty = false;
  //
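  // For example, with (hypothetical) values size = 3, offset = 2, and
  // minCrd = 4, the fast path applies and the next offset is simply 3; once
  // offset catches up with minCrd, the slow path advances the positions
  // whose coordinate equals minCrd and recomputes minCrd over the remaining
  // coordinates.
  //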
  Value absOffset = info.offset;
  // Reset the slice pointers, as the resolved slices are invalidated after
  // we move forward to the next slice.
  invalidateSliceIterIdx(builder, loc, tid, lvl);

  SmallVector<Value, 3> reduc = {info.minCrd, info.isNonEmpty, absOffset};
  Value sPtrBuf = slicePosBuffer[tid][lvl][info.depth - 1];
  Value fastPathP = CMPI(ugt, info.minCrd, absOffset);
  auto ifOp = builder.create<scf::IfOp>(loc, ValueRange(reduc).getTypes(),
                                        fastPathP, true);
  {
    OpBuilder::InsertionGuard guard(builder);
    // Take the fast path:
    // if (minCrd > offset) {
    //   return offset += 1
    // }
    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
    reduc[2] = ADDI(absOffset, c1);
    // Yield offset + 1.
    YIELD(reduc);

    // else /*minCrd == offset*/ {
    //   for (i = 0; i < slicePos.size(); i += 2) {
    //     if (crd[pos[slicePos[i]]] == minCrd) {
    //       slicePos[i]++;
    //     }
    //     minCrd = min(minCrd, crd[pos[slicePos[i]]]);
    //   }
    //   offset = minCrd - size + 1;
    // }
    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
    reduc[2] = absOffset; // restore value.
    Value pSt = c2;       // pointer starting index
    Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize
    reduc[0] = lvlSizes[tid][lvl];              // next min coord
    reduc[1] = constantI1(builder, loc, false); // isNonEmpty
    auto loopArgs = static_cast<ValueRange>(reduc).drop_back();
    auto forOp = scf::buildLoopNest(
        builder, loc, pSt, mSz, c2, loopArgs,
        [this, tid, lvl, c1, sPtrBuf,
         &info](OpBuilder &builder, Location loc, ValueRange ivs,
                ValueRange iterArgs) -> scf::ValueVector {
          Value curMinCrd = iterArgs[0];
          Value isNonEmpty = iterArgs[1];

          Type idxTp = builder.getIndexType();
          Value pLo = genIndexLoad(builder, loc, sPtrBuf, ivs.front());
          Value pHi =
              genIndexLoad(builder, loc, sPtrBuf, ADDI(ivs.front(), c1));
          //
          // if (pLo < pHi) // Only loads when inbound.
          //   coord = load[pLo]
          //   if coord == minCrd
          //     pLo += 1
          //
          // if (pLo < pHi)
          //   curMinCrd = min(curMinCrd, load[pLo])
          //
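          // E.g., (hypothetical values) if this segment holds coordinates
          // [3, 5] and minCrd == 3, pLo advances by one and curMinCrd is
          // then updated with the next coordinate, 5.
          //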
          Value pred = CMPI(ult, pLo, pHi);
          auto advPLo = builder.create<scf::IfOp>(loc, idxTp, pred, true);
          /* if pLo < pHi */ {
            builder.setInsertionPointToStart(&advPLo.getThenRegion().front());
            // coord = load[pLo]
            Value coord =
                genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo);
            Value pred = CMPI(eq, coord, info.minCrd);
            auto ifEqual = builder.create<scf::IfOp>(loc, idxTp, pred, true);
            /* if coord == minCrd */ {
              builder.setInsertionPointToStart(
                  &ifEqual.getThenRegion().front());
              Value newPlo = ADDI(pLo, c1);
              // Updates the cache.
              builder.create<memref::StoreOp>(loc, newPlo, sPtrBuf,
                                              ivs.front());
              YIELD(newPlo);
            }
            /* else coord != minCrd */ {
              builder.setInsertionPointToStart(
                  &ifEqual.getElseRegion().front());
              YIELD(pLo);
            }
            builder.setInsertionPointAfter(ifEqual);
            YIELD(ifEqual.getResults());
          }
          /* else pLo >= pHi */ {
            builder.setInsertionPointToStart(&advPLo.getElseRegion().front());
            YIELD(pLo);
          }

          builder.setInsertionPointAfter(advPLo);
          pLo = advPLo.getResult(0);
          Value lvlNonEmpty = CMPI(ult, pLo, pHi);
          // Update the minimum coordinate.
          auto newMin =
              builder.create<scf::IfOp>(loc, idxTp, lvlNonEmpty, true);
          builder.setInsertionPointToStart(&newMin.getThenRegion().front());
          YIELD(genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo));

          builder.setInsertionPointToStart(&newMin.getElseRegion().front());
          YIELD(curMinCrd);
          builder.setInsertionPointAfter(newMin);

          // isNonEmpty = isNonEmpty || lvlNonEmpty
          isNonEmpty =
              builder.create<arith::OrIOp>(loc, lvlNonEmpty, isNonEmpty);
          curMinCrd = builder.create<arith::SelectOp>(
              loc, CMPI(ult, newMin.getResult(0), curMinCrd),
              newMin.getResult(0), curMinCrd);
          return {curMinCrd, isNonEmpty};
        });

    builder.setInsertionPointAfter(forOp.loops.front());
    // minOffset = minCrd + 1 >= size ? minCrd + 1 - size : c0
    Value tmp = ADDI(forOp.results.front(), c1);
    Value minOffset = SUBI(tmp, sliceSizes[tid][lvl][info.depth - 1]);
    Value p = CMPI(uge, tmp, sliceSizes[tid][lvl][info.depth - 1]);
    minOffset = SELECT(p, minOffset, c0);
    SmallVector<Value, 3> yields;
    yields.assign(forOp.results.begin(), forOp.results.end());
    yields.push_back(minOffset);
    YIELD(yields);
  }

  Value nextMinCrd = ifOp.getResults()[0];
  Value nextNonEmpty = ifOp.getResults()[1];

  // The next offset should be at least offset + 1.
  Value minOffset = ifOp.getResults()[2];
  Value nxOffset = ADDI(info.offset, c1);
  Value maxPred = CMPI(ugt, minOffset, nxOffset);
  Value nextAbsOffset = SELECT(maxPred, minOffset, nxOffset);

  Value sliceUB = ADDI(nextAbsOffset, sliceSizes[tid][lvl][info.depth - 1]);

  // FIXME: this only works if there is only one parent.
  assert(info.depth - 1 == 0);
  // nextNonEmpty = nextNonEmpty && slice upper bound <= parent upper bound.
  nextNonEmpty = ANDI(nextNonEmpty, CMPI(ule, sliceUB, lvlSizes[tid][lvl]));

  // FIXME: compute the relative offset.
  assert(info.depth - 1 == 0);
  Value nextRelOffset = nextAbsOffset;
  nextRelOffset = SELECT(nextNonEmpty, nextRelOffset, c0);

  operands.push_back(nextNonEmpty);
  operands.push_back(nextMinCrd);
  operands.push_back(nextAbsOffset); // we push the absolute offset.

  // Update the slice stack.
  info.isNonEmpty = whileOp.getResult(retIdx++);
  info.minCrd = whileOp.getResult(retIdx++);
  info.offset = whileOp.getResult(retIdx++);
}

Operation *LoopEmitter::emitSliceDrivenLoopOverTensorAtLvl(
    OpBuilder &builder, Location loc, TensorId tid, Level lvl,
    MutableArrayRef<Value> reduc) {
  assert(!depFullyReduced(tid, lvl));
  SliceInfo &sliceInfo = sliceStack[tid].back();
  assert(sliceInfo.slicedOnLvl == lvl);

  // The order matters!
  SmallVector<Value, 3> operands{sliceInfo.isNonEmpty, sliceInfo.minCrd,
                                 sliceInfo.offset};
  // Number of reductions maintained by us.
  size_t numMetaReduc = operands.size();

  // Append user-required reduction values.
  operands.append(reduc.begin(), reduc.end());
  assert(operands.size() == numMetaReduc + reduc.size());

  // while (slice.nonEmpty()) {
  //   bodyBuilder();
  //   SliceNext();
  // }
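  //
  // The first three while-loop arguments carry the slice metadata
  // (isNonEmpty, minCrd, offset); the remaining arguments carry the
  // user-provided reduction values.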
  auto whileOp = builder.create<scf::WhileOp>(
      loc, ValueRange(operands).getTypes(), operands,
      /*beforeBuilder=*/
      [](OpBuilder &builder, Location loc, ValueRange args) {
        builder.create<scf::ConditionOp>(loc, /*isNonEmpty*/ args[0], args);
      },
      /*afterBuilder=*/
      [this, tid, lvl, reduc, numMetaReduc,
       &sliceInfo](OpBuilder &builder, Location loc, ValueRange args) {
        assert(args.size() == reduc.size() + numMetaReduc);
        sliceInfo.isNonEmpty = args[0];
        sliceInfo.minCrd = args[1];
        sliceInfo.offset = args[2];
        // The slice offset is used to coiterate with other tensors'
        // coordinates.
        Value c = sliceInfo.offset;
        if (sliceInfo.depth > 1) {
          // The coordinate is the offset relative to its parent.
          // Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1]
          llvm_unreachable("TODO: not yet implemented");
        }
        coords[tid][lvl] = c;

        for (unsigned i = 0, e = reduc.size(); i < e; i++)
          reduc[i] = args[i + numMetaReduc];
      });

  // Set the insertion point to the while loop body.
  builder.setInsertionPointToEnd(&whileOp.getAfter().front());
  return whileOp;
}

#undef CMPI
#undef C_IDX
#undef YIELD
#undef ADDI
#undef ANDI
#undef SUBI
#undef MULI
#undef SELECT