//===- LoopEmitter.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "LoopEmitter.h"
|
|
#include "CodegenUtils.h"
|
|
|
|
#include "mlir/Dialect/Arith/IR/Arith.h"
|
|
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
|
|
#include "mlir/Dialect/Linalg/IR/Linalg.h"
|
|
#include "mlir/Dialect/Linalg/Utils/Utils.h"
|
|
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
|
#include "mlir/Dialect/SCF/IR/SCF.h"
|
|
#include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
|
|
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
|
#include "mlir/Dialect/Vector/IR/VectorOps.h"
|
|
|
|
using namespace mlir;
using namespace mlir::sparse_tensor;

//===----------------------------------------------------------------------===//
// File local shorthand macros
//===----------------------------------------------------------------------===//

#define CMPI(p, l, r)                                                          \
  (builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::p, (l), (r))       \
       .getResult())

#define C_IDX(v) (constantIndex(builder, loc, (v)))
#define YIELD(vs) (builder.create<scf::YieldOp>(loc, (vs)))
#define ADDI(lhs, rhs) (builder.create<arith::AddIOp>(loc, (lhs), (rhs)))
#define ANDI(lhs, rhs) (builder.create<arith::AndIOp>(loc, (lhs), (rhs)))
#define SUBI(lhs, rhs) (builder.create<arith::SubIOp>(loc, (lhs), (rhs)))
#define MULI(lhs, rhs) (builder.create<arith::MulIOp>(loc, (lhs), (rhs)))
#define REMUI(lhs, rhs) (builder.create<arith::RemUIOp>(loc, (lhs), (rhs)))
#define DIVUI(lhs, rhs) (builder.create<arith::DivUIOp>(loc, (lhs), (rhs)))
#define SELECT(c, l, r) (builder.create<arith::SelectOp>(loc, (c), (l), (r)))

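// Usage note (illustrative, not from the original source): the shorthands
// compose directly at the current insertion point; for example,
//   SELECT(CMPI(ult, a, b), a, b)
// materializes the IR for an unsigned min(a, b).
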
//===----------------------------------------------------------------------===//
// Debugging utils
//===----------------------------------------------------------------------===//

#ifndef NDEBUG
LLVM_ATTRIBUTE_UNUSED static void dumpIndexMemRef(OpBuilder &builder,
                                                  Location loc, Value memref) {
  memref = builder.create<memref::CastOp>(
      loc, UnrankedMemRefType::get(builder.getIndexType(), 0), memref);
  createFuncCall(builder, loc, "printMemrefInd", TypeRange{},
                 ValueRange{memref}, EmitCInterface::On);
}
#endif

//===----------------------------------------------------------------------===//
// File local helper functions.
//===----------------------------------------------------------------------===//

// For index reduction loops, since the tensors are sliced into non-contiguous
// fragments, we need a triple [pLo, pHi, pPtr], in which the pair (pLo, pHi)
// specifies the range of the fragment, and pPtr specifies the index of the
// corresponding fragment in the child level (i.e., a pointer to the sliced
// position array).
static constexpr unsigned kSliceIterWidth = 3;

static Value genSliceOffset(OpBuilder &builder, Location loc, Value tensor,
                            Level lvl) {
  auto enc = getSparseTensorEncoding(tensor.getType());
  return createOrFoldSliceOffsetOp(builder, loc, tensor, toDim(enc, lvl));
}

static Value genSliceStride(OpBuilder &builder, Location loc, Value tensor,
                            Level lvl) {
  auto enc = getSparseTensorEncoding(tensor.getType());
  return createOrFoldSliceStrideOp(builder, loc, tensor, toDim(enc, lvl));
}

/// Converts a coordinate relative to the slice to the coordinate relative
/// to the underlying tensor.
// FIXME: that description says "sliceCrd -> tensorCrd"; but the function
// name suggests it should be "tensorCrd -> sliceCrd".
static Value toSliceCrd(OpBuilder &builder, Location loc, Value crd,
                        Value offset, Value stride, Value tensor, Level lvl) {
  // tensorCrd = sliceCrd * stride + offset
  return ADDI(MULI(crd, stride), offset);
}

/// Generates code to compute the *absolute* offset of the slice based on the
/// provided minimum coordinate in the slice.
/// E.g., when reducing d0 + d1 + d2, we need two slices to fully reduce the
/// expression, i.e., s1 = slice(T, d0), s2 = slice(s1, d1). The *absolute*
/// offset is the offset computed relative to the initial tensor T.
///
/// When isNonEmpty == false, the computed offset is meaningless and should
/// not be used during runtime; the method currently generates code to return
/// 0 in that case.
///
/// offset = isNonEmpty && minCrd >= size ? minCrd - size + 1 : 0;
static Value offsetFromMinCoord(OpBuilder &builder, Location loc, Value minCrd,
                                Value size, Value isNonEmpty) {
  Value geSize = CMPI(uge, minCrd, size);
  Value pred = ANDI(isNonEmpty, geSize);
  // Computes minCrd - size + 1.
  Value mms = SUBI(ADDI(minCrd, C_IDX(1)), size);
  // This is the absolute offset related to the underlying tensor.
  return SELECT(pred, mms, C_IDX(0));
}

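// Worked example for offsetFromMinCoord above (illustrative, not from the
// original source): with size = 4, minCrd = 6, and isNonEmpty = true, the
// formula yields offset = 6 - 4 + 1 = 3, the smallest offset that still
// keeps minCrd inside a window of 4 elements, i.e., [3, 7).
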
/// Converts a coordinate relative to the underlying tensor to the coordinate
/// relative to the slice, and returns an extra remainder value.
// FIXME: that description says "tensorCrd -> sliceCrd"; but the function
// name suggests it should be "sliceCrd -> tensorCrd".
static std::pair<Value, Value> fromSliceCrd(OpBuilder &builder, Location loc,
                                            Value crd, Value offset,
                                            Value stride, Value tensor,
                                            Level lvl) {
  // sliceCrd = (tensorCrd - offset) / stride
  crd = SUBI(crd, offset);
  Value rem = REMUI(crd, stride);
  crd = DIVUI(crd, stride);
  return std::make_pair(crd, rem);
}

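// Worked example for fromSliceCrd above (illustrative, not from the original
// source): for a slice with offset = 1 and stride = 2, tensorCrd = 5 maps to
// sliceCrd = (5 - 1) / 2 = 2 with remainder 0, whereas tensorCrd = 6 yields
// remainder 1, i.e., the coordinate does not lie on the slice.
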
// Generates a bool value for the while-loop condition that tries to iterate
// over a fully reduced level with an affine index expression.
static Value genSparseReducedAffineCond(OpBuilder &builder, Location loc,
                                        const SparseTensorLevel &level,
                                        Value crdHi, Value posit, Value posHi) {
  Value inBound = CMPI(ult, posit, posHi);
  auto ifOp =
      builder.create<scf::IfOp>(loc, builder.getI1Type(), inBound, true);
  // if (inbound)
  //   yield coord < crdHi
  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
  Value crd = level.peekCrdAt(builder, loc, posit);
  YIELD(CMPI(ult, crd, crdHi));
  // else
  //   yield false
  builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
  YIELD(constantI1(builder, loc, false));

  builder.setInsertionPointAfter(ifOp);
  return ifOp.getResult(0);
}

// Helper functions that load/store into the position buffer for slice-driven
// loops.
// The sliced pointer buffer is organized as:
//     [[pLo0, pLo1, pLo2, ...],
//      [pHi0, pHi1, pHi2, ...],
//      [pNx0, pNx1, pNx2, ...]]
static Value allocSlicePosBuf(OpBuilder &builder, Location loc,
                              Value tupleCnt) {
  Value bufSz = MULI(tupleCnt, C_IDX(kSliceIterWidth));
  // Additional two metadata {memSize, idx} at head.
  return genAlloca(builder, loc, bufSz, builder.getIndexType());
}

// Gets and sets position values for slice-driven loops.
enum class SlicePosKind { kLo, kHi, kNext };
static Value getSlicePosIdx(OpBuilder &builder, Location loc, Value posBuf,
                            Value tupleIdx, SlicePosKind posKind) {
  Value dim = builder.create<memref::DimOp>(loc, posBuf, C_IDX(0));
  Value tupleCnt = DIVUI(dim, C_IDX(kSliceIterWidth));
  switch (posKind) {
  case SlicePosKind::kLo:
    return tupleIdx;
  case SlicePosKind::kHi:
    return ADDI(tupleIdx, tupleCnt);
  case SlicePosKind::kNext:
    return ADDI(tupleIdx, MULI(tupleCnt, C_IDX(2)));
  }
  llvm_unreachable("unexpected kind");
}
static Value loadSlicePos(OpBuilder &builder, Location loc, Value sPosBuf,
                          Value tupleIdx, SlicePosKind posKind) {
  return genIndexLoad(builder, loc, sPosBuf,
                      getSlicePosIdx(builder, loc, sPosBuf, tupleIdx, posKind));
}
static void updateSlicePos(OpBuilder &builder, Location loc, Value sPosBuf,
                           Value pos, Value tupleIdx, SlicePosKind posKind) {
  builder.create<memref::StoreOp>(
      loc, pos, sPosBuf,
      getSlicePosIdx(builder, loc, sPosBuf, tupleIdx, posKind));
}

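// Layout note for the helpers above (illustrative, not from the original
// source): for a buffer holding N tuples, getSlicePosIdx places the pLo of
// tuple i at index i, its pHi at index N + i, and its pNext at index
// 2 * N + i, matching the row-major view of the 3 x N organization sketched
// in the comment before allocSlicePosBuf.
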
std::pair<Value, Value>
LoopEmitter::genSliceLegitPredicate(OpBuilder &builder, Location loc, Value crd,
                                    TensorId tid, Level lvl) {
  assert(isSparseSlices[tid]);
  Value slice = tensors[tid];
  Value offset = sliceOffsets[tid][lvl];
  Value stride = sliceStrides[tid][lvl];
  auto enc = getSparseTensorEncoding(slice.getType());

  const auto [newCrd, crdRem] =
      fromSliceCrd(builder, loc, crd, offset, stride, slice, lvl);

  SmallVector<Value, 3> conds; // at most 3 conditions

  // First, coord >= offset (skip the check if offset is known to be 0).
  if (auto staticOffset = enc.getStaticLvlSliceOffset(lvl);
      !(staticOffset.has_value() && *staticOffset == 0)) {
    auto geOffset = CMPI(uge, crd, offset);
    conds.push_back(geOffset);
  }

  // Second, coord_in_slice < length.
  auto ltLength = CMPI(ult, newCrd, lvlSizes[tid][lvl]);
  conds.push_back(ltLength);

  // Third, rem == 0 (skip the check if stride is known to be 1).
  if (auto staticStride = enc.getStaticLvlSliceStride(lvl);
      !(staticStride.has_value() && *staticStride == 1)) {
    auto fitStride = CMPI(eq, crdRem, C_IDX(0));
    conds.push_back(fitStride);
  }

  // Must meet all conditions to be a valid coordinate in the slice.
  auto pred = conds.front();
  for (auto cond : ValueRange(conds).drop_front())
    pred = ANDI(pred, cond);

  return {newCrd, pred};
}

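// Worked example for genSliceLegitPredicate above (illustrative, not from the
// original source): for a slice with offset = 1, stride = 2, and length 3,
// tensor coordinate 5 passes all three checks (5 >= 1, (5 - 1) / 2 = 2 < 3,
// and (5 - 1) % 2 == 0), whereas coordinate 4 fails the stride check because
// (4 - 1) % 2 == 1.
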
//===----------------------------------------------------------------------===//
// Sparse tensor loop emitter class implementations
//===----------------------------------------------------------------------===//

Value LoopEmitter::genAddress(OpBuilder &builder, Location loc, TensorId tid,
                              Level lvl, Value crd) {
  Value pos = lvl == 0 ? C_IDX(0) : posits[tid][lvl - 1];
  Value mul = MULI(highs[tid][lvl], pos);
  if (isSparseSlices[tid])
    crd = toSliceCrd(builder, loc, crd, sliceOffsets[tid][lvl],
                     sliceStrides[tid][lvl], tensors[tid], lvl);
  Value add = ADDI(mul, crd);
  return add;
}

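// Worked example for genAddress above (illustrative, not from the original
// source): for a fully dense d0 x d1 tensor, the address of element (i, j)
// is i * d1 + j, i.e., the parent position scaled by the current level size
// plus the current (possibly slice-translated) coordinate.
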
Value LoopEmitter::genSegmentHigh(OpBuilder &builder, Location loc,
                                  TensorId tid, Level lvl, Value pLo,
                                  Value pHi) {
  SparseTensorLevel &stl = *lvls[tid][lvl];
  const Value sameCrd = stl.peekCrdAt(builder, loc, pLo);
  auto whileOp = builder.create<scf::WhileOp>(
      loc, builder.getIndexType(), pLo,
      /*beforeBuilder=*/
      [pHi, &stl, sameCrd](OpBuilder &builder, Location loc, ValueRange ivs) {
        const auto pos = ivs[0];
        Value inBound = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::ult, pos, pHi);
        auto ifInBound =
            builder.create<scf::IfOp>(loc, builder.getI1Type(), inBound, true);
        {
          OpBuilder::InsertionGuard guard(builder);
          // Load the next coordinates only when inbound (to avoid OOB
          // accesses).
          builder.setInsertionPointToStart(ifInBound.thenBlock());
          Value crd = stl.peekCrdAt(builder, loc, pos);
          Value isSameCrd = builder.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::eq, crd, sameCrd);
          YIELD(isSameCrd);
          // Else, the position is out of bound, yield false to terminate the
          // loop.
          builder.setInsertionPointToStart(ifInBound.elseBlock());
          YIELD(constantI1(builder, loc, false));
        }
        builder.create<scf::ConditionOp>(loc, ifInBound.getResults()[0], ivs);
      },
      /*afterBuilder=*/
      [](OpBuilder &builder, Location loc, ValueRange ivs) {
        // pos ++
        Value nextPos = ADDI(ivs[0], C_IDX(1));
        YIELD(nextPos);
      });
  // Return the segment high.
  return whileOp.getResult(0);
}

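// Sketch of the IR emitted by genSegmentHigh above (illustrative, not from
// the original source): the scf.while behaves like
//   pos = pLo;
//   while (pos < pHi && crd[pos] == crd[pLo])
//     pos++;
// and returns the first position past the run of duplicate coordinates.
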
Value LoopEmitter::genSparseCrd(OpBuilder &builder, Location loc, TensorId tid,
                                Level lvl) {
  const Value pos = posits[tid][lvl];
  const Value crd = lvls[tid][lvl]->peekCrdAt(builder, loc, pos);
  return crd;
}

LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput,
                         bool isSparseOut, unsigned numLoops,
                         DependentLvlGetter dimGetter) {
  initialize(tensors, loopTag, hasOutput, isSparseOut, numLoops, dimGetter);
}

void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
                             bool isSparseOut, unsigned numLoops,
                             DependentLvlGetter dimGetter) {
  // First initialize the top-level type of the fields.
  this->loopTag = loopTag;
  this->hasOutput = hasOutput;
  this->isSparseOut = isSparseOut;

  const unsigned numManifestTensors = ts.size();
  const unsigned synTensorId = numManifestTensors;
  const unsigned numTensors = numManifestTensors + 1;
  // tensors array (len == numManifestTensor).
  this->tensors.assign(ts.begin(), ts.end());
  // Arrays with len == numTensor.
  this->lvlTypes.assign(numTensors, std::vector<LevelType>());
  this->lvlSizes.assign(numTensors, std::vector<Value>());
  this->highs.assign(numTensors, std::vector<Value>());
  this->segHi.assign(numTensors, std::vector<Value>());
  this->posits.assign(numTensors, std::vector<Value>());
  this->coords.assign(numTensors, std::vector<Value>());
  this->valBuffer.assign(numTensors, nullptr);
  this->lvls.resize(numTensors);
  this->isSparseSlices.assign(numTensors, false);
  this->sliceOffsets.assign(numTensors, std::vector<Value>());
  this->sliceStrides.assign(numTensors, std::vector<Value>());

  // These zeros will be overwritten below, but we need to initialize
  // them to something since we'll need random-access assignment.
  this->loopStack.reserve(numLoops);
  this->loopSeqStack.reserve(numLoops);

  // Index-reduction related fields.
  this->dependentLvlMap.assign(
      numTensors, std::vector<std::vector<std::pair<TensorLevel, unsigned>>>());
  this->slicePosBuffer.assign(numTensors, std::vector<std::vector<Value>>());
  this->sliceTupleNxStartIdx.assign(numTensors, std::vector<Value>());
  this->sliceTupleFwdCnt.assign(numTensors, std::vector<Value>());
  this->trivialSlice.assign(numTensors, std::vector<bool>());
  this->sliceMeta.assign(
      numTensors, std::vector<std::vector<std::pair<Value, unsigned>>>());
  this->sliceStack.assign(numTensors, std::vector<SliceInfo>());
  this->levelReducedDep.assign(numTensors, std::vector<unsigned>());

  // Initialize nested types of `TensorId`-indexed fields.
  for (TensorId tid = 0; tid < numTensors; tid++) {
    Level lvlRank;
    if (tid == synTensorId) {
      // The synthetic tensor (conceptually) is an all-dense tensor with rank
      // equal to the total number of loops (each level can potentially be
      // mapped to one of the loops being generated).
      lvlRank = numLoops;
      lvlTypes[tid].assign(lvlRank, LevelType::Dense);
    } else {
      const Value t = tensors[tid];
      // A scalar or 0-dimension tensor.
      if (isZeroRankedTensorOrScalar(t.getType()))
        continue;

      auto rtp = getRankedTensorType(t);
      const SparseTensorType stt(rtp);
      lvlRank = stt.getLvlRank();

      if (stt.hasEncoding()) {
        const auto enc = stt.getEncoding();
        isSparseSlices[tid] = enc.isSlice();
        for (auto lvlTp : enc.getLvlTypes())
          lvlTypes[tid].push_back(lvlTp);
      } else {
        lvlTypes[tid].assign(lvlRank, LevelType::Dense);
      }
    }

    // Initialize using empty value.
    lvlSizes[tid].assign(lvlRank, Value());
    highs[tid].assign(lvlRank, Value());
    segHi[tid].assign(lvlRank, Value());
    posits[tid].assign(lvlRank, Value());
    coords[tid].assign(lvlRank, Value());
    lvls[tid].resize(lvlRank);

    sliceOffsets[tid].assign(lvlRank, Value());
    sliceStrides[tid].assign(lvlRank, Value());

    // Slice-driven loops related initialization.
    levelReducedDep[tid].assign(lvlRank, 0);
    dependentLvlMap[tid].assign(
        lvlRank, std::vector<std::pair<TensorLevel, unsigned>>());
    slicePosBuffer[tid].assign(lvlRank, std::vector<Value>());
    sliceTupleNxStartIdx[tid].assign(lvlRank, Value());
    sliceTupleFwdCnt[tid].assign(lvlRank, Value());
    trivialSlice[tid].assign(lvlRank, false);
    sliceMeta[tid].assign(lvlRank, std::vector<std::pair<Value, unsigned>>());
    sliceStack[tid].emplace_back(/*minCrd=*/Value(),
                                 /*offset=*/Value(), /*isNonEmpty=*/Value(),
                                 /*posTupleNum=*/Value(), std::nullopt, 0);
    if (dimGetter && !isSynTensor(tid)) {
      for (Level l = 0; l < lvlRank; l++) {
        std::vector<std::pair<LoopId, unsigned>> deps = dimGetter(tid, l);
        // Sort the loops by order.
        std::sort(deps.begin(), deps.end(),
                  [](auto &lhs, auto &rhs) { return lhs.first < rhs.first; });

        dependentLvlMap[tid][l] = std::move(deps);
        unsigned depends = dependentLvlMap[tid][l].size();
        if (depends == 0)
          continue;
        sliceMeta[tid][l].reserve(depends);
        // We need `depends - 1` slices to fully reduce the affine expression.
        slicePosBuffer[tid][l].reserve(depends - 1);
      }
    }
  }
}

void LoopEmitter::initializeLoopEmit(
    OpBuilder &builder, Location loc, LoopEmitter::OutputUpdater updater,
    LoopEmitter::SynTensorBoundSetter synSetter) {

  // For every synthetic tensor, set the high bound by calling the callback.
  if (synSetter)
    for (unsigned i = 0, e = highs[getSynTensorId()].size(); i < e; i++)
      highs[getSynTensorId()][i] = synSetter(builder, loc, i);

  // For every manifest tensor:
  // * get the values buffer.
  // * For every level:
  //   * get the positions and coordinates buffers
  //   * get/compute the level-size, which is also used as the upper-bound
  //     on positions.
  for (TensorId t = 0, numTensors = getNumManifestTensors(); t < numTensors;
       t++) {
    const Value tensor = tensors[t];
    const auto rtp = dyn_cast<RankedTensorType>(tensor.getType());
    if (!rtp)
      // Skips only scalars; zero-ranked tensors still need to be bufferized
      // and (probably) filled with zeros by users.
      continue;
    // FIXME: the definition of `lvlRank` looks more like a dim-rank;
    // but the variable is used as a level everywhere below, which
    // suggests there may be some dim/lvl confusion going on here.
    auto stt = getSparseTensorType(tensor);
    const Level lvlRank = stt.getLvlRank();
    const auto shape = rtp.getShape();

    SmallVector<Value> lvlSzs;
    for (Level l = 0; l < stt.getLvlRank(); l++) {
      if (stt.hasEncoding())
        lvlSzs.push_back(builder.create<LvlOp>(loc, tensor, l));
      else
        lvlSzs.push_back(builder.create<tensor::DimOp>(loc, tensor, l));
    }

    // Scan all levels of current tensor.
    for (Level l = 0; l < lvlRank; l++) {
      lvls[t][l] = makeSparseTensorLevel(builder, loc, tensor, l);

      // Find upper bound in current dimension.
      highs[t][l] = lvlSizes[t][l] = lvlSzs[l];
      if (isSparseSlices[t]) {
        sliceOffsets[t][l] = genSliceOffset(builder, loc, tensors[t], l);
        sliceStrides[t][l] = genSliceStride(builder, loc, tensors[t], l);
      }
    }

    // Perform the required bufferization. Dense inputs materialize from the
    // input tensors. Sparse inputs use sparse primitives to obtain the
    // values.
    // Delegates extra output initialization to clients.
    bool isOutput = isOutputTensor(t);
    Type elementType = stt.getElementType();
    if (!stt.hasEncoding()) {
      // Non-annotated dense tensors.
      BaseMemRefType denseTp = MemRefType::get(shape, elementType);

      // TODO: if we unconditionally use a fully dynamic layout here, it breaks
      // some vectorization passes which require a static stride = 1.
      // Is it possible to call the vectorization pass after bufferization?
      if (llvm::isa_and_nonnull<tensor::ExtractSliceOp>(tensor.getDefiningOp()))
        denseTp = bufferization::getMemRefTypeWithFullyDynamicLayout(rtp);

      Value denseVal =
          builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
      // Dense outputs need special handling.
      if (isOutput && updater)
        denseVal = updater(builder, loc, denseVal, tensor);

      valBuffer[t] = denseVal;
    } else {
      // Annotated sparse tensors.
      // We also need the value buffer for all-dense annotated "sparse"
      // tensors.
      valBuffer[t] = genToValues(builder, loc, tensor);
    }
    // NOTE: we can also prepare for 0 lvl here in advance, this will hoist
    // some loop preparation from tensor iteration, but will also (undesirably)
    // hoist the code outside if-conditions.
  }

  initSliceDriven(builder, loc);
}

void LoopEmitter::initSliceDriven(OpBuilder &builder, Location loc) {
  Value c0 = C_IDX(0);
  for (TensorId t = 0, e = tensors.size(); t < e; t++) {
    auto rtp = dyn_cast<RankedTensorType>(tensors[t].getType());
    if (!rtp)
      continue;

    Level lvlRank = SparseTensorType(rtp).getLvlRank();

    // Compute the dependency reduction order.
    auto remDepStack = dependentLvlMap;
    std::vector<std::tuple<LoopId, TensorId, Level>> depRedOrder;
    for (Level lvl = 0; lvl < lvlRank; lvl++) {
      // Reverse queue into a stack.
      std::reverse(remDepStack[t][lvl].begin(), remDepStack[t][lvl].end());
      for (auto [loop, coeff] : dependentLvlMap[t][lvl])
        depRedOrder.emplace_back(std::make_tuple(loop, t, lvl));
    }

    if (depRedOrder.empty())
      continue;
    std::sort(depRedOrder.begin(), depRedOrder.end(),
              [](auto &l, auto &r) { return std::get<0>(l) < std::get<0>(r); });

    for (auto [loop, t, lvl] : depRedOrder) {
      std::pair<LoopId, unsigned> curDep = remDepStack[t][lvl].back();
      assert(curDep.first == loop);
      Value size = c0;
      for (auto [loop, stride] : remDepStack[t][lvl]) {
        // The synthetic tensor high defines the loop upper bound.
        Value loopHi = highs[getSynTensorId()][loop];
        size = ADDI(size, MULI(loopHi, C_IDX(stride)));
      }
      sliceMeta[t][lvl].emplace_back(size, curDep.second);
      remDepStack[t][lvl].pop_back();

      // Generate the caches required to quickly compute the next non-empty
      // slice with increasing offset for the slice-based loop.
      // We do not need a cache for dense levels.
      if (!remDepStack[t][lvl].empty() && !isDenseLT(lvls[t][lvl]->getLT())) {
        Value cnt = C_IDX(1);
        for (int preLvl = lvl - 1; preLvl >= 0; preLvl--) {
          if (remDepStack[t][preLvl].empty())
            break;
          assert(remDepStack[t][preLvl].size() == 1 && "Not implemented");
          auto [loop, stride] = remDepStack[t][preLvl].back();
          assert(stride == 1 && "Not yet implemented");
          // Accumulate the size required to cache the pLo for the slice.
          // E.g., if we want to cache the pIdx for slice<d0xd1xf64> on the
          // second level, we at most need a memref<d0xindex>.
          //
          // NOTE: this is apparently an over-approximation when the previous
          // level is compressed, and we can compute a precise memory size
          // inside the loops. But that would also require us to allocate/free
          // memory in loops.
          cnt = MULI(highs[getSynTensorId()][loop], cnt);
        }
        slicePosBuffer[t][lvl].push_back(allocSlicePosBuf(builder, loc, cnt));
      } // else fully resolved.
    }
  }
}

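// Illustrative note for initSliceDriven above (not from the original source):
// if a level's index expression depends on loops i and j with strides 1 and
// 2, the size loop above accumulates size = hi_i * 1 + hi_j * 2, a
// conservative bound on the slice extent derived from the synthetic-tensor
// loop bounds hi_i and hi_j.
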
void LoopEmitter::categorizeLoopCondition(
    ArrayRef<TensorLevel> tidLvls, SmallVectorImpl<TensorLvlCond> &dnConds,
    SmallVectorImpl<TensorLvlCond> &spConds) {
  // Finds out the tensor levels that we should use to generate loops. Among
  // all the tensor levels, there is at most one sparse tensor level.
  for (auto [t, l] : unpackTensorLevelRange(tidLvls)) {
    assert(lvlTypes[t].size() > l); // Must be a valid tid, dim pair
    auto lvlType = lvlTypes[t][l];
    // Must be a recognizable LT.
    assert(isDenseLT(lvlType) || isCompressedLT(lvlType) ||
           isLooseCompressedLT(lvlType) || isSingletonLT(lvlType) ||
           is2OutOf4LT(lvlType));

    bool isSparse = !isDenseLT(lvlType);
    bool isSlice = isSparseSlices[t];
    bool isAffine = !dependentLvlMap[t][l].empty();
    bool isUnRedu = false;
    // TODO: Support affine index expressions on sparse tensor slices.
    assert(!isSlice || !isAffine);

    // Whether the affine index expression has been fully reduced or not.
    if (!dependentLvlMap[t][l].empty())
      isUnRedu = !depFullyReduced(t, l);

    auto &dstVec = isSparse ? spConds : dnConds;
    dstVec.emplace_back(
        makeTensorLevel(t, l),
        makeLoopCondKind(isSparse, isSlice, isAffine, isUnRedu));
  }

  std::stable_sort(spConds.begin(), spConds.end(), [](auto lhs, auto rhs) {
    // AffineUnRed > Affine > Slice > Trivial
    return static_cast<uint8_t>(lhs.second) > static_cast<uint8_t>(rhs.second);
  });
}

void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
                                  ArrayRef<TensorLevel> tidLvls) {
  // TODO: sort
  assert(loopSeqStack.size() == loopStack.size());
  // Prepares for all the tensors used in the current loop sequence.
  std::vector<std::tuple<TensorId, Level, bool>> slicedTids;

  for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
    if (!dependentLvlMap[tid][lvl].empty()) {
      bool fullyRed = genSliceBegin(builder, loc, tid, lvl);
      slicedTids.emplace_back(tid, lvl, fullyRed);
    } else if (!isSynTensor(tid)) {
      prepareLoopOverTensorAtLvl(builder, loc, tid, lvl);
    }
  }

  // The universal index starts from 0.
  loopSeqStack.emplace_back(C_IDX(0), std::move(slicedTids));
}

void LoopEmitter::exitCurrentLoopSeq(OpBuilder &builder, Location loc) {
  assert(loopSeqStack.size() == loopStack.size() + 1);

  const auto &slicedTids = loopSeqStack.back().second;

  // Depending on whether the slice is resolved or not at the current loop
  // sequence, end them in different ways.
  for (auto [tid, lvl, res] : slicedTids) {
    if (!res) {
      // If this is an unresolved-slice-driven loop, pop out the slice.
      assert(sliceStack[tid].back().slicedOnLvl == lvl);
      sliceStack[tid].pop_back();
    }
  }
  loopSeqStack.pop_back();
}

Value LoopEmitter::genAffine(OpBuilder &builder, Location loc, AffineExpr a) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    // FIXME: since the one callsite in Sparsification passes in a
    // level-expression, the `getPosition` must in fact be a `Dimension`.
    // However, elsewhere we have been led to expect that `loopIdToOrd`
    // should be indexed by `LoopId`...
    const auto loopId = cast<AffineDimExpr>(a).getPosition();
    return loopStack[loopId].iv;
  }
  case AffineExprKind::Add: {
    auto binOp = cast<AffineBinaryOpExpr>(a);
    return ADDI(genAffine(builder, loc, binOp.getLHS()),
                genAffine(builder, loc, binOp.getRHS()));
  }
  case AffineExprKind::Mul: {
    auto binOp = cast<AffineBinaryOpExpr>(a);
    return MULI(genAffine(builder, loc, binOp.getLHS()),
                genAffine(builder, loc, binOp.getRHS()));
  }
  case AffineExprKind::Constant: {
    int64_t c = cast<AffineConstantExpr>(a).getValue();
    return C_IDX(c);
  }
  default:
    llvm_unreachable("unexpected affine subscript");
  }
}

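// Worked example for genAffine above (illustrative, not from the original
// source): the affine expression d0 * 2 + d1 lowers recursively to
//   ADDI(MULI(loopStack[0].iv, C_IDX(2)), loopStack[1].iv)
// i.e., plain arithmetic on the induction variables of the enclosing loops.
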
std::pair<Operation *, Value> LoopEmitter::emitForLoopOverTensorAtLvl(
|
|
OpBuilder &builder, Location loc, TensorId tid, Level lvl, Value lo,
|
|
Value hi, MutableArrayRef<Value> reduc, bool isParallel) {
|
|
bool isSparseCond = isCompressedLT(lvlTypes[tid][lvl]) ||
|
|
isLooseCompressedLT(lvlTypes[tid][lvl]) ||
|
|
is2OutOf4LT(lvlTypes[tid][lvl]) ||
|
|
isSingletonLT(lvlTypes[tid][lvl]);
|
|
// TODO: support dynamic slices.
|
|
// Uses the first dimension here to build the loop bound (which is also the
|
|
// biggest range).
|
|
Value step = C_IDX(1);
|
|
Operation *loop = nullptr;
|
|
Value iv;
|
|
if (isParallel) {
|
|
scf::ParallelOp parOp =
|
|
builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
|
|
builder.setInsertionPointToStart(parOp.getBody());
|
|
assert(parOp.getNumReductions() == reduc.size());
|
|
iv = parOp.getInductionVars()[0];
|
|
|
|
// In-place update on the reduction variable vector.
|
|
// Note that the init vals are not the actual reduction variables but are
// instead used as a "special handle" to (temporarily) represent them. The
// expression on the init vals will be moved into scf.reduce and replaced with
// the block arguments when exiting the loop (see exitForLoop). This is
// needed as we cannot build the actual reduction block and get the actual
// reduction variable before users fill the parallel loop body.
|
|
for (int i = 0, e = reduc.size(); i < e; i++)
|
|
reduc[i] = parOp.getInitVals()[i];
|
|
loop = parOp;
|
|
} else {
|
|
scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
|
|
builder.setInsertionPointToStart(forOp.getBody());
|
|
iv = forOp.getInductionVar();
|
|
|
|
// In-place update on the reduction variable vector.
|
|
assert(forOp.getNumRegionIterArgs() == reduc.size());
|
|
for (int i = 0, e = reduc.size(); i < e; i++)
|
|
reduc[i] = forOp.getRegionIterArg(i);
|
|
loop = forOp;
|
|
}
|
|
assert(loop && iv);
|
|
|
|
Value crd;
|
|
if (isSparseCond) {
|
|
// For COO, the position is the same across consecutive levels.
|
|
/// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
|
|
posits[tid][lvl] = iv;
|
|
crd = genSparseCrd(builder, loc, tid, lvl);
|
|
} else {
|
|
// Dense tensor, the coordinate is the induction variable.
|
|
crd = iv;
|
|
}
|
|
|
|
if (isSparseSlices[tid] && isSparseCond) {
|
|
// For sparse level slices, we need to filter out invalid coordinates that
|
|
// are not included in the slice.
|
|
SmallVector<Type> types;
|
|
for (Value red : reduc)
|
|
types.push_back(red.getType());
|
|
|
|
auto [trans, pred] = genSliceLegitPredicate(builder, loc, crd, tid, lvl);
|
|
bool hasReduc = !types.empty();
|
|
scf::IfOp ifOp = builder.create<scf::IfOp>(loc, types, pred,
|
|
/*else*/ hasReduc);
|
|
if (hasReduc) {
|
|
// scf.for (a) -> v
|
|
// %s = scf.if (a) -> v
|
|
// user-generated code.
|
|
// else
|
|
// yield a
|
|
// yield %s
|
|
YIELD(ifOp.getResults());
|
|
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
|
|
// On mismatch.
|
|
YIELD(reduc);
|
|
}
|
|
// Set the insertion point to matched branch.
|
|
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
|
|
crd = trans;
|
|
}
|
|
|
|
assert(crd);
|
|
coords[tid][lvl] = crd;
|
|
return {loop, crd};
|
|
}
|
|
|
|
Value LoopEmitter::genWhileLoopConditions(OpBuilder &builder, Location loc,
|
|
ValueRange ivs, TensorLvlCond cond) {
|
|
auto [tid, lvl] = unpackTensorLevel(cond.first);
|
|
|
|
switch (cond.second) {
|
|
case LoopCondKind::SparseCond: {
|
|
assert(ivs.size() == 1);
|
|
// We use the first level bound as the bound for the collapsed set of levels.
|
|
return CMPI(ult, ivs.back(), highs[tid][lvl]);
|
|
}
|
|
case LoopCondKind::SparseSliceCond: {
|
|
assert(ivs.size() == 1);
|
|
return CMPI(ult, ivs.back(), highs[tid][lvl]);
|
|
}
|
|
case LoopCondKind::SparseAffineCond: {
|
|
assert(ivs.size() == 1);
|
|
|
|
Value crdHi; // loop upper bound
|
|
{
|
|
OpBuilder::InsertionGuard guard(builder);
|
|
Operation *loop = builder.getInsertionBlock()->getParentOp();
|
|
// crdHi is a loop invariant, hoist the computation outside the loop.
|
|
if (llvm::isa_and_nonnull<scf::WhileOp>(loop))
|
|
builder.setInsertionPoint(loop);
|
|
auto [remSz, stride] = sliceMeta[tid][lvl].back();
|
|
assert(stride == 1 && "Not yet implemented");
|
|
crdHi = ADDI(getMostRecentSliceOnLvl(tid, lvl).offset, remSz);
|
|
}
|
|
assert(crdHi);
|
|
return genSparseReducedAffineCond(builder, loc, *lvls[tid][lvl], crdHi,
|
|
ivs[0], highs[tid][lvl]);
|
|
}
|
|
case LoopCondKind::SparseAffineUnRedCond: {
|
|
assert(ivs.size() == 3);
|
|
return ivs.front(); // isNonEmpty
|
|
}
|
|
default:
|
|
llvm_unreachable("Unhandled LoopCondKind");
|
|
}
|
|
llvm_unreachable("Unhandled LoopCondKind");
|
|
}
|
|
|
|
std::optional<Value> LoopEmitter::genWhileLoopBody(OpBuilder &builder,
|
|
Location loc, ValueRange ivs,
|
|
TensorLvlCond cond) {
|
|
auto [tid, lvl] = unpackTensorLevel(cond.first);
|
|
|
|
switch (cond.second) {
|
|
case LoopCondKind::SparseCond: {
|
|
// Updates position. For collapsed COO, the position is the same across
|
|
// consecutive levels.
|
|
posits[tid][lvl] = ivs.back();
|
|
|
|
// Update coordinates.
|
|
coords[tid][lvl] = genSparseCrd(builder, loc, tid, lvl);
|
|
return std::nullopt;
|
|
}
|
|
case LoopCondKind::SparseSliceCond: {
|
|
assert(ivs.size() == 1);
|
|
posits[tid][lvl] = ivs.front();
|
|
Value sCrd = genSparseCrd(builder, loc, tid, lvl);
|
|
// Converts the coordinate loaded from the actual sparse tensor to the
|
|
// coordinates in the sparse slice.
|
|
auto [dCrd, pred] = genSliceLegitPredicate(builder, loc, sCrd, tid, lvl);
|
|
coords[tid][lvl] = dCrd;
|
|
return pred;
|
|
}
|
|
case LoopCondKind::SparseAffineCond: {
|
|
assert(ivs.size() == 1);
|
|
// Coord is the relative offset with respect to its parent.
|
|
assert(sliceStack[tid].back().depth == 1 && "TODO: not yet implement");
|
|
sliceTupleFwdCnt[tid][lvl] = SUBI(ivs[0], posits[tid][lvl]);
|
|
// Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1]
|
|
Value posit = ivs[0];
|
|
// We need to subtract the offset to get relative coordinates.
|
|
// TODO: Maybe assert relC >=0 during runtime in debug build?
|
|
Value absC = lvls[tid][lvl]->peekCrdAt(builder, loc, posit);
|
|
auto relC = SUBI(absC, getFinalSliceOnLvl(tid, lvl).offset);
|
|
posits[tid][lvl] = posit;
|
|
coords[tid][lvl] = relC;
|
|
return std::nullopt;
|
|
}
|
|
case LoopCondKind::SparseAffineUnRedCond: {
|
|
unsigned depth = sliceStack[tid].back().depth;
|
|
unsigned curStride = sliceMeta[tid][lvl][depth - 1].second;
|
|
assert(ivs.size() == 3);
|
|
|
|
// Updates the current slice info
|
|
SliceInfo &sliceInfo = sliceStack[tid].back();
|
|
sliceInfo.isNonEmpty = ivs[0];
|
|
sliceInfo.minCrd = ivs[1];
|
|
sliceInfo.offset = ivs[2];
|
|
|
|
// Crd (the value we use to coiterate) is the relative offset with respect to
// its parent; we can use the absolute offset here because when depth == 1,
// absOffset[lvl][depth - 1] always equals zero.
// TODO: Update crd = absOffset[lvl][depth] - absOffset[lvl][depth - 1]
|
|
assert(depth == 1 && "TODO: not yet implement");
|
|
Value crd = sliceInfo.offset;
|
|
|
|
Value onStride = constantI1(builder, loc, true);
|
|
if (curStride != 1) {
|
|
Value strideVal = C_IDX(curStride);
|
|
Value rem = REMUI(crd, strideVal);
|
|
crd = DIVUI(crd, strideVal);
|
|
onStride = CMPI(eq, rem, C_IDX(0));
|
|
}
|
|
coords[tid][lvl] = crd;
|
|
// No extra check is needed before accessing the tensor level.
|
|
return onStride;
|
|
}
|
|
default:
|
|
llvm_unreachable("Unhandled LoopCondKind");
|
|
}
|
|
llvm_unreachable("Unhandled LoopCondKind");
|
|
}
|
|
|
|
ValueRange LoopEmitter::genCheckedValue(OpBuilder &builder, Location loc,
|
|
Value pred, ValueRange curArgs,
|
|
TensorLvlCond cond) {
|
|
assert(isSparseCond(cond.second));
|
|
auto [tid, lvl] = unpackTensorLevel(cond.first);
|
|
if (isAffineIdxUnRedCond(cond.second)) {
|
|
unsigned depth = sliceStack[tid].back().depth;
|
|
unsigned curStride = sliceMeta[tid][lvl][depth - 1].second;
|
|
if (curStride == 1)
|
|
return curArgs;
|
|
// Build
|
|
// if (onStride) {
|
|
// yield curSlice
|
|
// } else {
|
|
// yield nxSlice.
|
|
//}
|
|
assert(curArgs.size() == 3);
|
|
auto ifOp = builder.create<scf::IfOp>(loc, curArgs.getTypes(), pred, true);
|
|
{
|
|
OpBuilder::InsertionGuard guard(builder);
|
|
// If all slices are legit (the predicate holds), yield the current arguments
// unchanged.
|
|
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
|
|
|
|
YIELD(curArgs);
|
|
// If not all slices are legit, yield the updated value.
|
|
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
|
|
auto [nonEmpty, minCrd, offset] =
|
|
genSliceNextInduction(builder, loc, tid, lvl);
|
|
SmallVector<Value> nxSlice{nonEmpty, minCrd, offset};
|
|
YIELD(nxSlice);
|
|
}
|
|
// If all slices are legit, start the user generated code.
|
|
return ifOp.getResults();
|
|
} else {
|
|
// Currently only the sparse slice condition needs an extra check.
|
|
assert(isSliceCond(cond.second) && isSparseCond(cond.second));
|
|
assert(curArgs.size() == 1);
|
|
Value nextPos = ADDI(curArgs.front(), C_IDX(1));
|
|
return SELECT(pred, curArgs.front(), nextPos)->getResults();
|
|
}
|
|
llvm_unreachable("unhandled case");
|
|
}
|
|
|
|
std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls(
|
|
OpBuilder &builder, Location loc, ArrayRef<TensorLvlCond> spConds,
|
|
MutableArrayRef<Value> reduc, bool needsUniv) {
|
|
// NOTE: the slice driven tensor-related reduction variable must
|
|
// appear before normal tensors.
|
|
assert(!spConds.empty());
|
|
|
|
// The set of induction variables for the while loop.
|
|
SmallVector<Value> ivs;
|
|
// Segment sizes for induction variables used for different kinds of loop
|
|
// conditions.
|
|
SmallVector<unsigned> opSegSize;
|
|
|
|
// Construct the while-loop with a parameter for each coordinate.
|
|
for (auto [tl, cKind] : spConds) {
|
|
auto [tid, lvl] = unpackTensorLevel(tl);
|
|
const auto lvlTp = lvlTypes[tid][lvl];
|
|
// Dense levels are handled by the shared universal index.
|
|
assert(!isDenseCond(cKind));
|
|
// Must be a recognizable sparse level.
|
|
assert(isCompressedLT(lvlTp) || isLooseCompressedLT(lvlTp) ||
|
|
isSingletonLT(lvlTp));
|
|
(void)lvlTp;
|
|
|
|
unsigned prevSz = ivs.size();
|
|
if (isAffineIdxCond(cKind)) {
|
|
// TODO: Support view-based reshape on sparse levels with affine index
|
|
// expressions.
|
|
if (isAffineIdxUnRedCond(cKind)) {
|
|
SliceInfo &sliceInfo = sliceStack[tid].back();
|
|
// The order matters!
|
|
ivs.push_back(sliceInfo.isNonEmpty);
|
|
ivs.push_back(sliceInfo.minCrd);
|
|
ivs.push_back(sliceInfo.offset);
|
|
} else {
|
|
ivs.push_back(posits[tid][lvl]); // loop lower bound (pos low).
|
|
}
|
|
// We reduced one more dependency after entering the loop.
|
|
levelReducedDep[tid][lvl]++;
|
|
} else {
|
|
assert(dependentLvlMap[tid][lvl].empty());
|
|
const Value pos = posits[tid][lvl];
|
|
ivs.push_back(pos);
|
|
}
|
|
opSegSize.push_back(ivs.size() - prevSz);
|
|
}
|
|
|
|
// The position where user-supplied reduction variable starts.
|
|
ivs.append(reduc.begin(), reduc.end());
|
|
// Update universal index.
|
|
if (needsUniv)
|
|
ivs.push_back(loopSeqStack.back().first);
|
|
|
|
// Ensures all operands are valid.
|
|
assert(llvm::all_of(ivs, [](Value v) { return v != nullptr; }));
|
|
TypeRange types = ValueRange(ivs).getTypes();
|
|
auto whileOp = builder.create<scf::WhileOp>(loc, types, ivs);
|
|
|
|
SmallVector<Location> locs(types.size(), loc);
|
|
Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
|
|
Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);
|
|
|
|
// Generates loop conditions.
|
|
builder.setInsertionPointToStart(before);
|
|
ValueRange bArgs = before->getArguments();
|
|
Value whileCond = nullptr; // bool values for loop condition.
|
|
for (auto [c, segSz] : llvm::zip_equal(spConds, opSegSize)) {
|
|
Value cv = genWhileLoopConditions(builder, loc, bArgs.take_front(segSz), c);
|
|
bArgs = bArgs.drop_front(segSz);
|
|
whileCond = !whileCond ? cv : ANDI(whileCond, cv);
|
|
}
|
|
// The remaining block arguments are user-provided reduction values and an
|
|
// optional universal index. Make sure their sizes match.
|
|
assert(bArgs.size() == reduc.size() + (needsUniv ? 1 : 0));
|
|
builder.create<scf::ConditionOp>(loc, whileCond, before->getArguments());
|
|
|
|
// Generates loop body.
|
|
builder.setInsertionPointToStart(after);
|
|
ValueRange aArgs = after->getArguments();
|
|
// Since some LoopCondKind might need extra checks to filter out invalid
// iterations, we maintain another array to hold the iteration arguments to
// yield if the checks fail.
|
|
SmallVector<Value> nextArgs(aArgs.begin(), aArgs.end());
|
|
// A mutable alias for convenient slicing.
|
|
MutableArrayRef<Value> nextArgsRef = nextArgs;
|
|
Value extraPred = nullptr;
|
|
for (auto [c, segSz] : llvm::zip_equal(spConds, opSegSize)) {
|
|
ValueRange condArgs = aArgs.take_front(segSz);
|
|
auto pred = genWhileLoopBody(builder, loc, condArgs, c);
|
|
assert(pred.has_value() == isCondWithExtraCheck(c.second));
|
|
if (pred.has_value()) {
|
|
// We need all extra checks to pass.
|
|
extraPred = extraPred == nullptr ? *pred : ANDI(*pred, extraPred);
|
|
ValueRange nxArgs = genCheckedValue(builder, loc, *pred, condArgs, c);
|
|
assert(nxArgs.size() == segSz);
|
|
// Update the value for cases when some check fails.
|
|
for (unsigned i = 0; i < segSz; i++) {
|
|
nextArgsRef[i] = nxArgs[i];
|
|
}
|
|
}
|
|
aArgs = aArgs.drop_front(segSz);
|
|
nextArgsRef = nextArgsRef.drop_front(segSz);
|
|
}
|
|
|
|
if (extraPred) {
|
|
auto ifOp = builder.create<scf::IfOp>(loc, types, extraPred, /*else*/ true);
|
|
// Marks this special IfOp so that Sparsification does not finalize it.
|
|
ifOp->setAttr(getLoopEmitterLoopAttrName(),
|
|
StringAttr::get(builder.getContext(), "slice"));
|
|
// Links the SSA chain outside the if statement.
|
|
YIELD(ifOp->getResults());
|
|
|
|
// If not all slices are legit, yield the updated value.
|
|
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
|
|
YIELD(nextArgs);
|
|
|
|
// If all slices are legit, start the user generated code.
|
|
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
|
|
}
|
|
|
|
for (auto [tid, lvl] : unpackTensorLevelFromCondRange(spConds)) {
|
|
// Generates segment high for non-unique level.
|
|
if (!isUniqueLT(lvlTypes[tid][lvl])) {
|
|
segHi[tid][lvl] = genSegmentHigh(builder, loc, tid, lvl, posits[tid][lvl],
|
|
highs[tid][lvl]);
|
|
}
|
|
}
|
|
|
|
// In-place update on reduction variable.
|
|
assert(aArgs.size() == reduc.size() + (needsUniv ? 1 : 0));
|
|
for (unsigned i = 0, e = reduc.size(); i < e; i++)
|
|
reduc[i] = aArgs[i];
|
|
|
|
Value min;
|
|
// Finds the minimum coordinate
|
|
if (!needsUniv) {
|
|
for (auto [tid, lvl] : unpackTensorLevelFromCondRange(spConds)) {
|
|
const auto lvlTp = lvlTypes[tid][lvl];
|
|
if (isCompressedLT(lvlTp) || isSingletonLT(lvlTp) ||
|
|
isLooseCompressedLT(lvlTp)) {
|
|
const auto crd = coords[tid][lvl];
|
|
if (min) {
|
|
Value cmp = CMPI(ult, coords[tid][lvl], min);
|
|
min = SELECT(cmp, coords[tid][lvl], min);
|
|
} else {
|
|
min = crd;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
assert(!min);
|
|
// Otherwise, universal index is the minimal pos.
|
|
min = whileOp.getAfterArguments().back();
|
|
}
|
|
|
|
return {whileOp, min};
|
|
}
|
|
|
|
bool LoopEmitter::shouldIteratedByForLoop(ArrayRef<TensorLvlCond> sparseConds,
|
|
bool genDedup) {
|
|
assert(llvm::all_of(sparseConds,
|
|
[](TensorLvlCond c) { return isSparseCond(c.second); }));
|
|
|
|
// If we need to co-iterate over two sparse tensors, we need a while loop.
|
|
if (sparseConds.size() > 1)
|
|
return false;
|
|
|
|
// We also need a while loop for levels with affine index expression and
|
|
// non-unique levels when deduplication is required.
|
|
if (sparseConds.size() == 1) {
|
|
auto [tid, lvl] = unpackTensorLevel(sparseConds.back().first);
|
|
return !isAffineIdxCond(sparseConds.back().second) &&
|
|
!(genDedup && !isUniqueLT(lvlTypes[tid][lvl]));
|
|
}
|
|
|
|
return true;
|
|
}
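// Illustrative note for shouldIteratedByForLoop above (not from the original
// source): co-iterating two sparse levels (e.g., two CSR rows) always takes
// the while-loop path, whereas a single sparse level with a trivial index and
// no deduplication requirement can be traversed with a simple scf.for.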
|
|
|
|
Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls(
|
|
OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls,
|
|
MutableArrayRef<Value> reduc, bool tryParallel, bool genDedup,
|
|
bool needsUniv) {
|
|
#ifndef NDEBUG
|
|
// Sanity checks.
|
|
assert(!tidLvls.empty());
|
|
for (auto [t, l] : unpackTensorLevelRange(tidLvls)) {
|
|
assert(!coords[t][l] || // We cannot re-enter the same level
|
|
!dependentLvlMap[t][l].empty()); // unless it is a slice-driver loop
|
|
}
|
|
#endif
|
|
// TODO: support multiple return on parallel for?
|
|
tryParallel = tryParallel && reduc.size() <= 1;
|
|
|
|
SmallVector<TensorLvlCond> spConds;
|
|
SmallVector<TensorLvlCond> dnConds;
|
|
categorizeLoopCondition(tidLvls, dnConds, spConds);
|
|
|
|
// Only when there is at least one sparse condition do we really need the
// universal index.
// TODO: Maybe we should instead require the merger to pass in a valid value
// in the first place instead of adjusting it in LoopEmitter?
|
|
needsUniv = !spConds.empty() && needsUniv;
|
|
// The TensorLevel used for loop conditions.
|
|
// If there is any sparse level, we need to use the sparse condition.
|
|
// If all levels are dense, we can pick an arbitrary one (a dense slice-driven loop
|
|
// can be generated using a simple ForOp as well).
|
|
Operation *l = nullptr;
|
|
Value iv = nullptr;
|
|
SmallVector<SliceLoopInfo> sliceDrivenInfo;
|
|
SmallVector<TensorLevel> trivialLvls;
|
|
|
|
// Generates loops differently depending on whether we need a slice-driven
|
|
// loop or a simple level traversal loop.
|
|
if (shouldIteratedByForLoop(spConds, genDedup) && !needsUniv) {
|
|
assert(spConds.size() <= 1);
|
|
TensorLvlCond tlCond = spConds.empty() ? dnConds.front() : spConds.front();
|
|
auto loopCondKind = tlCond.second;
|
|
auto [tid, lvl] = unpackTensorLevel(tlCond.first);
|
|
Value lo = isSparseCond(loopCondKind)
|
|
? posits[tid][lvl] // current offset
|
|
: loopSeqStack.back().first; // universal index
|
|
Value hi = highs[tid][lvl];
|
|
if (isDenseCond(loopCondKind) && isAffineIdxCond(loopCondKind)) {
|
|
bool unReduc = isAffineIdxUnRedCond(loopCondKind);
|
|
assert(unReduc == !depFullyReduced(tid, lvl));
|
|
unsigned depth = sliceStack[tid].back().depth;
|
|
assert(depth >= 1);
|
|
// The *next* slice size after reducing the current index variable.
|
|
auto [nxSz, nxStride] = sliceMeta[tid][lvl][depth];
|
|
// The *current* stride to reduce the current index variable.
|
|
// E.g., for 2 * i, stride = 2.
|
|
unsigned stride = sliceMeta[tid][lvl][depth - 1].second;
|
|
hi = nxSz;
|
|
if (unReduc) {
|
|
// Adjust the loop hi for the dense slice-driven loop.
|
|
hi = SUBI(lvlSizes[tid][lvl], hi);
|
|
hi = ADDI(hi, C_IDX(1));
|
|
hi = DIVUI(hi, C_IDX(stride));
|
|
} else {
|
|
// TODO: dilated convolution.
|
|
assert(nxStride == 1 && "Not yet implemented.");
|
|
}
|
|
}
|
|
std::tie(l, iv) = emitForLoopOverTensorAtLvl(builder, loc, tid, lvl, lo, hi,
|
|
reduc, tryParallel);
|
|
// The for-loop condition must be a trivial condition (a level without an
// affine index expression).
|
|
trivialLvls.push_back(tlCond.first);
|
|
} else {
|
|
for (auto [tl, cKind] : spConds) {
|
|
if (isAffineIdxCond(cKind)) {
|
|
auto [tid, lvl] = unpackTensorLevel(tl);
|
|
bool unReduc = isAffineIdxUnRedCond(cKind);
|
|
assert(unReduc == !depFullyReduced(tid, lvl));
|
|
sliceDrivenInfo.emplace_back(tid, lvl, /*fullyReduced=*/!unReduc);
|
|
} else {
|
|
trivialLvls.push_back(tl);
|
|
}
|
|
}
|
|
|
|
std::tie(l, iv) =
|
|
emitWhileLoopOverTensorsAtLvls(builder, loc, spConds, reduc, needsUniv);
|
|
}
|
|
|
|
// Enter dense tensor levels.
|
|
enterTensorsAtDenseLvls(builder, loc, dnConds, iv, sliceDrivenInfo);
|
|
// NOTE: we can also prepare for next dim here in advance
|
|
|
|
// Pushes the loop into stack.
|
|
loopStack.emplace_back(trivialLvls, sliceDrivenInfo, l,
|
|
builder.getInsertionBlock(), iv, loopTag);
|
|
return l;
|
|
}
|
|
|
|
Operation *LoopEmitter::enterFilterLoopOverTensorAtLvl(
|
|
OpBuilder &builder, Location loc, TensorId tid, Level lvl,
|
|
AffineExpr affine, MutableArrayRef<Value> reduc) {
|
|
assert(isValidLevel(tid, lvl));
|
|
assert(!isa<AffineDimExpr>(affine) && !isDenseLT(lvlTypes[tid][lvl]));
|
|
// We can not re-enter the same level.
|
|
assert(!coords[tid][lvl]);
|
|
|
|
// TODO: We should instead use a whileOp for filter loop to allow early
|
|
// break when exceeding (for ordered levels).
|
|
// TODO: There are many other potential opportunities that we might apply in
|
|
// the future. E.g., we could use binary search to locate positions.
|
|
const Value step = C_IDX(1);
|
|
const Value pLo = posits[tid][lvl];
|
|
const Value pHi = highs[tid][lvl];
|
|
scf::ForOp forOp = builder.create<scf::ForOp>(loc, pLo, pHi, step, reduc);
|
|
|
|
// In-place update on the reduction variable vector.
|
|
assert(forOp.getNumRegionIterArgs() == reduc.size());
|
|
for (int i = 0, e = reduc.size(); i < e; i++)
|
|
reduc[i] = forOp.getRegionIterArg(i);
|
|
|
|
builder.setInsertionPointToStart(forOp.getBody());
|
|
// The induction variable gives the position.
|
|
const Value pos = forOp.getInductionVar();
|
|
posits[tid][lvl] = pos;
|
|
const Value crd = lvls[tid][lvl]->peekCrdAt(builder, loc, pos);
|
|
coords[tid][lvl] = crd;
|
|
|
|
// Generate an if-condition to filter out coordinates that are not
|
|
// equal to the result of the affine expression.
|
|
Value expected = genAffine(builder, loc, affine);
|
|
auto pred = CMPI(eq, crd, expected);
|
|
SmallVector<Type> types;
|
|
for (Value red : reduc) {
|
|
types.push_back(red.getType());
|
|
}
|
|
|
|
bool hasReduc = !types.empty();
|
|
scf::IfOp ifOp =
|
|
builder.create<scf::IfOp>(loc, types, pred, /*else*/ hasReduc);
|
|
if (hasReduc) {
|
|
// scf.for (a) -> v
|
|
// %s = scf.if (a) -> v
|
|
// user-generated code.
|
|
// else
|
|
// yield a
|
|
// yield %s
|
|
YIELD(ifOp.getResults());
|
|
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
|
|
// On mismatch.
|
|
YIELD(reduc);
|
|
}
|
|
// Set the insert point to matched branch.
|
|
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
|
|
|
|
// NOTE: we can also prepare for next lvl here in advance
|
|
// Push the loop into stack
|
|
loopStack.emplace_back(ArrayRef<TensorLevel>(makeTensorLevel(tid, lvl)),
|
|
ArrayRef<SliceLoopInfo>(), forOp,
|
|
builder.getInsertionBlock(), coords[tid][lvl],
|
|
nullptr);
|
|
return forOp;
|
|
}
|
|
|
|
void LoopEmitter::genDenseAffineAddress(OpBuilder &builder, Location loc,
|
|
TensorLevel tidLvl,
|
|
AffineExpr lvlExpr) {
|
|
auto [tid, lvl] = unpackTensorLevel(tidLvl);
|
|
assert(isDenseLT(lvlTypes[tid][lvl]));
|
|
// For dense levels, the level-coordinate also serves as the position.
|
|
Value lvlCrd = genAffine(builder, loc, lvlExpr);
|
|
posits[tid][lvl] = genAddress(builder, loc, tid, lvl, lvlCrd);
|
|
}
|
|
|
|
void LoopEmitter::prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
|
|
TensorId tid, Level lvl) {
|
|
assert(isValidLevel(tid, lvl));
|
|
const auto lvlTp = lvlTypes[tid][lvl];
|
|
|
|
if (isDenseLT(lvlTp))
|
|
return;
|
|
|
|
const Value c0 = C_IDX(0);
|
|
const Value c1 = C_IDX(1);
|
|
// Either the first level, or the previous level has been set.
|
|
/// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
|
|
assert(lvl == 0 || posits[tid][lvl - 1]);
|
|
if (isCompressedLT(lvlTp) || isLooseCompressedLT(lvlTp) ||
|
|
is2OutOf4LT(lvlTp)) {
|
|
|
|
Value pos = lvl == 0 ? c0 : posits[tid][lvl - 1];
|
|
std::tie(posits[tid][lvl], highs[tid][lvl]) =
|
|
lvls[tid][lvl]->peekRangeAt(builder, loc, pos);
|
|
return;
|
|
}
|
|
if (isSingletonLT(lvlTp)) {
|
|
// TODO: merge this as well when SparseTensorLevel support dedup.
|
|
const Value pLo = lvl == 0 ? c0 : posits[tid][lvl - 1];
|
|
posits[tid][lvl] = pLo;
|
|
|
|
// If we are coiterating non-unique levels, then use pHi=segHi;
|
|
// otherwise use pHi=pLo+1.
|
|
// NOTE: Just because the level is non-unique, that does not
|
|
// guarantee that segHi is defined: because we only generate segHi
|
|
// whenever coiterating, in order to improve code quality for the
|
|
// non-coiterating cases.
|
|
const auto parentSegHi = segHi[tid][lvl - 1];
|
|
highs[tid][lvl] = (!isUniqueLT(lvlTypes[tid][lvl - 1]) && parentSegHi)
|
|
? parentSegHi
|
|
: ADDI(pLo, c1);
|
|
return;
|
|
}
|
|
llvm_unreachable("Unrecognized level-type!");
|
|
}
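// Illustrative note for prepareLoopOverTensorAtLvl above (not from the
// original source): for a compressed level, peekRangeAt(p) conceptually
// returns the pair (positions[p], positions[p + 1]), i.e., the half-open
// range of child entries owned by parent position p, which is what gets
// stored into posits/highs here.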
|
|
|
|
void LoopEmitter::enterTensorsAtDenseLvls(
|
|
OpBuilder &builder, Location loc, ArrayRef<TensorLvlCond> dnConds, Value iv,
|
|
SmallVectorImpl<SliceLoopInfo> &sliceInfo) {
|
|
for (auto [dnTidLvl, denseLoopCond] : dnConds) {
|
|
auto [tid, lvl] = unpackTensorLevel(dnTidLvl);
|
|
assert(isDenseLT(lvlTypes[tid][lvl]));
|
|
|
|
if (isAffineIdxCond(denseLoopCond)) {
|
|
// Pushes sliced levels to build correct LoopInfo.
|
|
bool unReduc = isAffineIdxUnRedCond(denseLoopCond);
|
|
SliceInfo &info = sliceStack[tid].back();
|
|
// Pushes sliced dense loop info to tell LoopEmitter how to exit it.
|
|
sliceInfo.emplace_back(tid, lvl, /*fullyReduced=*/!unReduc);
|
|
// FIXME: The offset and position iterator need to be adjusted when the
|
|
// slice is strided.
|
|
if (unReduc) {
|
|
assert(*info.slicedOnLvl == lvl);
|
|
unsigned depth = sliceStack[tid].back().depth;
|
|
assert(depth >= 1);
|
|
unsigned stride = sliceMeta[tid][lvl][depth - 1].second;
|
|
// Update the slice information as we enter the new loop.
|
|
info.minCrd = info.offset = MULI(iv, C_IDX(stride));
|
|
info.isNonEmpty = constantI1(builder, loc, true);
|
|
} else {
|
|
posits[tid][lvl] =
|
|
genAddress(builder, loc, tid, lvl, ADDI(info.offset, iv));
|
|
Value fwdCnt = lvl == 0 || trivialSlice[tid][lvl]
|
|
? C_IDX(0)
|
|
: sliceTupleFwdCnt[tid][lvl - 1];
|
|
Value sz = sliceMeta[tid][lvl].back().first;
|
|
Value mul = MULI(fwdCnt, sz);
|
|
sliceTupleFwdCnt[tid][lvl] = ADDI(mul, iv);
|
|
}
|
|
levelReducedDep[tid][lvl]++;
|
|
} else {
|
|
// Skip the synthetic tensor.
|
|
if (isSynTensor(tid))
|
|
continue;
|
|
// A dense level with trivial index expression.
|
|
assert(dependentLvlMap[tid][lvl].empty());
|
|
auto enc = getSparseTensorEncoding(tensors[tid].getType());
|
|
if (enc && !isSparseOutput(tid)) {
|
|
bool validPos = lvl == 0 || posits[tid][lvl - 1];
|
|
if (!validPos) {
|
|
// We might not find the pos for the sparse output tensor as it is
|
|
// unconditionally required by the sparsification.
|
|
assert(isOutputTensor(tid));
|
|
continue;
|
|
}
|
|
posits[tid][lvl] = genAddress(builder, loc, tid, lvl, iv);
|
|
// NOTE: we can also prepare for next lvl here in advance
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
|
|
MutableArrayRef<Value> reduc) {
|
|
const LoopInfo &loopInfo = loopStack.back();
|
|
for (auto [tid, lvl, reduced] : loopInfo.sliceDrivenInfo) {
|
|
if (!reduced) {
|
|
SliceInfo &info = sliceStack[tid].back();
|
|
assert(isDenseLT(lvlTypes[tid][lvl]));
|
|
assert(*info.slicedOnLvl == lvl);
|
|
(void)reduced;
|
|
info.minCrd = info.offset = info.isNonEmpty = Value();
|
|
}
|
|
levelReducedDep[tid][lvl]--;
|
|
}
|
|
if (auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop)) {
|
|
if (!reduc.empty()) {
|
|
assert(reduc.size() == forOp.getNumResults());
|
|
rewriter.create<scf::YieldOp>(loc, reduc);
|
|
}
|
|
// Exit the loop.
|
|
rewriter.setInsertionPointAfter(forOp);
|
|
// In-place update reduction variables.
|
|
for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
|
|
reduc[i] = forOp.getResult(i);
|
|
} else {
|
|
auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
|
|
if (!reduc.empty()) {
|
|
assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
|
|
Operation *redExp = reduc.front().getDefiningOp();
|
|
// Reduction expression should have no use.
|
|
assert(redExp->getUses().empty());
|
|
// This must be a binary operation.
|
|
// NOTE: It is the user's responsibility to ensure the operation is
// commutative.
|
|
assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);
|
|
|
|
Value redVal = parOp.getInitVals().front();
|
|
Value curVal;
|
|
if (redExp->getOperand(0) == redVal)
|
|
curVal = redExp->getOperand(1);
|
|
else if (redExp->getOperand(1) == redVal)
|
|
curVal = redExp->getOperand(0);
|
|
// One of the operands must be the init value (which is also the
|
|
// previous reduction value).
|
|
assert(curVal);
|
|
#ifndef NDEBUG
|
|
// The reduction expression should be the only user of the reduction val
|
|
// inside the parallel for.
|
|
unsigned numUsers = 0;
|
|
for (Operation *op : redVal.getUsers()) {
|
|
if (op->getParentOp() == parOp)
|
|
numUsers++;
|
|
}
|
|
assert(numUsers == 1);
|
|
#endif // NDEBUG
      rewriter.setInsertionPointAfter(redExp);
      auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
      // Attach to the reduction op.
      Block *redBlock = &redOp.getReductions().front().front();
      rewriter.setInsertionPointToEnd(redBlock);
      Operation *newRed = rewriter.clone(*redExp);
      // Replaces arguments of the reduction expression by using the block
      // arguments from scf.reduce.
      rewriter.updateRootInPlace(
          newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
      // Erases the outdated reduction expression.
      rewriter.eraseOp(redExp);
      rewriter.setInsertionPointToEnd(redBlock);
      rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
    }
    rewriter.setInsertionPointAfter(parOp);
    // In-place update reduction variables.
    for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
      reduc[i] = parOp.getResult(i);
  }

  // Finished iterating a tensor; clean up. We only do the clean-up on for
  // loops, as while loops do not necessarily finish the iteration over a
  // sparse tensor.
  for (auto [tid, lvl] : unpackTensorLevelRange(loopInfo.trivialTidLvls)) {
    // Reset to null.
    coords[tid][lvl] = Value();
    posits[tid][lvl] = Value();
    // For dense levels, the high is fixed and need not be reset.
    if (!isDenseLT(lvlTypes[tid][lvl]))
      highs[tid][lvl] = Value();
  }
}

void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc,
                                MutableArrayRef<Value> reduc) {
  const LoopInfo &loopInfo = loopStack.back();
  auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop);
  Value iv = loopInfo.iv;
  Value one = C_IDX(1);

  // Finalize the induction. Note that the induction could be performed
  // in the individual if-branches to avoid re-evaluating the conditions.
  // However, that would result in a rather elaborate forest of yield
  // instructions during code generation. Moreover, performing the induction
  // after the if-statements more closely resembles code generated by TACO.
  unsigned o = 0;
  SmallVector<Value> operands;
  unsigned delta = 0;
  for (auto [tid, lvl, resolved] : loopInfo.sliceDrivenInfo) {
    // TODO: handle dense.
    assert(isCompressedLT(lvlTypes[tid][lvl]));
    levelReducedDep[tid][lvl]--;
    if (!resolved) {
      // TODO: support coiterating multiple slices.
      assert(loopInfo.sliceDrivenInfo.size() == 1);
      auto [nxNonEmpty, nxMinCrd, nxAbsOffset] =
          genSliceNextInduction(builder, loc, tid, lvl);
      // Update the while loop induction operands.
      operands.push_back(nxNonEmpty);
      operands.push_back(nxMinCrd);
      operands.push_back(nxAbsOffset);

      // Update the slice stack.
      SliceInfo &info = sliceStack[tid].back();
      info.isNonEmpty = whileOp.getResult(o++);
      info.minCrd = whileOp.getResult(o++);
      info.offset = whileOp.getResult(o++);
      continue;
    }

    Value forwarded = nullptr;
    if (loopInfo.trivialTidLvls.empty() &&
        loopInfo.sliceDrivenInfo.size() == 1) {
      // Forwards the position iterator.
      operands.push_back(ADDI(posits[tid][lvl], one));
      forwarded = constantI1(builder, loc, true);
    } else {
      const Value pos = posits[tid][lvl];
      const Value nxPos = ADDI(posits[tid][lvl], one);
      forwarded = CMPI(eq, coords[tid][lvl], iv);
      operands.push_back(SELECT(forwarded, nxPos, pos));
    }
    // The coordinate is invalid now.
    coords[tid][lvl] = nullptr;

    // Update the position iterator as we exit the while loop.
    posits[tid][lvl] = whileOp->getResult(o++);
  }

  for (auto [tid, lvl] : unpackTensorLevelRange(loopInfo.trivialTidLvls)) {
    const auto lvlTp = lvlTypes[tid][lvl];
    if (isCompressedLT(lvlTp) || isSingletonLT(lvlTp) ||
        isLooseCompressedLT(lvlTp)) {
      const Value crd = coords[tid][lvl];
      const Value pos = posits[tid][lvl];
      Value cmp = CMPI(eq, crd, iv);
      // If the loop co-iterates over a non-unique level, we fast-forward
      // past all the duplicated coordinates by setting the position to the
      // segment high.
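      // For example (purely illustrative): with coordinates [2, 2, 2, 5, ...]
      // on a non-unique level and iv == 2, segHi already points one past the
      // last duplicate 2, so selecting segHi below skips all duplicates at
      // once, whereas pos + 1 would only advance by a single entry.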
      Value add =
          !isUniqueLT(lvlTypes[tid][lvl]) ? segHi[tid][lvl] : ADDI(pos, one);

      operands.push_back(SELECT(cmp, add, pos));
      // Following loops continue iteration from the break point of the
      // current while loop.
      const Value newPos = whileOp->getResult(o++);
      // We need to define a new local variable for `tid` to avoid
      // warnings about "captured structured bindings are a C++20 extension".
      // FIXME(wrengr): define a helper function to capture this idiom!
      const TensorId newTid = tid;
      posits[newTid][lvl] = newPos;

      // The coordinate is invalid now.
      coords[tid][lvl] = nullptr;
      // The segment high is invalid now.
      segHi[tid][lvl] = nullptr;
      // highs remains unchanged.
    }
  }

  // Reduction value from users.
  for (auto &i : reduc) {
    operands.push_back(i);
    // In-place update of the reduction variable.
    i = whileOp->getResult(o++);
  }

  // An (optional) universal index.
  if (operands.size() + delta < whileOp.getNumResults()) {
    assert(operands.size() + delta + 1 == whileOp.getNumResults());
    // The last one is the universal index.
    operands.push_back(ADDI(iv, one));
    // Update the loop starting point of the current loop sequence.
    loopSeqStack.back().first = whileOp->getResult(o++);
  }

  assert(o == operands.size() + delta);
  if (!operands.empty())
    YIELD(operands);

  builder.setInsertionPointAfter(whileOp);
}

void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc,
                                  MutableArrayRef<Value> reduc) {
  // Clean up the values; this helps us discover potential bugs at an earlier
  // stage (instead of silently using a wrong value).
  const LoopInfo &loopInfo = loopStack.back();

  // Sets the insertion point to the right position.
  rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
  if (!loopInfo.userCodeBlock->empty() &&
      llvm::isa<scf::YieldOp>(&loopInfo.userCodeBlock->back())) {
    // scf::While/For inserts an implicit yield op when there are no loop
    // iter args. In this case, we need to insert the code before the yield.
    assert(loopInfo.userCodeBlock->back().getNumResults() == 0);
    rewriter.setInsertionPoint(&loopInfo.userCodeBlock->back());
  }

  if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
    exitWhileLoop(rewriter, loc, reduc);
  } else {
    exitForLoop(rewriter, loc, reduc);
  }

  assert(loopStack.size() == loopSeqStack.size());
  loopStack.pop_back();
}

//===----------------------------------------------------------------------===//
// Slice-driven loop related methods.
//===----------------------------------------------------------------------===//

unsigned LoopEmitter::remDepOnLevel(TensorId tid, Level lvl) const {
  unsigned totalDependencies = dependentLvlMap[tid][lvl].size();
  if (totalDependencies != 0) {
    assert(totalDependencies >= 2);
    return totalDependencies - levelReducedDep[tid][lvl];
  }
  return totalDependencies;
}

const LoopEmitter::SliceInfo &LoopEmitter::getMostRecentSliceOnLvl(TensorId tid,
                                                                   Level lvl) {
  // Finds the most-recent slice using a reverse iteration.
  for (auto it = sliceStack[tid].rbegin(), ie = sliceStack[tid].rend(); it < ie;
       it++) {
    if (it->slicedOnLvl == lvl) { // the level matched
      return *it;
    }
  }
  llvm_unreachable("Failed to find sliceInfo");
}

// Generates a while loop to iterate over a sliced sparse level as follows.
//
// while (coords[loopLo] < offset + size) {
//   body_builder
//   loopLo++;
// }
std::pair<Operation *, ValueRange> LoopEmitter::genSliceLvlTraverseLoop(
    OpBuilder &builder, Location loc, Value posLo, Value posHi, Value offset,
    Value size, TensorId tid, Level lvl, ValueRange userReduc,
    LoopBodyBuilder bodyBuilder) {
  Value c1 = C_IDX(1);
  auto [sliceSz, stride] = sliceMeta[tid][lvl].back();
  assert(stride == 1 && "Not yet implemented");
  Value sliceHi = ADDI(offset, sliceSz);

  SmallVector<Value> reduc{posLo}; // loop lower bounds
  const unsigned numMetaReduc = reduc.size();

  // Append the user-required reduction values.
  reduc.append(userReduc.begin(), userReduc.end());
  scf::WhileOp whileOp = builder.create<scf::WhileOp>(
      loc, ValueRange(reduc).getTypes(), reduc,
      /*beforeBuilder=*/
      [this, posHi, sliceHi, tid, lvl](OpBuilder &builder, Location loc,
                                       ValueRange args) {
        Value cond = genSparseReducedAffineCond(builder, loc, *lvls[tid][lvl],
                                                sliceHi, args[0], posHi);
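        // Roughly, `cond` checks `args[0] < posHi && crd[args[0]] < sliceHi`,
        // loading the coordinate only when the position is still in bounds
        // (see genSparseReducedAffineCond); this is a sketch, not the exact
        // generated IR.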
        // Continue if we have neither hit a break nor gone out of bounds.
        builder.create<scf::ConditionOp>(loc, cond, args);
      },
      /*afterBuilder=*/
      [c1, numMetaReduc, bodyBuilder](OpBuilder &builder, Location loc,
                                      ValueRange args) {
        Value iv = args[0];
        TypeRange types = args.drop_front(numMetaReduc).getTypes();
        // The coordinate must be in bounds as guaranteed by the loop
        // condition. We generate a fake if operation here only to hide the
        // extra loop induction variables maintained by us from users, which
        // will be removed by a later optimization pass.
        auto ifOp = builder.create<scf::IfOp>(loc, types,
                                              constantI1(builder, loc, true),
                                              /*withElseBlock=*/!types.empty());
        {
          // The user reduction values, i.e., `args` minus the `numMetaReduc`
          // iteration variables maintained by us.
          SmallVector<Value> ifRet = args.drop_front(numMetaReduc);
          assert(ifRet.size() == args.size() - 1);

          OpBuilder::InsertionGuard guard(builder);
          // If coord >= sliceHi.
          if (!ifRet.empty()) {
            builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
            YIELD(ifRet);
          }

          // If coord < sliceHi.
          builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Delegates to users' callback.
          bodyBuilder(builder, loc, iv, ifRet);
        }
        // Marks this special ifOp so that sparsification does not finalize it.
        ifOp->setAttr(getLoopEmitterLoopAttrName(),
                      StringAttr::get(builder.getContext(), "slice"));
        // Insertion point restored to after ifOp.
        SmallVector<Value> yields;
        // Increment the induction variable.
        yields.push_back(ADDI(iv, c1));
        yields.append(ifOp.getResults().begin(), ifOp.getResults().end());
        YIELD(yields);
      });

  builder.setInsertionPointAfter(whileOp);
  return std::make_pair(whileOp, whileOp.getResults().drop_front(numMetaReduc));
}

// Generates a loop nest that traverses all the unresolved levels in between.
//
// for (int i = 0; i < slicePos.size(); i += 2) {
//   loopLo = slicePos[i];
//   loopHi = slicePos[i + 1];
//
//   // Then the same loop generated by genSliceLvlTraverse above.
//   while (loopLo < loopHi) {
//     if (pos[loopLo] < sliceHi) {
//       bodyBuilder();
//     } else {
//       break;
//     }
//     loopLo++;
//   }
// }
ValueRange LoopEmitter::genUnResolvedSliceTreeTraverse(
    OpBuilder &builder, Location loc, TensorId tid,
    ArrayRef<const SliceInfo *> unResLvls,
    std::optional<std::pair<TensorId, Level>> firstResLvl, ValueRange userReduc,
    LoopBodyBuilder bodyBuilder) {

  Value c0 = C_IDX(0), c1 = C_IDX(1);
  Value pos = c0;
  OpBuilder::InsertPoint ip;
  SmallVector<Value> innerArgs(userReduc.begin(), userReduc.end());
  scf::ForOp outerMost = nullptr; // the outermost loop.

  // Wraps the body builder and inserts an extra counting instruction at the
  // end.
  auto wrapped = [bodyBuilder](OpBuilder &builder, Location loc, Value iv,
                               MutableArrayRef<Value> reduc) {
    bodyBuilder(builder, loc, iv, reduc.drop_back());
    // Increments the counter.
    reduc.back() = ADDI(reduc.back(), C_IDX(1));
  };

  // FIXME: Need special handling when the previous unresolved slice is
  // strided: we probably need to filter out coordinates that are not on the
  // stride.
  if (firstResLvl.has_value()) {
    // Overwrite the position when the first level is fully resolved.
    pos = posits[firstResLvl->first][firstResLvl->second];
    ip = builder.saveInsertionPoint();
  } else {
    const SliceInfo &frontSlice = *unResLvls.back();
    Level firstLvl = *frontSlice.slicedOnLvl;
    if (!lvlFullyResolved(tid, firstLvl)) {
      if (isCompressedLT(lvlTypes[tid][firstLvl])) {
        // An extra counter that tracks how many segments there are in the
        // child compressed level.
        innerArgs.push_back(c0);
        // Overrides the user-provided builder.
        bodyBuilder = wrapped;
        unsigned depth = frontSlice.depth - 1;
        Value offset = frontSlice.offset;
        Value sPtrBuf = slicePosBuffer[tid][firstLvl][depth];
        Value mSz = frontSlice.posTupleNum;
        outerMost = builder.create<scf::ForOp>(
            loc, c0, mSz, c1, innerArgs,
            [this, tid, firstLvl, offset, sPtrBuf, &ip, &pos,
             &innerArgs](OpBuilder &builder, Location loc, Value iv,
                         ValueRange iterArgs) {
              // Generates the traversal for each level.
              Value loopLo =
                  loadSlicePos(builder, loc, sPtrBuf, iv, SlicePosKind::kLo);
              Value loopHi =
                  loadSlicePos(builder, loc, sPtrBuf, iv, SlicePosKind::kHi);
              // We need to remember the starting index for the next level's
              // position, because the slice-driven loop breaks the level into
              // non-consecutive segments.
              updateSlicePos(builder, loc, sPtrBuf, iterArgs.back(), iv,
                             SlicePosKind::kNext);

              auto [size, stride] = sliceMeta[tid][firstLvl].back();
              assert(stride == 1 && "Not yet implemented");
              ValueRange itArgs =
                  genSliceLvlTraverseLoop(
                      builder, loc, loopLo, loopHi, offset, size, tid, firstLvl,
                      iterArgs,
                      [&](OpBuilder &builder, Location, Value iv,
                          MutableArrayRef<Value> reduc) {
                        ip = builder.saveInsertionPoint();
                        pos = iv;
                        innerArgs.assign(reduc.begin(), reduc.end());
                      })
                      .second;
              YIELD(itArgs);
            });
      } else if (isDenseLT(lvlTypes[tid][firstLvl])) {
        assert(firstLvl == 0); // This must be the first level.
        Value lb = frontSlice.offset;
        auto [sliceSz, stride] =
            sliceMeta[tid][*frontSlice.slicedOnLvl][frontSlice.depth];
        assert(stride == 1 && "Not yet implemented");
        Value ub = ADDI(lb, sliceSz);
        outerMost = builder.create<scf::ForOp>(
            loc, lb, ub, c1, innerArgs,
            [&](OpBuilder &builder, Location loc, Value iv,
                ValueRange iterArgs) {
              ip = builder.saveInsertionPoint();
              pos = iv;
              innerArgs.assign(iterArgs.begin(), iterArgs.end());
            });
      }
      // We generated the loop for the first slice above; now remove it.
      unResLvls = unResLvls.drop_back();
    }
  }
  // Reset the insertion point into the loop body.
  builder.restoreInsertionPoint(ip);
  if (!unResLvls.empty()) {
    // Fills in the dense slice levels in between.
    SmallVector<Value> lbs, ubs, steps, lvlSzs;
    for (const SliceInfo *slice : llvm::reverse(unResLvls)) {
      Level sliceLvl = *slice->slicedOnLvl;
      assert(isDenseLT(lvlTypes[tid][sliceLvl]));
      Value offset = slice->offset;
      auto [sliceSz, stride] = sliceMeta[tid][sliceLvl][slice->depth];
      assert(stride == 1 && "Not yet implemented");
      lbs.push_back(offset);
      ubs.push_back(ADDI(offset, sliceSz));
      steps.push_back(c1);
      lvlSzs.push_back(lvlSizes[tid][sliceLvl]);
    }
    auto denseNest =
        scf::buildLoopNest(builder, loc, lbs, ubs, steps, innerArgs,
                           [&innerArgs, &lvlSzs, &pos, bodyBuilder](
                               OpBuilder &builder, Location loc, ValueRange ivs,
                               ValueRange iterArgs) -> scf::ValueVector {
                             for (auto em : llvm::enumerate(ivs)) {
                               // Linearizes the position:
                               //   pos = (pos * lvlSize) + iv;
                               pos = MULI(pos, lvlSzs[em.index()]);
                               pos = ADDI(pos, em.value());
                             }
                             innerArgs.assign(iterArgs.begin(), iterArgs.end());
                             // Generates the user-requested loop body.
                             bodyBuilder(builder, loc, pos, innerArgs);
                             return innerArgs;
                           });

    if (!outerMost) {
      // If the outermost loop has not been set, this is the outermost loop.
      outerMost = denseNest.loops.front();
    } else {
      // Otherwise we need to generate yield operations to link the SSA chain.
      YIELD(denseNest.results);
    }
  } else {
    assert(outerMost);
    // Generates the user-requested loop body.
    bodyBuilder(builder, loc, pos, innerArgs);
    YIELD(innerArgs);
  }
  assert(outerMost);
  // Insert after the outermost loop.
  builder.setInsertionPointAfter(outerMost);
  return outerMost.getResults();
}

void LoopEmitter::genResolvedSliceBegin(OpBuilder &builder, Location loc,
                                        TensorId tid, Level lvl) {
  Value c0 = C_IDX(0), c1 = C_IDX(1);
  if (isDenseLT(lvlTypes[tid][lvl])) {
    // Dense slice begin is trivial.
    sliceStack[tid].emplace_back(/*minCoord=*/c0, /*offset=*/c0,
                                 /*nonEmpty=*/constantI1(builder, loc, true),
                                 c0, lvl, /*depth=*/1);
    return;
  }
  auto [nxSz, stride] = sliceMeta[tid][lvl][1];
  assert(stride == 1 && "Not yet implemented");
  Value sPtrBuf = slicePosBuffer[tid][lvl][0];
  const SparseTensorLevel &stl = *lvls[tid][lvl];

  Value p = lvl == 0 ? c0 : posits[tid][lvl - 1];
  auto [pLo, pHi] = stl.peekRangeAt(builder, loc, p);

  // Fills out slicePosBuffer[tid][lvl][0] with [pLo, pHi].
  updateSlicePos(builder, loc, sPtrBuf, pLo, c0, SlicePosKind::kLo);
  updateSlicePos(builder, loc, sPtrBuf, pHi, c0, SlicePosKind::kHi);
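  // Conceptually (an illustration of the buffer layout, not literal IR), the
  // buffer now holds one (pLo, pHi, pNext) tuple for the single fragment of
  // this resolved slice; the pNext slot is only filled in later by the
  // unresolved tree traversal to remember where the child level's tuples
  // start.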
  // When slicing over a resolved parent, we only need one pair of (pLo, pHi)
  // to specify the current slice.
  Value tupleNum = c1;
  // The slice is non-empty if pLo < pHi.
  Value isNonEmpty = CMPI(ult, pLo, pHi);
  // On an ordered level, the minimal coordinate must be the first one.
  // FIXME: Technically we should load the coordinate only when the slice is
  // non-empty. However, we assume that even for empty sparse tensors a
  // non-empty pos/crd buffer is allocated for each level, so the load cannot
  // go out of bounds; this lets us avoid generating an ifOp here.
  Value minCrd = stl.peekCrdAt(builder, loc, pLo);
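
  // As a rough sketch of the helper below: offsetFromMinCoord computes
  // approximately max(minCrd - size + 1, 0) for a non-empty slice, i.e., the
  // smallest absolute offset whose window of `size` elements still covers
  // minCrd.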
  // FIXME: We need the relative offset related to the base slice.
  Value absOffset = offsetFromMinCoord(builder, loc, minCrd, nxSz, isNonEmpty);
  sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, tupleNum, lvl,
                               /*depth=*/1);
}

// Fills in the slicePosBuffer before the slice-driven loop begins.
// TODO: it can only handle all-compressed tensors.
//
// // Loop generated by `genUnResolvedSliceTreeTraverse`
// for (int i = 0; i < slicePos.size(); i += 2) {
//   loopLo = slicePos[i];
//   loopHi = slicePos[i + 1];
//   minCrd = max;
//   while (loopLo < loopHi) {
//     if (pos[loopLo] < sliceHi) {
//       // bodyBuilder
//       slicePos[tid].push_back(pos[loopLo]);
//       slicePos[tid].push_back(pos[loopLo + 1]);
//       minCrd = min(minCrd, crd[pos[loopLo]]);
//     } else {
//       break;
//     }
//     loopLo++;
//   }
// }
void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
                                          TensorId tid, Level lvl) {
  Value c0 = C_IDX(0);
  unsigned depth = levelReducedDep[tid][lvl];
  // The remaining slice size after reduction.
  Value remSz = sliceMeta[tid][lvl][depth + 1].first;
  // Dense slice begin is trivial.
  if (isDenseLT(lvlTypes[tid][lvl])) {
    sliceStack[tid].emplace_back(c0, c0, constantI1(builder, loc, false), c0,
                                 lvl, depth + 1);
    return;
  }

  assert(isCompressedLT(lvlTypes[tid][lvl]));
  // Unhandled cases:
  //
  // 1st, lvl == prevSlicedLvl, i.e., t[d0 + d1 + d2, ...] (more than one
  // variable needs to be reduced on the same level).
  //
  // 2nd, lvl > prevSlicedLvl + 1, i.e., t[..., d2, d3 + d4] (having a
  // simple dim expression in between).
  assert(lvl == *sliceStack[tid].back().slicedOnLvl + 1);

  SmallVector<const SliceInfo *> unResSlices;
  std::optional<std::pair<TensorId, Level>> firstResLvl;
  for (Level curLvl = lvl; curLvl >= 1; curLvl--) {
    Level prevLvl = curLvl - 1;
    if (lvlFullyResolved(tid, prevLvl)) {
      firstResLvl = std::make_pair(tid, prevLvl);
      break;
    }
    unResSlices.push_back(&getMostRecentSliceOnLvl(tid, prevLvl));
    if (!isDenseLT(lvlTypes[tid][prevLvl])) {
      break;
    }
  }

  assert(!unResSlices.empty() &&
         !lvlFullyResolved(tid, *unResSlices.front()->slicedOnLvl));

  Value sPtrBuf = slicePosBuffer[tid][lvl].back();
  SmallVector<Value, 3> reduc = {
      constantI1(builder, loc, false), // isNonEmpty
      lvlSizes[tid][lvl],              // minCoord
      c0,                              // memSize
  };

  ValueRange result = genUnResolvedSliceTreeTraverse(
      builder, loc, tid, unResSlices, firstResLvl, reduc,
      [this, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc, Value iv,
                                MutableArrayRef<Value> reduc) {
        Value &nonEmpty = reduc[0];
        Value &minCrd = reduc[1];
        Value &curTupleCnt = reduc[2];

        const SparseTensorLevel &stl = *lvls[tid][lvl];
        auto [sPLo, sPHi] = stl.peekRangeAt(builder, loc, iv);

        // isNonEmpty = isNonEmpty || lvlNonEmpty, i.e., as long as there is
        // one non-empty lvl, the slice is non-empty.
        Value lvlNonEmpty = CMPI(ult, sPLo, sPHi);
        nonEmpty = builder.create<arith::OrIOp>(loc, lvlNonEmpty, nonEmpty);

        // Update the minimum coordinate.
        auto ifNonEmpty = builder.create<scf::IfOp>(loc, builder.getIndexType(),
                                                    lvlNonEmpty, true);
        {
          // Generates code as follows.
          //
          // if (nonEmpty) {
          //   minCrd = min(minCrd, crd[pos[pLo]]);
          // }
          OpBuilder::InsertionGuard guard(builder);
          builder.setInsertionPointToStart(ifNonEmpty.thenBlock());
          Value curC = stl.peekCrdAt(builder, loc, sPLo);
          Value isSmaller = CMPI(ult, curC, minCrd);
          Value newMin = SELECT(isSmaller, curC, minCrd);
          YIELD(newMin);
          builder.setInsertionPointToStart(ifNonEmpty.elseBlock());
          YIELD(minCrd);
        }
        minCrd = ifNonEmpty.getResult(0);
        updateSlicePos(builder, loc, sPtrBuf, sPLo, curTupleCnt,
                       SlicePosKind::kLo);
        updateSlicePos(builder, loc, sPtrBuf, sPHi, curTupleCnt,
                       SlicePosKind::kHi);
        curTupleCnt = ADDI(curTupleCnt, C_IDX(1));
      });

  Value isNonEmpty = result[0];
  Value minCrd = result[1];
  // Two metadata [memSize, idx].
  // FIXME: we need the relative offset related to the base slice.
  Value absOffset = offsetFromMinCoord(builder, loc, minCrd, remSz, isNonEmpty);
  sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, result[2], lvl,
                               depth + 1);
}

bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
                                Level lvl) {
  Value curLvlIdx = C_IDX(0);
  if (depFullyReduced(tid, lvl)) {
    if (lvl == 0 || trivialSlice[tid][lvl]) {
      sliceTupleNxStartIdx[tid][lvl] = C_IDX(0);
    } else {
      if (isDenseLT(lvlTypes[tid][lvl])) {
        sliceTupleNxStartIdx[tid][lvl] = sliceTupleNxStartIdx[tid][lvl - 1];
      } else {
        assert(isCompressedLT(lvlTypes[tid][lvl]));
        curLvlIdx = ADDI(sliceTupleNxStartIdx[tid][lvl - 1],
                         sliceTupleFwdCnt[0][lvl - 1]);
        sliceTupleNxStartIdx[tid][lvl] =
            loadSlicePos(builder, loc, slicePosBuffer[tid][lvl].back(),
                         curLvlIdx, SlicePosKind::kNext);
      }
    }
    if (isDenseLT(lvlTypes[tid][lvl]))
      return true;

    Value sPosBuf = slicePosBuffer[tid][lvl].back();
    // The constraints on the tensor are fully resolved, so we no longer need
    // to generate a slice begin; instead, we fall back to the TACO-based
    // algorithm to (co)iterate over the slice.
    Value tupleIdx = curLvlIdx;
    posits[tid][lvl] =
        loadSlicePos(builder, loc, sPosBuf, tupleIdx, SlicePosKind::kLo);
    highs[tid][lvl] =
        loadSlicePos(builder, loc, sPosBuf, tupleIdx, SlicePosKind::kHi);
    return true;
  }

  // The next non-empty slice can only be computed efficiently when the level
  // is sorted.
  const LevelType lvlType = lvlTypes[tid][lvl];
  assert(isOrderedLT(lvlType));
  if (isSingletonLT(lvlType)) {
    llvm_unreachable("TODO: dense level should be easy to support, while "
                     "singleton level requires more efforts");
  }

  assert(!dependentLvlMap[tid][lvl].empty());
  assert(!sliceStack[tid].empty());

  const SliceInfo &sliceInfo = sliceStack[tid].back();
  auto baseEnc = getSparseTensorEncoding(tensors[tid].getType());
  if (baseEnc.isSlice())
    llvm_unreachable("TODO: not yet implemented");

  if (sliceInfo.isInitialTensor() ||
      (lvl >= 1 && lvlFullyResolved(tid, lvl - 1))) {
    // This is the first level, or the previous level has been fully resolved.
    trivialSlice[tid][lvl] = true;
    genResolvedSliceBegin(builder, loc, tid, lvl);
  } else {
    // The previous level has not been fully resolved.
    trivialSlice[tid][lvl] = false;
    genUnResolvedSliceBegin(builder, loc, tid, lvl);
  }
  return false;
}

std::tuple<Value, Value, Value>
LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc,
                                   TensorId tid, Level lvl) {
  if (!isCompressedLT(lvlTypes[tid][lvl]))
    llvm_unreachable("TODO");

  // Otherwise, generate code to compute the next non-empty slice.
  Value c0 = C_IDX(0), c1 = C_IDX(1);

  SliceInfo &info = sliceStack[tid].back();
  assert(info.slicedOnLvl == lvl);
  //
  // We forward to the next non-empty slice by
  // if (minCrd > offset) {
  //   offset += 1
  // } else {
  //   minCrd = nextMinInSlice();
  //   offset = minCrd - size + 1;
  // }
  //
  // if (offset + size > parents.size)
  //   isNonEmpty = false;
  //
  Value absOffset = info.offset;
  SmallVector<Value, 3> reduc = {info.minCrd, info.isNonEmpty, absOffset};
  Value sPtrBuf = slicePosBuffer[tid][lvl][info.depth - 1];
  Value fastPathP = CMPI(ugt, info.minCrd, absOffset);
  auto ifOp = builder.create<scf::IfOp>(loc, ValueRange(reduc).getTypes(),
                                        fastPathP, true);
  {
    OpBuilder::InsertionGuard guard(builder);
    // Take the fast path:
    // if (minCrd > offset) {
    //   return offset += 1
    // }
    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
    reduc[2] = ADDI(absOffset, c1);
    // Yield offset + 1.
    YIELD(reduc);

    // else /*minCrd == offset*/ {
    //   for (i = 0; i < slicePos.size(); i += kSliceIterWidth) {
    //     if (crd[pos[slicePos[i]]] == minCrd) {
    //       slicePos[i]++;
    //     }
    //     minCrd = min(minCrd, crd[pos[slicePos[i]]]);
    //   }
    //   offset = minCrd - size + 1;
    // }
    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
    reduc[2] = absOffset;                       // restore value.
    Value mSz = info.posTupleNum;               // tuple number.
    reduc[0] = lvlSizes[tid][lvl];              // next min coord
    reduc[1] = constantI1(builder, loc, false); // isNonEmpty
    auto loopArgs = static_cast<ValueRange>(reduc).drop_back();
    auto forOp = scf::buildLoopNest(
        builder, loc, c0, mSz, c1, loopArgs,
        [this, tid, lvl, c1, sPtrBuf,
         &info](OpBuilder &builder, Location loc, ValueRange ivs,
                ValueRange iterArgs) -> scf::ValueVector {
          Value curMinCrd = iterArgs[0];
          Value isNonEmpty = iterArgs[1];

          Type idxTp = builder.getIndexType();
          Value pLo = loadSlicePos(builder, loc, sPtrBuf, ivs.front(),
                                   SlicePosKind::kLo);
          Value pHi = loadSlicePos(builder, loc, sPtrBuf, ivs.front(),
                                   SlicePosKind::kHi);
          //
          // if (pLo < pHi) // Only loads when in bounds.
          //   coord = load[pLo]
          //   if coord == minCrd
          //     pLo += 1
          //
          // if (pLo < pHi)
          //   curMinCrd = min(curMinCrd, load[pLo])
          //
          Value pred = CMPI(ult, pLo, pHi);
          auto advPLo = builder.create<scf::IfOp>(loc, idxTp, pred, true);
          /* if pLo < pHi */ {
            builder.setInsertionPointToStart(&advPLo.getThenRegion().front());
            // coord = load[pLo]
            Value coord = lvls[tid][lvl]->peekCrdAt(builder, loc, pLo);
            Value pred = CMPI(eq, coord, info.minCrd);
            auto ifEqual = builder.create<scf::IfOp>(loc, idxTp, pred, true);
            /* if coord == minCrd */ {
              builder.setInsertionPointToStart(
                  &ifEqual.getThenRegion().front());
              Value newPlo = ADDI(pLo, c1);
              // Updates the cache.
              updateSlicePos(builder, loc, sPtrBuf, newPlo, ivs.front(),
                             SlicePosKind::kLo);
              YIELD(newPlo);
            }
            /* else coord != minCrd */ {
              builder.setInsertionPointToStart(
                  &ifEqual.getElseRegion().front());
              YIELD(pLo);
            }
            builder.setInsertionPointAfter(ifEqual);
            YIELD(ifEqual.getResults());
          }
          /* else pLo >= pHi */ {
            builder.setInsertionPointToStart(&advPLo.getElseRegion().front());
            YIELD(pLo);
          }

          builder.setInsertionPointAfter(advPLo);
          pLo = advPLo.getResult(0);
          Value lvlNonEmpty = CMPI(ult, pLo, pHi);
          // Update minCrd.
          auto newMin =
              builder.create<scf::IfOp>(loc, idxTp, lvlNonEmpty, true);
          builder.setInsertionPointToStart(&newMin.getThenRegion().front());
          YIELD(lvls[tid][lvl]->peekCrdAt(builder, loc, pLo));

          builder.setInsertionPointToStart(&newMin.getElseRegion().front());
          YIELD(curMinCrd);
          builder.setInsertionPointAfter(newMin);

          // isNonEmpty = isNonEmpty || lvlNonEmpty
          isNonEmpty =
              builder.create<arith::OrIOp>(loc, lvlNonEmpty, isNonEmpty);
          curMinCrd = builder.create<arith::SelectOp>(
              loc, CMPI(ult, newMin.getResult(0), curMinCrd),
              newMin.getResult(0), curMinCrd);
          return {curMinCrd, isNonEmpty};
        });

    builder.setInsertionPointAfter(forOp.loops.front());
    // minOffset = minCrd + 1 >= size ? minCrd + 1 - size : c0
    Value tmp = ADDI(forOp.results.front(), c1);
    auto [size, stride] = sliceMeta[tid][lvl][info.depth];
    assert(stride == 1 && "Not yet implemented");
    Value minOffset = SUBI(tmp, size);
    Value p = CMPI(uge, tmp, size);
    minOffset = SELECT(p, minOffset, c0);

    SmallVector<Value, 3> yields;
    yields.assign(forOp.results.begin(), forOp.results.end());
    yields.push_back(minOffset);
    YIELD(yields);
  }

  Value nextMinCrd = ifOp.getResults()[0];
  Value nextNonEmpty = ifOp.getResults()[1];

  // The next offset should be at least offset + 1.
  Value minOffset = ifOp.getResults()[2];
  Value nxOffset = ADDI(info.offset, c1);
  Value maxPred = CMPI(ugt, minOffset, nxOffset);
  Value nextAbsOffset = SELECT(maxPred, minOffset, nxOffset);

  auto [size, stride] = sliceMeta[tid][lvl][info.depth];
  assert(stride == 1 && "Not yet implemented");
  Value sliceUB = ADDI(nextAbsOffset, size);

  // FIXME: this only works if there is only one parent.
  assert(info.depth - 1 == 0);
  // nextNonEmpty = nextNonEmpty && slice upper bound <= parent upper bound.
  nextNonEmpty = ANDI(nextNonEmpty, CMPI(ule, sliceUB, lvlSizes[tid][lvl]));
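  // For example (purely illustrative): with size == 3 and a parent level size
  // of 10, a next offset of 8 gives sliceUB == 11 > 10, so the next slice is
  // marked empty.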

  // FIXME: compute relative offset.
  assert(info.depth - 1 == 0);
  return std::make_tuple(nextNonEmpty, nextMinCrd, nextAbsOffset);
}

#undef CMPI
#undef C_IDX
#undef YIELD
#undef ADDI
#undef ANDI
#undef SUBI
#undef MULI
#undef REMUI
#undef DIVUI
#undef SELECT