//===- LoopEmitter.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "LoopEmitter.h"

#include "CodegenUtils.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"

using namespace mlir;
using namespace mlir::sparse_tensor;

//===----------------------------------------------------------------------===//
// File local helper functions.
//===----------------------------------------------------------------------===//

/// Generates a position/coordinate load from the sparse storage scheme.
/// Narrower data types need to be zero extended before casting the
/// value into the `Index` type used for looping and indexing.
static Value genIndexLoad(OpBuilder &builder, Location loc, Value mem,
                          Value s) {
  // For the scalar case, we simply zero extend narrower indices into 64-bit
  // values before casting to index without a performance penalty. Here too,
  // however, indices that already are 64-bit, in theory, cannot express the
  // full range as explained above.
  Value load = builder.create<memref::LoadOp>(loc, mem, s);
  if (!load.getType().isa<IndexType>()) {
    if (load.getType().getIntOrFloatBitWidth() < 64)
      load = builder.create<arith::ExtUIOp>(loc, builder.getI64Type(), load);
    load =
        builder.create<arith::IndexCastOp>(loc, builder.getIndexType(), load);
  }
  return load;
}

static Value genSliceOffset(OpBuilder &builder, Location loc, Value tensor,
                            Level lvl) {
  auto enc = getSparseTensorEncoding(tensor.getType());
  // FIXME: `toOrigDim` is deprecated
  return createOrFoldSliceOffsetOp(builder, loc, tensor, toOrigDim(enc, lvl));
}

static Value genSliceStride(OpBuilder &builder, Location loc, Value tensor,
                            Level lvl) {
  auto enc = getSparseTensorEncoding(tensor.getType());
  // FIXME: `toOrigDim` is deprecated
  return createOrFoldSliceStrideOp(builder, loc, tensor, toOrigDim(enc, lvl));
}

/// Converts a coordinate relative to the slice to the coordinate relative
/// to the underlying tensor.
// FIXME: that description says "sliceCrd -> tensorCrd"; but the function
// name suggests it should be "tensorCrd -> sliceCrd".
static Value toSliceCrd(OpBuilder &builder, Location loc, Value crd,
                        Value offset, Value stride, Value tensor, Level lvl) {
  // tensorCrd = sliceCrd * stride + offset
  crd = builder.create<arith::MulIOp>(loc, crd, stride);
  crd = builder.create<arith::AddIOp>(loc, crd, offset);
  return crd;
}

/// Converts a coordinate relative to the underlying tensor to the coordinate
/// relative to the slice, returning an extra remainder value.
// FIXME: that description says "tensorCrd -> sliceCrd"; but the function
// name suggests it should be "sliceCrd -> tensorCrd".
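// A small worked example (illustrative only): with offset = 1 and stride = 2,
// tensorCrd = 5 maps to sliceCrd = (5 - 1) / 2 = 2 with remainder 0, while
// tensorCrd = 4 leaves remainder 1 and hence does not belong to the slice.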
static std::pair<Value, Value> fromSliceCrd(OpBuilder &builder, Location loc,
                                            Value crd, Value offset,
                                            Value stride, Value tensor,
                                            Level lvl) {
  // sliceCrd = (tensorCrd - offset) / stride
  crd = builder.create<arith::SubIOp>(loc, crd, offset);
  Value rem = builder.create<arith::RemUIOp>(loc, crd, stride);
  crd = builder.create<arith::DivUIOp>(loc, crd, stride);
  return std::make_pair(crd, rem);
}

std::pair<Value, Value>
LoopEmitter::genSliceLegitPredicate(OpBuilder &builder, Location loc,
                                    Value crd, TensorId tid, Level lvl) {
  assert(isSparseSlices[tid]);
  Value slice = tensors[tid];
  Value offset = sliceOffsets[tid][lvl];
  Value stride = sliceStrides[tid][lvl];
  auto enc = getSparseTensorEncoding(slice.getType());

  const auto [newCrd, crdRem] =
      fromSliceCrd(builder, loc, crd, offset, stride, slice, lvl);

  SmallVector<Value, 3> conds; // at most 3 conditions

  // First, coord >= offset (skip the check if offset is known to be 0).
  if (auto staticOffset = enc.getStaticLvlSliceOffset(lvl);
      !(staticOffset.has_value() && *staticOffset == 0)) {
    auto geOffset = builder.create<arith::CmpIOp>(
        loc, arith::CmpIPredicate::uge, crd, offset);
    conds.push_back(geOffset);
  }

  // Second, coord_in_slice < length
  auto ltLength = builder.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::ult, newCrd, lvlSizes[tid][lvl]);
  conds.push_back(ltLength);

  // Third, rem == 0 (skip the check if stride is known to be 1).
  if (auto staticStride = enc.getStaticLvlSliceStride(lvl);
      !(staticStride.has_value() && *staticStride == 1)) {
    auto fitStride = builder.create<arith::CmpIOp>(
        loc, arith::CmpIPredicate::eq, crdRem, constantIndex(builder, loc, 0));
    conds.push_back(fitStride);
  }

  // Must meet all conditions to be a valid coordinate in the slice.
  auto pred = conds.front();
  for (auto cond : ValueRange(conds).drop_front())
    pred = builder.create<arith::AndIOp>(loc, pred, cond);

  return {newCrd, pred};
}

//===----------------------------------------------------------------------===//
// Sparse tensor loop emitter class implementations
//===----------------------------------------------------------------------===//

Value LoopEmitter::genAddress(OpBuilder &builder, Location loc, TensorId tid,
                              Level lvl, Value crd) {
  Value pos = lvl == 0 ? constantIndex(builder, loc, 0) : posits[tid][lvl - 1];
  Value mul = builder.create<arith::MulIOp>(loc, highs[tid][lvl], pos);
  if (isSparseSlices[tid])
    crd = toSliceCrd(builder, loc, crd, sliceOffsets[tid][lvl],
                     sliceStrides[tid][lvl], tensors[tid], lvl);
  Value add = builder.create<arith::AddIOp>(loc, mul, crd);
  return add;
}

Value LoopEmitter::genSegmentHigh(OpBuilder &builder, Location loc,
                                  TensorId tid, Level lvl, Value pLo,
                                  Value pHi) {
  const auto coordinates = coordinatesBuffers[tid][lvl];
  const auto sameCrd = genIndexLoad(builder, loc, coordinates, pLo);
  auto whileOp = builder.create<scf::WhileOp>(
      loc, builder.getIndexType(), pLo,
      /*beforeBuilder=*/
      [pHi, coordinates, sameCrd](OpBuilder &builder, Location loc,
                                  ValueRange ivs) {
        const auto pos = ivs[0];
        Value inBound = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::ult, pos, pHi);
        auto ifInBound =
            builder.create<scf::IfOp>(loc, builder.getI1Type(), inBound, true);
        {
          OpBuilder::InsertionGuard guard(builder);
          // Load the next coordinate only when in bound (to avoid OOB
          // accesses).
          builder.setInsertionPointToStart(ifInBound.thenBlock());
          Value crd = genIndexLoad(builder, loc, coordinates, pos);
          Value isSameCrd = builder.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::eq, crd, sameCrd);
          builder.create<scf::YieldOp>(loc, isSameCrd);
          // Else, the position is out of bound; yield false to terminate the
          // loop.
          builder.setInsertionPointToStart(ifInBound.elseBlock());
          builder.create<scf::YieldOp>(loc, constantI1(builder, loc, false));
        }
        builder.create<scf::ConditionOp>(loc, ifInBound.getResults()[0], ivs);
      },
      /*afterBuilder=*/
      [](OpBuilder &builder, Location loc, ValueRange ivs) {
        // pos ++
        Value nextPos = builder.create<arith::AddIOp>(
            loc, ivs[0], constantIndex(builder, loc, 1));
        builder.create<scf::YieldOp>(loc, nextPos);
      });
  // Return the segment high.
  return whileOp.getResult(0);
}

Value LoopEmitter::genSparseCrd(OpBuilder &builder, Location loc, TensorId tid,
                                Level dstLvl) {
  Value crd = constantIndex(builder, loc, 0);
  const auto reassoc = getCollapseReassociation(tid, dstLvl);
  const unsigned reassocSize = reassoc.size();
  for (unsigned i = 0; i < reassocSize; i++) {
    const Level srcLvl = reassoc[i];
    // A load on the coordinates array yields the coordinate.
    const Value mem = coordinatesBuffers[tid][srcLvl];
    /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
    const Value pos = posits[tid][dstLvl];
    const Value off = genIndexLoad(builder, loc, mem, pos);
    // Linearize the coordinates within the same collapse reassociation.
    crd = builder.create<arith::AddIOp>(loc, crd, off);
    if (i != reassocSize - 1) {
      crd = builder.create<arith::MulIOp>(loc, crd,
                                          this->lvlSizes[tid][reassoc[i + 1]]);
    }
  }
  return crd;
}

LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput,
                         bool isSparseOut, ArrayRef<LoopId> topSort,
                         DependentLvlGetter dimGetter) {
  initialize(tensors, loopTag, hasOutput, isSparseOut, topSort, dimGetter);
}

void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
                             bool isSparseOut, ArrayRef<LoopId> topSort,
                             DependentLvlGetter dimGetter) {
  // First initialize the top-level type of the fields.
  this->loopTag = loopTag;
  this->hasOutput = hasOutput;
  this->isSparseOut = isSparseOut;

  const TensorId numTensors = ts.size();
  this->tensors.assign(ts.begin(), ts.end());
  this->lvlTypes.assign(numTensors, std::vector<DimLevelType>());
  this->lvlSizes.assign(numTensors, std::vector<Value>());
  this->highs.assign(numTensors, std::vector<Value>());
  this->segHi.assign(numTensors, std::vector<Value>());
  this->posits.assign(numTensors, std::vector<Value>());
  this->coords.assign(numTensors, std::vector<Value>());
  this->positionsBuffers.assign(numTensors, std::vector<Value>());
  this->coordinatesBuffers.assign(numTensors, std::vector<Value>());
  this->valBuffer.assign(numTensors, nullptr);
  this->collapseReassoc.assign(numTensors, nullptr);
  this->isSparseSlices.assign(numTensors, false);
  this->sliceOffsets.assign(numTensors, std::vector<Value>());
  this->sliceStrides.assign(numTensors, std::vector<Value>());

  const LoopOrd numLoops = topSort.size();
  // These zeros will be overwritten below, but we need to initialize
  // them to something since we'll need random-access assignment.
  this->loopIdToOrd.assign(numLoops, 0);
  this->loopStack.reserve(numLoops);
  this->loopSeqStack.reserve(numLoops);

  this->dependentLvlMap.assign(
      numTensors, std::vector<std::vector<std::pair<TensorId, Level>>>());

  // Initialize nested types of `TensorId`-indexed fields.
  for (TensorId tid = 0; tid < numTensors; tid++) {
    const Value t = tensors[tid];
    // A scalar or 0-dimensional tensor.
    if (isZeroRankedTensorOrScalar(t.getType()))
      continue;

    auto rtp = getRankedTensorType(t);
    if (auto reshape = t.getDefiningOp<tensor::CollapseShapeOp>();
        isUniqueCOOType(rtp) && reshape) {
      // TODO: Support more kinds of sparse tensors.
      // FIXME: We should instead lower reshape operations on sparse tensors
      // to view changes.
      collapseReassoc[tid] = reshape.getReassociation();
      rtp = reshape.getSrcType();
      // Overwrite the tensor with the source tensor of the reshape operation.
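      // From here on, per-level queries for this tensor refer to the levels
      // of the pre-collapse source tensor; `collapseReassoc` records how the
      // collapsed levels map back onto them (see getCollapseReassociation).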
      tensors[tid] = reshape.getSrc();
    }
    const SparseTensorType stt(rtp);
    const Level lvlRank = stt.getLvlRank();
    // We always treat sparse output tensors as dense so that we always
    // iterate them based on the level size.
    if (stt.hasEncoding() && !(isOutputTensor(tid) && isSparseOut)) {
      const auto enc = stt.getEncoding();
      isSparseSlices[tid] = enc.isSlice();
      for (auto lvlTp : enc.getDimLevelType())
        lvlTypes[tid].push_back(lvlTp);
    } else {
      lvlTypes[tid].assign(lvlRank, DimLevelType::Dense);
    }

    // Initialize using empty values.
    lvlSizes[tid].assign(lvlRank, Value());
    highs[tid].assign(lvlRank, Value());
    segHi[tid].assign(lvlRank, Value());
    posits[tid].assign(lvlRank, Value());
    coords[tid].assign(lvlRank, Value());
    positionsBuffers[tid].assign(lvlRank, Value());
    coordinatesBuffers[tid].assign(lvlRank, Value());
    sliceOffsets[tid].assign(lvlRank, Value());
    sliceStrides[tid].assign(lvlRank, Value());

    dependentLvlMap[tid].assign(lvlRank,
                                std::vector<std::pair<TensorId, Level>>());
    if (dimGetter)
      for (Level l = 0; l < lvlRank; l++)
        dependentLvlMap[tid][l] = dimGetter(tid, l);
  }

  // Construct the inverse of the `topSort` from the sparsifier.
  // This is needed to map `AffineDimExpr`s back to the `LoopOrd`
  // used in the loop emitter.
  // FIXME: This map should be maintained outside the loop emitter.
  for (LoopOrd n = 0; n < numLoops; n++)
    loopIdToOrd[topSort[n]] = n;
}

void LoopEmitter::initializeLoopEmit(OpBuilder &builder, Location loc,
                                     LoopEmitter::OutputUpdater updater) {
  // For every tensor:
  // * get the values buffer.
  // * For every level:
  //   * get the positions and coordinates buffers
  //   * get/compute the level-size, which is also used as the upper-bound
  //     on positions.
  for (TensorId t = 0, numTensors = getNumTensors(); t < numTensors; t++) {
    const Value tensor = tensors[t];
    const auto rtp = tensor.getType().dyn_cast<RankedTensorType>();
    if (!rtp)
      // Skip only scalars; zero-ranked tensors still need to be bufferized
      // and (probably) filled with zeros by users.
      continue;
    // FIXME: the definition of `lvlRank` looks more like a dim-rank;
    // but the variable is used as a level everywhere below, which
    // suggests there may be some dim/lvl confusion going on here.
    const Level lvlRank = rtp.getRank();
    const auto shape = rtp.getShape();
    const auto enc = getSparseTensorEncoding(rtp);
    const Level cooStart = enc ? getCOOStart(enc) : lvlRank;
    // Scan all levels of the current tensor.
    for (Level l = 0; l < lvlRank; l++) {
      // This should be called only once at the beginning.
      assert(!positionsBuffers[t][l] && !coordinatesBuffers[t][l] &&
             !highs[t][l]);
      const auto lvlTp = lvlTypes[t][l];
      // Handle sparse storage schemes.
      if (isCompressedDLT(lvlTp)) {
        // Generate sparse primitives to obtain positions and coordinates.
        positionsBuffers[t][l] = genToPositions(builder, loc, tensor, l);
        coordinatesBuffers[t][l] =
            genToCoordinates(builder, loc, tensor, l, cooStart);
      } else if (isSingletonDLT(lvlTp)) {
        // Singleton level, fetch coordinates.
        coordinatesBuffers[t][l] =
            genToCoordinates(builder, loc, tensor, l, cooStart);
      } else {
        // Dense level, nothing to fetch.
        assert(isDenseDLT(lvlTp));
      }

      // FIXME: `toOrigDim` is deprecated.  For now this relies on the
      // 1:1 mapping between levels and dimensions, since nowhere else
      // in the code supports HigherOrdering yet either.
      Value lvlSz = mlir::linalg::createOrFoldDimOp(builder, loc, tensor,
                                                    toOrigDim(enc, l));
      // Find the upper bound in the current dimension.
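      // Note: for dense levels this size is also the loop upper bound, while
      // for compressed/singleton levels the actual position bounds are loaded
      // later in prepareLoopOverTensorAtLvl and overwrite this default.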
      highs[t][l] = lvlSizes[t][l] = lvlSz;
      if (isSparseSlices[t]) {
        sliceOffsets[t][l] = genSliceOffset(builder, loc, tensors[t], l);
        sliceStrides[t][l] = genSliceStride(builder, loc, tensors[t], l);
      }
    }

    // Perform the required bufferization. Dense inputs materialize from the
    // input tensors. Sparse inputs use sparse primitives to obtain the
    // values. Delegate extra output initialization to clients.
    bool isOutput = isOutputTensor(t);
    Type elementType = rtp.getElementType();
    if (!enc) {
      // Non-annotated dense tensors.
      BaseMemRefType denseTp = MemRefType::get(shape, elementType);

      // TODO: if we unconditionally use fully dynamic layout here, it breaks
      // some vectorization passes which require static stride = 1.
      // Is it possible to call the vectorization pass after bufferization?
      if (llvm::isa_and_nonnull<tensor::ExtractSliceOp>(tensor.getDefiningOp()))
        denseTp = bufferization::getMemRefTypeWithFullyDynamicLayout(rtp);

      Value denseVal =
          builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
      // Dense outputs need special handling.
      if (isOutput && updater)
        denseVal = updater(builder, loc, denseVal, tensor);

      valBuffer[t] = denseVal;
    } else {
      // Annotated sparse tensors.
      // We also need the value buffer for all-dense annotated "sparse"
      // tensors.
      valBuffer[t] = genToValues(builder, loc, tensor);
    }
    // NOTE: we can also prepare for level 0 here in advance, which would
    // hoist some loop preparation from tensor iteration, but would also
    // (undesirably) hoist the code outside if-conditions.
  }
}

void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
                                  ArrayRef<TensorId> tids,
                                  ArrayRef<Level> lvls) {
  // TODO: sort
  assert(loopSeqStack.size() == loopStack.size());
  // Universal Index starts from 0.
  loopSeqStack.emplace_back(constantIndex(builder, loc, 0));
  // Prepare for all the tensors used in the current loop sequence.
  assert(tids.size() == lvls.size());
  for (auto [tid, lvl] : llvm::zip(tids, lvls))
    prepareLoopOverTensorAtLvl(builder, loc, tid, lvl);
}

Value LoopEmitter::genAffine(OpBuilder &builder, Location loc, AffineExpr a) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    // FIXME: since the one callsite in Sparsification passes in a
    // level-expression, the `getPosition` must in fact be a `Dimension`.
    // However, elsewhere we have been led to expect that `loopIdToOrd`
    // should be indexed by `LoopId`...
    const LoopId i = a.cast<AffineDimExpr>().getPosition();
    return loopStack[loopIdToOrd[i]].iv;
  }
  case AffineExprKind::Add: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return builder.create<arith::AddIOp>(
        loc, genAffine(builder, loc, binOp.getLHS()),
        genAffine(builder, loc, binOp.getRHS()));
  }
  case AffineExprKind::Mul: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return builder.create<arith::MulIOp>(
        loc, genAffine(builder, loc, binOp.getLHS()),
        genAffine(builder, loc, binOp.getRHS()));
  }
  case AffineExprKind::Constant: {
    int64_t c = a.cast<AffineConstantExpr>().getValue();
    return constantIndex(builder, loc, c);
  }
  default:
    llvm_unreachable("unexpected affine subscript");
  }
}

Operation *LoopEmitter::enterLoopOverTensorAtLvl(
    OpBuilder &builder, Location loc, ArrayRef<TensorId> tids,
    ArrayRef<Level> lvls, MutableArrayRef<Value> reduc, bool isParallel) {
  // TODO: support multiple return on parallel for?
  assert(!isParallel || reduc.size() <= 1);
  bool isSparseInput = false;
  TensorId tid = tids.front();
  Level dstLvl = lvls.front();
  assert(tids.size() == lvls.size());
  for (auto [t, l] : llvm::zip(tids, lvls)) {
    // TODO: this check for validity of the (t, l) pairs should be
    // checked/enforced at the callsites, if possible.
    assert(t < lvlTypes.size() && l < lvlTypes[t].size());
    assert(!coords[t][l]); // We cannot re-enter the same level.
    const auto lvlTp = lvlTypes[t][l];
    const bool isSparse = isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp);
    // Must be a recognizable level-type.
    assert(isSparse || isDenseDLT(lvlTp));
    // We can have at most one sparse input; otherwise, a while loop is
    // required to co-iterate multiple sparse tensors.
    assert(!isSparseInput || !isSparse);
    if (isSparse) {
      tid = t;
      dstLvl = l;
    }
    isSparseInput = isSparseInput || isSparse;
  }

  const auto reassoc = getCollapseReassociation(tid, dstLvl);
  // TODO: support dynamic slices.
  // Use the first source-level here to build the loop bound (which is
  // also the biggest range).
  const Level srcLvl = reassoc.front();
  const Value step = constantIndex(builder, loc, 1);
  /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
  const Value lo = isSparseInput ? posits[tid][srcLvl]  // current position
                                 : loopSeqStack.back(); // universal index
  const Value hi = highs[tid][srcLvl];

  Operation *loop = nullptr;
  Value iv;
  if (isParallel) {
    assert(collapseReassoc[tid] == nullptr);
    scf::ParallelOp parOp =
        builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(parOp.getBody());
    assert(parOp.getNumReductions() == reduc.size());
    iv = parOp.getInductionVars()[0];

    // In-place update on the reduction variable vector.
    // Note that the init vals are not the actual reduction variables but are
    // instead used as a "special handle" to (temporarily) represent them. The
    // expressions on the init vals will be moved into scf.reduce and replaced
    // with the block arguments when exiting the loop (see exitForLoop). This
    // is needed as we cannot build the actual reduction block and get the
    // actual reduction variable before users fill in the parallel loop body.
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = parOp.getInitVals()[i];
    loop = parOp;
  } else {
    scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(forOp.getBody());
    iv = forOp.getInductionVar();

    // In-place update on the reduction variable vector.
    assert(forOp.getNumRegionIterArgs() == reduc.size());
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = forOp.getRegionIterArg(i);
    loop = forOp;
  }
  assert(loop && iv);

  Value crd;
  if (isSparseInput) {
    assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
    // For COO, the position is the same across consecutive levels.
    /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
    llvm::for_each(reassoc,
                   [this, tid, iv](Level srcLvl) { posits[tid][srcLvl] = iv; });
    crd = genSparseCrd(builder, loc, tid, dstLvl);
  } else {
    // Dense tensor, the coordinate is the induction variable.
    crd = iv;
  }

  if (isSparseSlices[tid] && isSparseInput) {
    // For sparse level slices, we need to filter out invalid coordinates
    // that are not included in the slice.
    SmallVector<Type> types;
    for (Value red : reduc)
      types.push_back(red.getType());

    auto [trans, pred] = genSliceLegitPredicate(builder, loc, crd, tid, srcLvl);
    bool hasReduc = !types.empty();
    scf::IfOp ifOp = builder.create<scf::IfOp>(loc, types, pred,
                                               /*else*/ hasReduc);
    if (hasReduc) {
      // scf.for (a) -> v
      //  %s = scf.if (a) -> v
      //    user-generated code.
      //  else
      //    yield a
      //  yield %s
      builder.create<scf::YieldOp>(loc, ifOp.getResults());
      builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
      // On mismatch.
      builder.create<scf::YieldOp>(loc, reduc);
    }
    // Set the insertion point to the matched branch.
    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
    crd = trans;
  }

  assert(crd);
  coords[tid][srcLvl] = crd;
  // NOTE: we can also prepare for the next level here in advance.
  // Push the loop onto the stack.
  loopStack.emplace_back(ArrayRef<TensorId>(tid), ArrayRef<Level>(srcLvl),
                         loop, builder.getInsertionBlock(), crd, loopTag);
  // Emit extra locals.
  emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tids, lvls);
  return loop;
}

Operation *LoopEmitter::enterFilterLoopOverTensorAtLvl(
    OpBuilder &builder, Location loc, TensorId tid, Level lvl,
    AffineExpr affine, MutableArrayRef<Value> reduc) {
  assert(tid < lvlTypes.size() && lvl < lvlTypes[tid].size());
  assert(!affine.isa<AffineDimExpr>() && !isDenseDLT(lvlTypes[tid][lvl]));
  // We cannot re-enter the same level.
  assert(!coords[tid][lvl]);

  // TODO: We should instead use a whileOp for the filter loop to allow early
  // break when exceeding (for ordered levels).
  // TODO: There are many other potential opportunities that we might apply in
  // the future. E.g., we could use binary search to locate positions.
  const Value step = constantIndex(builder, loc, 1);
  const Value pLo = posits[tid][lvl];
  const Value pHi = highs[tid][lvl];
  scf::ForOp forOp = builder.create<scf::ForOp>(loc, pLo, pHi, step, reduc);

  // In-place update on the reduction variable vector.
  assert(forOp.getNumRegionIterArgs() == reduc.size());
  for (int i = 0, e = reduc.size(); i < e; i++)
    reduc[i] = forOp.getRegionIterArg(i);

  builder.setInsertionPointToStart(forOp.getBody());
  // The induction variable gives the position.
  const Value pos = forOp.getInductionVar();
  posits[tid][lvl] = pos;
  // Generating a load on the coordinates array yields the crd.
  const Value mem = coordinatesBuffers[tid][lvl];
  const Value crd = genIndexLoad(builder, loc, mem, pos);
  coords[tid][lvl] = crd;

  // Generate an if-condition to filter out coordinates that are not
  // equal to the result of the affine expression.
  Value expected = genAffine(builder, loc, affine);
  auto pred = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, crd,
                                            expected);
  SmallVector<Type> types;
  for (Value red : reduc) {
    types.push_back(red.getType());
  }

  bool hasReduc = !types.empty();
  scf::IfOp ifOp =
      builder.create<scf::IfOp>(loc, types, pred, /*else*/ hasReduc);
  if (hasReduc) {
    // scf.for (a) -> v
    //  %s = scf.if (a) -> v
    //    user-generated code.
    //  else
    //    yield a
    //  yield %s
    builder.create<scf::YieldOp>(loc, ifOp.getResults());
    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
    // On mismatch.
    builder.create<scf::YieldOp>(loc, reduc);
  }
  // Set the insertion point to the matched branch.
  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());

  // NOTE: we can also prepare for the next lvl here in advance.
  // Push the loop onto the stack.
  loopStack.emplace_back(ArrayRef<TensorId>(tid), ArrayRef<Level>(lvl), forOp,
                         builder.getInsertionBlock(), crd, nullptr);
  return forOp;
}

void LoopEmitter::genDenseAffineAddress(OpBuilder &builder, Location loc,
                                        TensorId tid, Level lvl,
                                        AffineExpr lvlExpr) {
  assert(isDenseDLT(lvlTypes[tid][lvl]));
  // For dense levels, the level-coordinate also serves as the position.
  Value lvlCrd = genAffine(builder, loc, lvlExpr);
  posits[tid][lvl] = genAddress(builder, loc, tid, lvl, lvlCrd);
}

Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls(
    OpBuilder &builder, Location loc, ArrayRef<TensorId> tids,
    ArrayRef<Level> lvls, bool needsUniv, MutableArrayRef<Value> reduc) {
  assert(tids.size() == lvls.size());
  SmallVector<Type> types;
  SmallVector<Value> operands;
  // Construct the while-loop with a parameter for each coordinate.
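  // Schematically (a rough sketch, not the exact IR; names are illustrative),
  // co-iterating two sparse tensors with positions %p0/%p1 and upper bounds
  // %h0/%h1 emits:
  //   %r:N = scf.while (%p0, %p1, ...) {
  //     %c0 = arith.cmpi ult, %p0, %h0
  //     %c1 = arith.cmpi ult, %p1, %h1
  //     %c  = arith.andi %c0, %c1
  //     scf.condition(%c) %p0, %p1, ...
  //   } do {
  //     // load the coordinates, compute their minimum, emit the user code,
  //     // then advance the position(s) whose coordinate equals the minimum.
  //   }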
  const Type indexType = builder.getIndexType();
  for (auto [tid, lvl] : llvm::zip(tids, lvls)) {
    const auto lvlTp = lvlTypes[tid][lvl];
    if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) {
      const auto reassoc = getCollapseReassociation(tid, lvl);
      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
        if (!isUniqueDLT(lvlTypes[tid][reassoc[i]])) {
          // This is the segment high for each non-unique level.
          types.push_back(indexType);
          operands.push_back(constantIndex(builder, loc, 0));
        }
      }
      const auto pos = posits[tid][reassoc.front()];
      assert(pos);
      types.push_back(indexType);
      operands.push_back(pos);
    }
  }
  // The position where the user-supplied reduction variables start.
  for (Value rec : reduc) {
    types.push_back(rec.getType());
    operands.push_back(rec);
  }
  if (needsUniv) {
    types.push_back(indexType);
    // Update universal index.
    operands.push_back(loopSeqStack.back());
  }
  assert(types.size() == operands.size());
  scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);

  SmallVector<Location> locs(types.size(), loc);
  Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
  Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);

  // Build the "before" region, which effectively consists
  // of a conjunction of "i < upper" tests on all induction variables.
  builder.setInsertionPointToStart(&whileOp.getBefore().front());
  Value cond;
  unsigned o = 0;
  for (auto [t, lvl] : llvm::zip(tids, lvls)) {
    unsigned tid = t; // Why can `t` not be captured by the lambda?
    const auto lvlTp = lvlTypes[tid][lvl];
    if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) {
      const auto reassoc = getCollapseReassociation(tid, lvl);
      assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
        if (!isUniqueDLT(lvlTypes[tid][reassoc[i]])) {
          // Links the SSA chain for segHi.
          segHi[tid][reassoc[i]] = after->getArgument(o++);
        }
      }
      Value op1 = before->getArgument(o);
      // We use the first level bound as the bound for the collapsed set of
      // levels.
      Value op2 = highs[tid][reassoc.front()];
      Value opc = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
                                                op1, op2);
      cond = cond ? builder.create<arith::AndIOp>(loc, cond, opc) : opc;
      // Update positions.
      Value pos = after->getArgument(o++);
      // For COO, the position is the same across consecutive levels.
      /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
      llvm::for_each(reassoc, [this, tid, pos](Level srcLvl) {
        posits[tid][srcLvl] = pos;
      });
    }
  }
  builder.create<scf::ConditionOp>(loc, cond, before->getArguments());

  // Generate the while body.
  builder.setInsertionPointToStart(&whileOp.getAfter().front());

  SmallVector<std::pair<Value, unsigned>> slicesPreds;
  unsigned i = 0;
  for (auto [tid, lvl] : llvm::zip(tids, lvls)) {
    // Prepare for the next level.
    const auto lvlTp = lvlTypes[tid][lvl];
    if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) {
      coords[tid][lvl] = genSparseCrd(builder, loc, tid, lvl);
      if (isSparseSlices[tid]) {
        auto [trans, pred] =
            genSliceLegitPredicate(builder, loc, coords[tid][lvl], tid, lvl);
        slicesPreds.emplace_back(pred, i);
        // Update to the coordinate relative to the slice.
        coords[tid][lvl] = trans;
      }
      i++;
    }
  }

  if (!slicesPreds.empty()) {
    // Skip invalid loop iterations when the slice coordinate is inapplicable.
    SmallVector<Value> yields(after->getArguments());
    // Generate a list of if statements
    //  pos = in_slice ? pos : pos + 1
    // TODO: instead of always picking pos + 1, we should set pos = high to
    // break the loop if the coordinates are larger than the slice size.
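    //
    // As a rough sketch (names illustrative), with two sliced operands the
    // guard emitted below looks like:
    //   %q0  = arith.select %legit0, %pos0, %pos0 + 1
    //   %q1  = arith.select %legit1, %pos1, %pos1 + 1
    //   %all = arith.andi %legit0, %legit1
    //   scf.if %all { ... user code ... } else { scf.yield %q0, %q1, ... }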
// // This "idx" is the index into `llvm::zip(tids, lvls)` for (auto [pred, idx] : slicesPreds) { Value nextPos = builder.create( loc, yields[idx], constantIndex(builder, loc, 1)); yields[idx] = builder.create(loc, pred, yields[idx], nextPos); } Value pred = slicesPreds.front().first; for (int i = 1, e = slicesPreds.size(); i < e; i++) { pred = builder.create(loc, pred, slicesPreds[i].first); } auto ifOp = builder.create(loc, types, pred, /*else*/ true); ifOp->setAttr(getLoopEmitterLoopAttrName(), StringAttr::get(builder.getContext(), "slice")); builder.create(loc, ifOp->getResults()); assert(types.size() == yields.size()); // If not all slices are legit builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); builder.create(loc, yields); // If all slices are legit, start the user generated code. builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); } Value min; // Finds the minimum coordinate if (!needsUniv) { for (auto [tid, lvl] : llvm::zip(tids, lvls)) { const auto lvlTp = lvlTypes[tid][lvl]; if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) { const auto crd = coords[tid][lvl]; if (min) { Value cmp = builder.create( loc, arith::CmpIPredicate::ult, crd, min); min = builder.create(loc, cmp, crd, min); } else { min = crd; } } } } else { assert(!min); // Otherwise, universal index is the minimal pos. min = after->getArguments().back(); } // Sets up the loop stack. loopStack.emplace_back(tids, lvls, whileOp, builder.getInsertionBlock(), min, loopTag); assert(loopStack.size() == loopSeqStack.size()); for (auto [tid, dstLvl] : llvm::zip(tids, lvls)) { const auto reassoc = getCollapseReassociation(tid, dstLvl); assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType())); // TODO: Refactors this into smaller functions. // NOTE: For all the collapsed level (except for the last one, that is why // the loop ends with `reassoc.size() - 1`), as each iteration is advanced // by the segment size of the last level, which does not always invalidate // the segment size for the previous levels, thus we need to propagate the // segment sizes across loop iterations and only forward if needed. // // E.g., for a COO tensor with the following coordinates array. // (0, 0, 1), // (0, 0, 2), // (1, 1, 1), // segHi[lvl=0] = segHi[lvl=1] = 2 // segHi[lvl=2] = 1, // the first iteration does not invalidate segHi[0] and segHi[1] for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) { const Level srcLvl = reassoc[i]; if (!isUniqueDLT(lvlTypes[tid][srcLvl])) { const Value pos = posits[tid][srcLvl]; const auto oldSegHi = segHi[tid][srcLvl]; assert(oldSegHi); Value newSegHi = builder.create( loc, arith::CmpIPredicate::uge, pos, oldSegHi); auto ifNewSegHi = builder.create(loc, builder.getIndexType(), newSegHi, true); { OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(ifNewSegHi.thenBlock()); builder.create(loc, genSegmentHigh(builder, loc, tid, srcLvl, pos, highs[tid][srcLvl])); // Else, resues the same segment high. builder.setInsertionPointToStart(ifNewSegHi.elseBlock()); builder.create(loc, oldSegHi); } highs[tid][srcLvl + 1] = segHi[tid][srcLvl] = ifNewSegHi.getResult(0); } }; const auto srcLvl = reassoc.back(); if (!isUniqueDLT(lvlTypes[tid][srcLvl])) { segHi[tid][srcLvl] = genSegmentHigh( builder, loc, tid, srcLvl, posits[tid][srcLvl], highs[tid][srcLvl]); } } // Emits extra locals emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tids, lvls); // Updates reduction variables assert(after->getNumArguments() == o + reduc.size() + (needsUniv ? 
1 : 0)); // In-place update on reduction variable. for (unsigned i = 0, e = reduc.size(); i < e; i++) reduc[i] = after->getArgument(o + i); return whileOp; } void LoopEmitter::prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc, TensorId tid, Level dstLvl) { assert(tid < lvlTypes.size() && dstLvl < lvlTypes[tid].size()); const auto lvlTp = lvlTypes[tid][dstLvl]; if (isDenseDLT(lvlTp)) return; const Value c0 = constantIndex(builder, loc, 0); const Value c1 = constantIndex(builder, loc, 1); for (const Level srcLvl : getCollapseReassociation(tid, dstLvl)) { // Either the first level, or the previous level has been set. /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header. assert(srcLvl == 0 || posits[tid][srcLvl - 1]); if (!isCompressedDLT(lvlTp) && !isSingletonDLT(lvlTp)) continue; if (isCompressedDLT(lvlTp)) { const Value mem = positionsBuffers[tid][srcLvl]; const Value pLo = srcLvl == 0 ? c0 : posits[tid][srcLvl - 1]; posits[tid][srcLvl] = genIndexLoad(builder, loc, mem, pLo); const Value pHi = builder.create(loc, pLo, c1); highs[tid][srcLvl] = genIndexLoad(builder, loc, mem, pHi); return; } if (isSingletonDLT(lvlTp)) { const Value pLo = srcLvl == 0 ? c0 : posits[tid][srcLvl - 1]; posits[tid][srcLvl] = pLo; // If we are coiterating non-unique levels, then use pHi=segHi; // otherwise use pHi=pLo+1. // NOTE: Just because the level is non-unique, that does not // guarantee that segHi is defined: because we only generate segHi // whenever coiterating, in order to improve code quality for the // non-coiterating cases. const auto theSegHi = segHi[tid][srcLvl - 1]; highs[tid][srcLvl] = (!isUniqueDLT(lvlTypes[tid][srcLvl - 1]) && theSegHi) ? theSegHi : builder.create(loc, pLo, c1); return; } } llvm_unreachable("Unrecognized level-type!"); } void LoopEmitter::emitExtraLocalsForTensorsAtDenseLvls(OpBuilder &builder, Location loc, ArrayRef tids, ArrayRef lvls) { // Initialize dense positions. Note that we generate dense coordinates of the // output tensor unconditionally, since they may not appear in the lattice, // but may be needed for linearized codegen. assert(tids.size() == lvls.size()); for (auto [tid, lvl] : llvm::zip(tids, lvls)) { if (isDenseDLT(lvlTypes[tid][lvl])) { auto enc = getSparseTensorEncoding(tensors[tid].getType()); if (enc && !isSparseOutput(tid)) { bool validPos = lvl == 0 || posits[tid][lvl - 1]; if (!validPos) { // We might not find the pos for the sparse output tensor as it is // unconditionally required by the sparsification. assert(isOutputTensor(tid)); continue; } posits[tid][lvl] = genAddress(builder, loc, tid, lvl, loopStack.back().iv); // NOTE: we can also prepare for next lvl here in advance } } } } void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc, MutableArrayRef reduc) { const LoopInfo &loopInfo = loopStack.back(); rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock); if (auto forOp = llvm::dyn_cast(loopInfo.loop)) { if (!reduc.empty()) { assert(reduc.size() == forOp.getNumResults()); rewriter.create(loc, reduc); } // Exit the loop. rewriter.setInsertionPointAfter(forOp); // In-place update reduction variables. for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++) reduc[i] = forOp.getResult(i); } else { auto parOp = llvm::cast(loopInfo.loop); if (!reduc.empty()) { assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1); Operation *redExp = reduc.front().getDefiningOp(); // Reduction expression should have no use. assert(redExp->getUses().empty()); // This must be a binary operation. 
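      // For example, an arith.addf whose two operands are the init value and
      // the loop-local partial result (illustrative; any commutative binary
      // op works); the expression is cloned into the scf.reduce region below.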
      // NOTE: It is the user's responsibility to ensure the operation is
      // commutative.
      assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);
      Value redVal = parOp.getInitVals().front();
      Value curVal;
      if (redExp->getOperand(0) == redVal)
        curVal = redExp->getOperand(1);
      else if (redExp->getOperand(1) == redVal)
        curVal = redExp->getOperand(0);
      // One of the operands must be the init value (which is also the
      // previous reduction value).
      assert(curVal);
#ifndef NDEBUG
      // The reduction expression should be the only user of the reduction
      // value inside the parallel for.
      unsigned numUsers = 0;
      for (Operation *op : redVal.getUsers()) {
        if (op->getParentOp() == parOp)
          numUsers++;
      }
      assert(numUsers == 1);
#endif // NDEBUG

      rewriter.setInsertionPointAfter(redExp);
      auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
      // Attach to the reduction op.
      Block *redBlock = &redOp.getRegion().getBlocks().front();
      rewriter.setInsertionPointToEnd(redBlock);
      Operation *newRed = rewriter.clone(*redExp);
      // Replace the arguments of the reduction expression with the block
      // arguments from scf.reduce.
      rewriter.updateRootInPlace(
          newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
      // Erase the out-dated reduction expression.
      rewriter.eraseOp(redExp);
      rewriter.setInsertionPointToEnd(redBlock);
      rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
    }
    rewriter.setInsertionPointAfter(parOp);
    // In-place update on the reduction variables.
    for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
      reduc[i] = parOp.getResult(i);
  }

  // Finished iterating a tensor, clean up.
  // We only do the clean up on for loops as while loops do not necessarily
  // finish the iteration on a sparse tensor.
  for (auto [tid, lvl] : llvm::zip(loopInfo.tids, loopInfo.lvls)) {
    // Reset to null.
    coords[tid][lvl] = Value();
    posits[tid][lvl] = Value();
    // Dense level, high is fixed.
    if (!isDenseDLT(lvlTypes[tid][lvl]))
      highs[tid][lvl] = Value();
  }
}

void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc,
                                MutableArrayRef<Value> reduc) {
  const LoopInfo &loopInfo = loopStack.back();
  auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop);
  builder.setInsertionPointToEnd(loopInfo.userCodeBlock);
  Value iv = loopInfo.iv;

  // Finalize the induction. Note that the induction could be performed
  // in the individual if-branches to avoid re-evaluating the conditions.
  // However, that would result in a rather elaborate forest of yield
  // instructions during code generation. Moreover, performing the induction
  // after the if-statements more closely resembles code generated by TACO.
  unsigned o = 0;
  SmallVector<Value> operands;
  Value one = constantIndex(builder, loc, 1);
  for (auto [tid, dstLvl] : llvm::zip(loopInfo.tids, loopInfo.lvls)) {
    const auto lvlTp = lvlTypes[tid][dstLvl];
    if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp)) {
      const auto reassoc = getCollapseReassociation(tid, dstLvl);
      assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
        const Level srcLvl = reassoc[i];
        if (!isUniqueDLT(lvlTypes[tid][srcLvl])) {
          operands.push_back(segHi[tid][srcLvl]);
          o++;
        }
      }
      const Value crd = coords[tid][dstLvl];
      const Value pos = posits[tid][dstLvl];
      Value cmp = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                                crd, iv);
      // If the loop contains a coiteration with a non-unique level, we
      // fast-forward all the duplicated coords by setting the position to
      // the segment high.
      Value add = !isUniqueDLT(lvlTypes[tid][reassoc.back()])
                      ? segHi[tid][reassoc.back()]
                      : builder.create<arith::AddIOp>(loc, pos, one);

      operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, pos));
      // Following loops continue iteration from the break point of the
      // current while loop.
      const Value newPos = whileOp->getResult(o++);
      // We need to define a new local variable for `tid` to avoid
      // warnings about "captured structured bindings are a C++20 extension".
      // FIXME(wrengr): define a helper function to capture this idiom!
      const TensorId newTid = tid;
      llvm::for_each(reassoc, [this, newTid, newPos](Level srcLvl) {
        posits[newTid][srcLvl] = newPos;
      });
      // The coordinate is invalid now.
      coords[tid][dstLvl] = nullptr;
      // The segment high is invalid now.
      segHi[tid][dstLvl] = nullptr;
      // highs remains unchanged.
    }
  }

  // Reduction values from users.
  for (auto &i : reduc) {
    operands.push_back(i);
    // In-place update on the reduction variable.
    i = whileOp->getResult(o++);
  }

  // An (optional) universal index.
  if (operands.size() < whileOp.getNumResults()) {
    assert(operands.size() + 1 == whileOp.getNumResults());
    // The last one is the universal index.
    operands.push_back(builder.create<arith::AddIOp>(loc, iv, one));
    // Update the loop starting point of the current loop sequence.
    loopSeqStack.back() = whileOp->getResult(o++);
  }

  assert(o == operands.size());
  builder.create<scf::YieldOp>(loc, operands);
  builder.setInsertionPointAfter(whileOp);
}

void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc,
                                  MutableArrayRef<Value> reduc) {
  // Clean up the values; this helps us discover potential bugs at an earlier
  // stage (instead of silently using a wrong value).
  const LoopInfo &loopInfo = loopStack.back();
  assert(loopInfo.tids.size() == loopInfo.lvls.size());
  SmallVector<Value> red;
  if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
    exitWhileLoop(rewriter, loc, reduc);
  } else {
    exitForLoop(rewriter, loc, reduc);
  }

  assert(loopStack.size() == loopSeqStack.size());
  loopStack.pop_back();
}