//===- LoopEmitter.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "LoopEmitter.h"

#include "CodegenUtils.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"

using namespace mlir;
using namespace mlir::sparse_tensor;

//===----------------------------------------------------------------------===//
// File local helper functions.
//===----------------------------------------------------------------------===//

/// Generates a position/coordinate load from the sparse storage scheme.
/// Narrower data types need to be zero extended before casting the
/// value into the `Index` type used for looping and indexing.
static Value genIndexLoad(OpBuilder &builder, Location loc, Value mem,
                          Value s) {
  // For the scalar case, we simply zero extend narrower indices into 64-bit
  // values before casting to index without a performance penalty. Here too,
  // however, indices that already are 64-bit, in theory, cannot express the
  // full range as explained above.
  Value load = builder.create<memref::LoadOp>(loc, mem, s);
  if (!load.getType().isa<IndexType>()) {
    if (load.getType().getIntOrFloatBitWidth() < 64)
      load = builder.create<arith::ExtUIOp>(loc, builder.getI64Type(), load);
    load =
        builder.create<arith::IndexCastOp>(loc, builder.getIndexType(), load);
  }
  return load;
}

// TODO: Support dynamic sized slice.
static Value getSliceOffset(OpBuilder &builder, Location loc,
                            SparseTensorEncodingAttr enc, unsigned lvl) {
  return constantIndex(builder, loc, *enc.getStaticLvlSliceOffset(lvl));
}

static Value getSliceSize(OpBuilder &builder, Location loc,
                          SparseTensorEncodingAttr enc, unsigned lvl) {
  return constantIndex(builder, loc, *enc.getStaticLvlSliceSize(lvl));
}

static Value getSliceStride(OpBuilder &builder, Location loc,
                            SparseTensorEncodingAttr enc, unsigned lvl) {
  return constantIndex(builder, loc, *enc.getStaticLvlSliceStride(lvl));
}

// Converts a coordinate relative to the slice to the coordinate relative
// to the underlying tensor.
static Value toSliceCoord(OpBuilder &builder, Location loc, Value v,
                          SparseTensorEncodingAttr enc, unsigned lvl) {
  Value stride = getSliceStride(builder, loc, enc, lvl);
  Value offset = getSliceOffset(builder, loc, enc, lvl);
  // iv = iv * stride + offset
  v = builder.create<arith::MulIOp>(loc, v, stride);
  v = builder.create<arith::AddIOp>(loc, v, offset);
  return v;
}

// Converts a coordinate relative to the underlying tensor to the coordinate
// relative to the slice, returns an extra remainder value.
static std::pair<Value, Value> fromSliceCrd(OpBuilder &builder, Location loc,
                                            Value v,
                                            SparseTensorEncodingAttr enc,
                                            unsigned lvl) {
  Value stride = getSliceStride(builder, loc, enc, lvl);
  Value offset = getSliceOffset(builder, loc, enc, lvl);
  // iv = (iv - offset) / stride
  v = builder.create<arith::SubIOp>(loc, v, offset);
  Value rem = builder.create<arith::RemUIOp>(loc, v, stride);
  v = builder.create<arith::DivUIOp>(loc, v, stride);
  return std::make_pair(v, rem);
}

static std::pair<Value, Value>
genSliceLegitPredicate(OpBuilder &builder, Location loc, Value crd,
                       SparseTensorEncodingAttr enc, unsigned lvl) {
  std::pair<Value, Value> trans = fromSliceCrd(builder, loc, crd, enc, lvl);
  // First, crd >= offset (TODO: seems unsigned >= 0 won't be folded, skip
  // the check if the offset is zero).
  auto geOffset =
      builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::uge, crd,
                                    getSliceOffset(builder, loc, enc, lvl));
  // Second, coord_in_slice < length
  auto ltLength =
      builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
                                    trans.first,
                                    getSliceSize(builder, loc, enc, lvl));
  // Third, rem == 0; confirmed that (a % 1) will be folded to 0
  auto fitStride =
      builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                    trans.second,
                                    constantIndex(builder, loc, 0));
  auto pred = builder.create<arith::AndIOp>(loc, geOffset, ltLength);
  pred = builder.create<arith::AndIOp>(loc, pred, fitStride);
  return {trans.first, pred};
}

//===----------------------------------------------------------------------===//
// Sparse tensor loop emitter class implementations
//===----------------------------------------------------------------------===//

Value LoopEmitter::genAddress(OpBuilder &builder, Location loc, size_t tid,
                              size_t dim, Value iv) {
  Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1];
  Value mul = builder.create<arith::MulIOp>(loc, highs[tid][dim], p);
  if (isSparseSlices[tid]) {
    auto enc = getSparseTensorEncoding(tensors[tid].getType());
    iv = toSliceCoord(builder, loc, iv, enc, dim);
  }
  Value add = builder.create<arith::AddIOp>(loc, mul, iv);
  return add;
}

Value LoopEmitter::genSegmentHigh(OpBuilder &builder, Location loc, size_t tid,
                                  size_t lvl, Value pos, Value pHi) {
  Value prevCrd = genIndexLoad(builder, loc, crdBuffer[tid][lvl], pos);
  // De-duplicates repeated elements.
  //
  // while (pos < pHi && coord[pos] == prev_coord)
  //    pos++;
  // return pos;
  auto whileOp = builder.create<scf::WhileOp>(
      loc, builder.getIndexType(), pos,
      /*beforeBuilder=*/
      [this, tid, lvl, pHi, prevCrd](OpBuilder &builder, Location loc,
                                     ValueRange ivs) {
        Value inBound = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::ult, ivs[0], pHi);
        auto ifOp =
            builder.create<scf::IfOp>(loc, builder.getI1Type(), inBound, true);
        {
          OpBuilder::InsertionGuard guard(builder);
          // Load the next coordinates only when inbound (to avoid OOB
          // accesses).
          builder.setInsertionPointToStart(ifOp.thenBlock());
          Value nxCrd = genIndexLoad(builder, loc, crdBuffer[tid][lvl], ivs[0]);
          Value cont = builder.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::eq, nxCrd, prevCrd);
          builder.create<scf::YieldOp>(loc, cont);
          // Else, the position is out of bound, yield false to terminate the
          // loop.
          builder.setInsertionPointToStart(ifOp.elseBlock());
          builder.create<scf::YieldOp>(loc, constantI1(builder, loc, false));
        }
        builder.create<scf::ConditionOp>(loc, ifOp.getResults()[0], ivs);
      },
      /*afterBuilder=*/
      [](OpBuilder &builder, Location loc, ValueRange ivs) {
        // pos ++
        Value nxPos = builder.create<arith::AddIOp>(
            loc, ivs[0], constantIndex(builder, loc, 1));
        builder.create<scf::YieldOp>(loc, nxPos);
      });
  // Return the segment high.
  return whileOp.getResult(0);
}

Value LoopEmitter::genSparseCrd(OpBuilder &builder, Location loc, size_t tid,
                                size_t dstLvl) {
  Value crd = constantIndex(builder, loc, 0);
  const auto reassoc = getCollapseReassociation(tid, dstLvl);
  for (unsigned i = 0; i < reassoc.size(); i++) {
    const auto srcLvl = reassoc[i];
    // A load on the coordinates array yields the coordinate.
    const Value mem = crdBuffer[tid][srcLvl];
    const Value pos = pidxs[tid][dstLvl];
    const Value off = genIndexLoad(builder, loc, mem, pos);
    // Linearize the coordinates within the same collapse reassociation.
    crd = builder.create<arith::AddIOp>(loc, crd, off);
    if (i != reassoc.size() - 1) {
      crd = builder.create<arith::MulIOp>(loc, crd,
                                          this->lvlSizes[tid][reassoc[i + 1]]);
    }
  }
  return crd;
}

LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput,
                         bool isSparseOut, ArrayRef<unsigned> topSort) {
  initialize(tensors, loopTag, hasOutput, isSparseOut, topSort);
}

void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
                             bool isSparseOut, ArrayRef<unsigned> topSort) {
  // First initializes fields.
  this->loopTag = loopTag;
  this->hasOutput = hasOutput;
  this->isSparseOut = isSparseOut;
  this->tensors.assign(ts.begin(), ts.end());
  this->isSparseSlices.assign(tensors.size(), false);
  this->dimTypes.assign(tensors.size(), std::vector<DimLevelType>());
  this->pidxs.assign(tensors.size(), std::vector<Value>());
  this->segHi.assign(tensors.size(), std::vector<Value>());
  this->coord.assign(tensors.size(), std::vector<Value>());
  this->highs.assign(tensors.size(), std::vector<Value>());
  this->lvlSizes.assign(tensors.size(), std::vector<Value>());
  this->posBuffer.assign(tensors.size(), std::vector<Value>());
  this->crdBuffer.assign(tensors.size(), std::vector<Value>());
  this->valBuffer.assign(tensors.size(), nullptr);
  this->loopStack.reserve(topSort.size());
  this->sparsiferLoopLvlMap.assign(topSort.size(), 0);
  this->collapseReassoc.assign(tensors.size(), nullptr);

  for (size_t tid = 0, e = tensors.size(); tid < e; tid++) {
    auto t = tensors[tid];
    // A scalar or 0-dimension tensor.
    if (isZeroRankedTensorOrScalar(t.getType()))
      continue;

    auto rtp = getRankedTensorType(t);
    if (auto reshape = t.getDefiningOp<tensor::CollapseShapeOp>();
        isUniqueCOOType(rtp) && reshape) {
      // TODO: Support more kinds of sparse tensors.
      // FIXME: We should instead lower reshape operations on sparse tensors to
      // view change.
      collapseReassoc[tid] = reshape.getReassociation();
      rtp = reshape.getSrcType();
      // Overwrites the tensor to the source tensor of reshape operations.
      tensors[tid] = t = reshape.getSrc();
    }
    auto rank = static_cast<size_t>(rtp.getRank());
    auto enc = getSparseTensorEncoding(rtp);
    // We always treat sparse output tensor as dense so that we always iterate
    // it based on dim size.
    if (enc && !(isOutputTensor(tid) && isSparseOut)) {
      isSparseSlices[tid] = enc.isSlice();
      for (auto dimTp : enc.getDimLevelType())
        dimTypes[tid].push_back(dimTp);
    } else
      dimTypes[tid].assign(rank, DimLevelType::Dense);

    // Initialize using empty value.
    pidxs[tid].assign(rank, Value());
    segHi[tid].assign(rank, Value());
    coord[tid].assign(rank, Value());
    highs[tid].assign(rank, Value());
    lvlSizes[tid].assign(rank, Value());
    posBuffer[tid].assign(rank, Value());
    crdBuffer[tid].assign(rank, Value());
  }

  // FIXME: This map should be maintained outside loop emitter.
  for (unsigned i = 0, e = topSort.size(); i < e; i++) {
    // This is an inverse map of the topologically sorted loop index from
    // sparsifier. This is needed to map the AffineDimExpr back to the
    // loopStack index used in loop emitter.
    sparsiferLoopLvlMap[topSort[i]] = i;
  }
}

void LoopEmitter::initializeLoopEmit(OpBuilder &builder, Location loc,
                                     LoopEmitter::OutputUpdater updater) {
  // For every tensor, find lower and upper bound on dimensions, set the
  // same bounds on loop indices, and obtain dense or sparse buffer(s).
  for (size_t t = 0, e = tensors.size(); t < e; t++) {
    const auto tensor = tensors[t];
    const auto rtp = tensor.getType().dyn_cast<RankedTensorType>();
    if (!rtp)
      // Skips only scalars; zero-ranked tensors still need to be bufferized
      // and (probably) filled with zeros by users.
      continue;
    // FIXME: the definition of `lvlRank` looks more like a dim-rank;
    // but the variable is used as a level everywhere below, which
    // suggests there may be some dim/lvl confusion going on here.
    const Level lvlRank = rtp.getRank();
    const auto shape = rtp.getShape();
    const auto enc = getSparseTensorEncoding(rtp);
    const Level cooStart = enc ? getCOOStart(enc) : lvlRank;
    // Scan all levels of current tensor.
    for (Level l = 0; l < lvlRank; l++) {
      // This should be called only once at beginning.
      assert(!posBuffer[t][l] && !crdBuffer[t][l] && !highs[t][l]);
      const auto dlt = dimTypes[t][l];
      // Handle sparse storage schemes.
      if (isCompressedDLT(dlt)) {
        // Generate sparse primitives to obtain positions and coordinates.
        posBuffer[t][l] = genToPositions(builder, loc, tensor, l);
        crdBuffer[t][l] = genToCoordinates(builder, loc, tensor, l, cooStart);
      } else if (isSingletonDLT(dlt)) {
        // Singleton level, fetch coordinates.
        crdBuffer[t][l] = genToCoordinates(builder, loc, tensor, l, cooStart);
      } else {
        // Dense level, nothing to fetch.
        assert(isDenseDLT(dlt));
      }

      // Find upper bound in current dimension.
      // FIXME: `toOrigDim` is deprecated
      const Dimension d = toOrigDim(enc, l);
      lvlSizes[t][l] = highs[t][l] =
          mlir::linalg::createOrFoldDimOp(builder, loc, tensor, d);
    }

    // Perform the required bufferization. Dense inputs materialize from the
    // input tensors. Sparse inputs use sparse primitives to obtain the values.
    // Delegates extra output initialization to clients.
    bool isOutput = isOutputTensor(t);
    Type elementType = rtp.getElementType();
    if (!enc) {
      // Non-annotated dense tensors.
      BaseMemRefType denseTp = MemRefType::get(shape, elementType);
      // TODO: if we unconditionally use fully dynamic layout here, it breaks
      // some vectorization passes which require a static stride of 1.
      // Is it possible to call the vectorization pass after bufferization?
      if (llvm::isa_and_nonnull<tensor::ExtractSliceOp>(tensor.getDefiningOp()))
        denseTp = bufferization::getMemRefTypeWithFullyDynamicLayout(rtp);
      Value denseVal =
          builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
      // Dense outputs need special handling.
      if (isOutput && updater)
        denseVal = updater(builder, loc, denseVal, tensor);

      valBuffer[t] = denseVal;
    } else {
      // Annotated sparse tensors.
      // We also need the value buffer for annotated all dense `sparse` tensor.
      valBuffer[t] = genToValues(builder, loc, tensor);
    }
    // NOTE: we can also prepare for 0 dim here in advance, this will hoist
    // some loop preparation from tensor iteration, but will also (undesirably)
    // hoist the code outside if conditions.
  }
}

void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
                                  ArrayRef<size_t> tids,
                                  ArrayRef<size_t> dims) {
  // TODO: sort
  assert(loopSeqStack.size() == loopStack.size());
  // Universal Index starts from 0.
  loopSeqStack.emplace_back(constantIndex(builder, loc, 0));
  // Prepares for all the tensors used in the current loop sequence.
  for (auto [tid, dim] : llvm::zip(tids, dims))
    prepareLoopOverTensorAtDim(builder, loc, tid, dim);
}

Value LoopEmitter::genAffine(OpBuilder &builder, AffineExpr a, Location loc) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    unsigned idx = a.cast<AffineDimExpr>().getPosition();
    return loopStack[sparsiferLoopLvlMap[idx]].iv;
  }
  case AffineExprKind::Add: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return builder.create<arith::AddIOp>(
        loc, genAffine(builder, binOp.getLHS(), loc),
        genAffine(builder, binOp.getRHS(), loc));
  }
  case AffineExprKind::Mul: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return builder.create<arith::MulIOp>(
        loc, genAffine(builder, binOp.getLHS(), loc),
        genAffine(builder, binOp.getRHS(), loc));
  }
  case AffineExprKind::Constant: {
    int64_t c = a.cast<AffineConstantExpr>().getValue();
    return constantIndex(builder, loc, c);
  }
  default:
    llvm_unreachable("unexpected affine subscript");
  }
}

Operation *LoopEmitter::enterLoopOverTensorAtDim(
    OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
    ArrayRef<size_t> dims, MutableArrayRef<Value> reduc, bool isParallel) {
  // TODO: support multiple return on parallel for?
  assert(!isParallel || reduc.size() <= 1);
  bool isSparseInput = false;
  size_t tid = tids.front(), dim = dims.front();
  for (auto [t, d] : llvm::zip(tids, dims)) {
    assert(dimTypes[t].size() > d); // Must be a valid tid, dim pair
    assert(!coord[t][d]);           // We cannot re-enter the same level
    auto dimType = dimTypes[t][d];
    // Must be a recognizable DLT.
    assert(isDenseDLT(dimType) || isCompressedDLT(dimType) ||
           isSingletonDLT(dimType));
    bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType);
    // We can at most have one sparse input, otherwise a while loop is required
    // to co-iterate multiple sparse tensors.
    assert(!isSparseInput || !isSparse);
    if (isSparse) {
      tid = t;
      dim = d;
    }
    isSparseInput = isSparseInput || isSparse;
  }

  auto enc = getSparseTensorEncoding(tensors[tid].getType());
  const auto reassoc = getCollapseReassociation(tid, dim);
  // TODO: support dynamic slices.
  // Uses the first dimension here to build the loop bound (which is also the
  // biggest range).
  const auto fdim = reassoc.front();
  Value step = constantIndex(builder, loc, 1);
  Value lo = isSparseInput ? pidxs[tid][fdim]     // current offset
                           : loopSeqStack.back(); // universal index
  Value hi = highs[tid][fdim];

  Operation *loop = nullptr;
  Value iv;
  if (isParallel) {
    assert(collapseReassoc[tid] == nullptr);
    scf::ParallelOp parOp =
        builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(parOp.getBody());
    assert(parOp.getNumReductions() == reduc.size());
    iv = parOp.getInductionVars()[0];

    // In-place update on the reduction variable vector.
    // Note that the init vals are not the actual reduction variables but are
    // instead used as a `special handle` to (temporarily) represent them. The
    // expression on init vals will be moved into scf.reduce and replaced with
    // the block arguments when exiting the loop (see exitForLoop).
    // This is needed as we cannot build the actual reduction block and get the
    // actual reduction variable before users fill the parallel loop body.
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = parOp.getInitVals()[i];
    loop = parOp;
  } else {
    scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(forOp.getBody());
    iv = forOp.getInductionVar();

    // In-place update on the reduction variable vector.
    assert(forOp.getNumRegionIterArgs() == reduc.size());
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = forOp.getRegionIterArg(i);
    loop = forOp;
  }
  assert(loop && iv);

  Value crd;
  if (isSparseInput) {
    assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
    // For COO, the position is the same across consecutive levels.
    llvm::for_each(reassoc,
                   [this, tid, iv](Level lvl) { pidxs[tid][lvl] = iv; });
    crd = genSparseCrd(builder, loc, tid, dim);
  } else {
    // Dense tensor, the coordinate is the induction variable.
    crd = iv;
  }

  if (isSparseSlices[tid] && isSparseInput) {
    // For sparse level slices, we need to filter out invalid coordinates that
    // are not included in the slice.
    SmallVector<Type> types;
    for (Value red : reduc)
      types.push_back(red.getType());

    auto [trans, pred] = genSliceLegitPredicate(builder, loc, crd, enc, dim);
    bool hasReduc = !types.empty();
    scf::IfOp ifOp = builder.create<scf::IfOp>(loc, types, pred,
                                               /*else*/ hasReduc);
    if (hasReduc) {
      // scf.for (a) -> v
      //  %s = scf.if (a) -> v
      //    user-generated code.
      //  else
      //    yield a
      //  yield %s
      builder.create<scf::YieldOp>(loc, ifOp.getResults());
      builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
      // On mismatch.
      builder.create<scf::YieldOp>(loc, reduc);
    }
    // Set the insertion point to matched branch.
    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
    crd = trans;
  }

  assert(crd);
  coord[tid][dim] = crd;
  // NOTE: we can also prepare for next dim here in advance
  // Push the loop into stack
  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), loop,
                         builder.getInsertionBlock(), coord[tid][dim], loopTag);
  // Emit extra locals.
  emitExtraLocalsForTensorsAtDenseDims(builder, loc, tids, dims);

  return loop;
}

Operation *LoopEmitter::enterFilterLoopOverTensorAtDim(
    OpBuilder &builder, Location loc, size_t tid, size_t dim, AffineExpr affine,
    MutableArrayRef<Value> reduc) {
  assert(!affine.isa<AffineDimExpr>() && !isDenseDLT(dimTypes[tid][dim]));
  assert(dimTypes[tid].size() > dim);
  // We cannot re-enter the same level.
  assert(!coord[tid][dim]);

  Value step = constantIndex(builder, loc, 1);
  Value lo = pidxs[tid][dim];
  Value hi = highs[tid][dim];

  // TODO: We should instead use a whileOp for the filter loop to allow early
  // break when exceeding (for ordered dimensions).
  // TODO: There are many other potential opportunities that we might apply in
  // the future. E.g., we could use binary search to locate the position index.
  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);

  // In-place update on the reduction variable vector.
  assert(forOp.getNumRegionIterArgs() == reduc.size());
  for (int i = 0, e = reduc.size(); i < e; i++)
    reduc[i] = forOp.getRegionIterArg(i);
  builder.setInsertionPointToStart(forOp.getBody());
  Value iv = forOp.getInductionVar();

  pidxs[tid][dim] = iv;
  // Generating a load on the coordinates array yields the coordinate.
  Value mem = crdBuffer[tid][dim];
  coord[tid][dim] = genIndexLoad(builder, loc, mem, iv);

  // Generate an if-condition to filter out coordinates that are not
  // equal to the result of the affine expression.
  Value expected = genAffine(builder, affine, loc);
  auto pred = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                            coord[tid][dim], expected);
  SmallVector<Type> types;
  for (Value red : reduc) {
    types.push_back(red.getType());
  }

  bool hasReduc = !types.empty();
  scf::IfOp ifOp =
      builder.create<scf::IfOp>(loc, types, pred, /*else*/ hasReduc);
  if (hasReduc) {
    // scf.for (a) -> v
    //  %s = scf.if (a) -> v
    //    user-generated code.
    //  else
    //    yield a
    //  yield %s
    builder.create<scf::YieldOp>(loc, ifOp.getResults());
    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
    // On mismatch.
    builder.create<scf::YieldOp>(loc, reduc);
  }
  // Set the insert point to matched branch.
  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());

  // NOTE: we can also prepare for next dim here in advance
  // Push the loop into stack
  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), forOp,
                         builder.getInsertionBlock(), coord[tid][dim], nullptr);
  return forOp;
}

void LoopEmitter::genDenseAffineAddressAtCurLevel(OpBuilder &builder,
                                                  Location loc, size_t tid,
                                                  size_t dim,
                                                  AffineExpr affine) {
  Value affineV = genAffine(builder, affine, loc);
  pidxs[tid][dim] = genAddress(builder, loc, tid, dim, affineV);
}

Operation *LoopEmitter::enterCoIterationOverTensorsAtDims(
    OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
    ArrayRef<size_t> dims, bool needsUniv, MutableArrayRef<Value> reduc) {
  assert(tids.size() == dims.size());
  SmallVector<Type> types;
  SmallVector<Value> operands;
  // Construct the while-loop with a parameter for each coordinate.
  Type indexType = builder.getIndexType();
  for (auto [tid, dim] : llvm::zip(tids, dims)) {
    if (isCompressedDLT(dimTypes[tid][dim]) ||
        isSingletonDLT(dimTypes[tid][dim])) {
      const auto reassoc = getCollapseReassociation(tid, dim);
      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
        if (!isUniqueDLT(dimTypes[tid][reassoc[i]])) {
          // This is the segment high for each non-unique level.
          types.push_back(indexType);
          operands.push_back(constantIndex(builder, loc, 0));
        }
      }
      assert(pidxs[tid][dim]);
      types.push_back(indexType);
      operands.push_back(pidxs[tid][reassoc.front()]);
    }
  }
  // The position where user-supplied reduction variables start.
  for (Value rec : reduc) {
    types.push_back(rec.getType());
    operands.push_back(rec);
  }
  if (needsUniv) {
    types.push_back(indexType);
    // Update universal index.
    operands.push_back(loopSeqStack.back());
  }
  assert(types.size() == operands.size());
  scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);

  SmallVector<Location> locs(types.size(), loc);
  Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
  Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);

  // Build the "before" region, which effectively consists
  // of a conjunction of "i < upper" tests on all induction.
  builder.setInsertionPointToStart(&whileOp.getBefore().front());
  Value cond;
  unsigned o = 0;
  for (auto [t, lvl] : llvm::zip(tids, dims)) {
    unsigned tid = t; // Why `t` can not be captured by lambda?
    if (isCompressedDLT(dimTypes[tid][lvl]) ||
        isSingletonDLT(dimTypes[tid][lvl])) {
      const auto reassoc = getCollapseReassociation(tid, lvl);
      assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
        if (!isUniqueDLT(dimTypes[tid][reassoc[i]])) {
          // Links the SSA chain for segHi.
          segHi[tid][reassoc[i]] = after->getArgument(o++);
        }
      }
      Value op1 = before->getArgument(o);
      // We use the first level bound as the bound for the collapsed set of
      // levels.
      Value op2 = highs[tid][reassoc.front()];
      Value opc = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
                                                op1, op2);
      cond = cond ? builder.create<arith::AndIOp>(loc, cond, opc).getResult()
                  : opc;
      // Update positions.
      Value pos = after->getArgument(o++);
      // For COO, the position is the same across consecutive levels.
      llvm::for_each(reassoc,
                     [this, tid, pos](Level lvl) { pidxs[tid][lvl] = pos; });
    }
  }
  builder.create<scf::ConditionOp>(loc, cond, before->getArguments());

  // Generates while body.
  builder.setInsertionPointToStart(&whileOp.getAfter().front());

  SmallVector<std::pair<Value, unsigned>> slicesPreds;
  unsigned i = 0;
  for (auto [tid, dim] : llvm::zip(tids, dims)) {
    // Prepares for next level.
    if (isCompressedDLT(dimTypes[tid][dim]) ||
        isSingletonDLT(dimTypes[tid][dim])) {
      coord[tid][dim] = genSparseCrd(builder, loc, tid, dim);
      if (isSparseSlices[tid]) {
        Value load =
            genIndexLoad(builder, loc, crdBuffer[tid][dim], pidxs[tid][dim]);
        auto enc = getSparseTensorEncoding(tensors[tid].getType());
        auto [trans, pred] =
            genSliceLegitPredicate(builder, loc, load, enc, dim);
        slicesPreds.emplace_back(pred, i);
        // Updates to the coordinate relative to the slice.
        coord[tid][dim] = trans;
      }
      i++;
    }
  }

  if (!slicesPreds.empty()) {
    // Skips invalid loop iteration when slice coordinate is inapplicable.
    SmallVector<Value> yields(after->getArguments());
    // Generates a list of if statements
    //  pidx = in_slice ? pidx : pidx + 1
    // TODO: instead of always picking pidx + 1, we should set pidx = high to
    // break the loop when the coordinate is larger than the slice size.
    for (auto [pred, idx] : slicesPreds) {
      Value nextPidx = builder.create<arith::AddIOp>(
          loc, yields[idx], constantIndex(builder, loc, 1));
      yields[idx] =
          builder.create<arith::SelectOp>(loc, pred, yields[idx], nextPidx);
    }

    Value pred = slicesPreds.front().first;
    for (int i = 1, e = slicesPreds.size(); i < e; i++) {
      pred = builder.create<arith::AndIOp>(loc, pred, slicesPreds[i].first);
    }
    auto ifOp = builder.create<scf::IfOp>(loc, types, pred, /*else*/ true);
    ifOp->setAttr(getLoopEmitterLoopAttrName(),
                  StringAttr::get(builder.getContext(), "slice"));
    builder.create<scf::YieldOp>(loc, ifOp->getResults());
    assert(types.size() == yields.size());
    // If not all slices are legit.
    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
    builder.create<scf::YieldOp>(loc, yields);

    // If all slices are legit, start the user generated code.
    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
  }

  Value min;
  // Finds the minimum coordinate.
  if (!needsUniv) {
    for (auto [tid, dim] : llvm::zip(tids, dims)) {
      if (isCompressedDLT(dimTypes[tid][dim]) ||
          isSingletonDLT(dimTypes[tid][dim])) {
        if (min) {
          Value cmp = builder.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::ult, coord[tid][dim], min);
          min = builder.create<arith::SelectOp>(loc, cmp, coord[tid][dim], min);
        } else {
          min = coord[tid][dim];
        }
      }
    }
  } else {
    assert(!min);
    // Otherwise, universal index is the minimal pidx.
    min = after->getArguments().back();
  }

  // Sets up the loop stack.
  loopStack.emplace_back(tids, dims, whileOp, builder.getInsertionBlock(), min,
                         loopTag);
  assert(loopStack.size() == loopSeqStack.size());

  for (auto [tid, dim] : llvm::zip(tids, dims)) {
    const auto reassoc = getCollapseReassociation(tid, dim);
    assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
    // TODO: Refactor this into smaller functions.
    // NOTE: For all the collapsed levels (except for the last one, which is
    // why the loop ends with `reassoc.size() - 1`), each iteration is advanced
    // by the segment size of the last level, which does not always invalidate
    // the segment size for the previous levels; thus we need to propagate the
    // segment sizes across loop iterations and only forward if needed.
    //
    // E.g., for a COO tensor with the following coordinates array.
    //  (0, 0, 1),
    //  (0, 0, 2),
    //  (1, 1, 1),
    // segHi[lvl=0] = segHi[lvl=1] = 2
    // segHi[lvl=2] = 1,
    // the first iteration does not invalidate segHi[0] and segHi[1]
    for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
      const auto lvl = reassoc[i];
      if (!isUniqueDLT(dimTypes[tid][lvl])) {
        Value pos = pidxs[tid][lvl];
        assert(segHi[tid][lvl]);
        Value newSegHi = builder.create<arith::CmpIOp>(
            loc, arith::CmpIPredicate::uge, pos, segHi[tid][lvl]);
        auto ifOp = builder.create<scf::IfOp>(loc, builder.getIndexType(),
                                              newSegHi, true);
        {
          OpBuilder::InsertionGuard guard(builder);
          builder.setInsertionPointToStart(ifOp.thenBlock());
          builder.create<scf::YieldOp>(
              loc,
              genSegmentHigh(builder, loc, tid, lvl, pos, highs[tid][lvl]));
          // Else, reuses the same segment high.
          builder.setInsertionPointToStart(ifOp.elseBlock());
          builder.create<scf::YieldOp>(loc, segHi[tid][lvl]);
        }
        highs[tid][lvl + 1] = segHi[tid][lvl] = ifOp.getResult(0);
      }
    }
    const auto lvl = reassoc.back();
    if (!isUniqueDLT(dimTypes[tid][lvl])) {
      segHi[tid][lvl] = genSegmentHigh(builder, loc, tid, lvl, pidxs[tid][lvl],
                                       highs[tid][lvl]);
    }
  }

  // Emits extra locals.
  emitExtraLocalsForTensorsAtDenseDims(builder, loc, tids, dims);

  // Updates reduction variables.
  assert(after->getNumArguments() == o + reduc.size() + (needsUniv ? 1 : 0));
  // In-place update on reduction variable.
  for (unsigned i = 0, e = reduc.size(); i < e; i++)
    reduc[i] = after->getArgument(o + i);

  return whileOp;
}

void LoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc,
                                             size_t tid, size_t dim) {
  assert(dimTypes[tid].size() > dim);
  auto dimType = dimTypes[tid][dim];

  if (isDenseDLT(dimType))
    return;

  for (auto lvl : getCollapseReassociation(tid, dim)) {
    // Either the first level, or the previous level has been set.
    assert(lvl == 0 || pidxs[tid][lvl - 1]);
    Value c0 = constantIndex(builder, loc, 0);
    Value c1 = constantIndex(builder, loc, 1);
    if (isCompressedDLT(dimType)) {
      Value mem = posBuffer[tid][lvl];

      Value pLo = lvl == 0 ? c0 : pidxs[tid][lvl - 1];
      pidxs[tid][lvl] = genIndexLoad(builder, loc, mem, pLo);

      Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
      highs[tid][lvl] = genIndexLoad(builder, loc, mem, pHi);
      return;
    }
    if (isSingletonDLT(dimType)) {
      Value pLo = lvl == 0 ? c0 : pidxs[tid][lvl - 1];
      Value pHi;
      // If this is non-unique, the pHi is bound by the segment high of the
      // previous level.
      if (!isUniqueDLT(dimTypes[tid][lvl - 1]))
        pHi = segHi[tid][lvl - 1];

      // If pHi is still uninitialized, we set it to pLo + 1 as it is a
      // singleton level.
      // NOTE: Even if the level is non-unique, the pHi might not have been set
      // in the previous statement, as we only compute segment high when we are
      // coiterating non-unique levels.
      if (!pHi)
        pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
      pidxs[tid][lvl] = pLo;
      highs[tid][lvl] = pHi;
      return;
    }
  }

  llvm_unreachable("Unrecognizable dimension type!");
}

void LoopEmitter::emitExtraLocalsForTensorsAtDenseDims(OpBuilder &builder,
                                                       Location loc,
                                                       ArrayRef<size_t> tids,
                                                       ArrayRef<size_t> dims) {
  // Initialize dense positions. Note that we generate dense coordinates of the
  // output tensor unconditionally, since they may not appear in the lattice,
  // but may be needed for linearized codegen.
  for (auto [tid, dim] : llvm::zip(tids, dims)) {
    if (isDenseDLT(dimTypes[tid][dim])) {
      auto enc = getSparseTensorEncoding(tensors[tid].getType());
      if (enc && !isSparseOutput(tid)) {
        bool validPidx = dim == 0 || pidxs[tid][dim - 1];
        if (!validPidx) {
          // We might not find the pidx for the sparse output tensor as it is
          // unconditionally required by the sparsification.
          assert(isOutputTensor(tid));
          continue;
        }
        pidxs[tid][dim] =
            genAddress(builder, loc, tid, dim, loopStack.back().iv);
        // NOTE: we can also prepare for next dim here in advance
      }
    }
  }
}

void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
                              MutableArrayRef<Value> reduc) {
  LoopLevelInfo &loopInfo = loopStack.back();
  rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
  auto &dims = loopStack.back().dims;
  auto &tids = loopStack.back().tids;
  auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop);
  if (forOp) {
    if (!reduc.empty()) {
      assert(reduc.size() == forOp.getNumResults());
      rewriter.create<scf::YieldOp>(loc, reduc);
    }
    // Exit the loop.
    rewriter.setInsertionPointAfter(forOp);
    // In-place update reduction variables.
    for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
      reduc[i] = forOp.getResult(i);
  } else {
    auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
    if (!reduc.empty()) {
      assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
      Operation *redExp = reduc.front().getDefiningOp();
      // Reduction expression should have no use.
      assert(redExp->getUses().empty());
      // This must be a binary operation.
      // NOTE: It is the user's responsibility to ensure the operation is
      // commutative.
      assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);

      Value redVal = parOp.getInitVals().front();
      Value curVal;
      if (redExp->getOperand(0) == redVal)
        curVal = redExp->getOperand(1);
      else if (redExp->getOperand(1) == redVal)
        curVal = redExp->getOperand(0);
      // One of the operands must be the init value (which is also the
      // previous reduction value).
      assert(curVal);
      // The reduction expression should be the only user of the reduction val
      // inside the parallel for.
      unsigned numUsers = 0;
      for (Operation *op : redVal.getUsers()) {
        if (op->getParentOp() == parOp)
          numUsers++;
      }
      assert(numUsers == 1);
      (void)numUsers; // to silence unused variable warning in release build

      rewriter.setInsertionPointAfter(redExp);
      auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
      // Attach to the reduction op.
      Block *redBlock = &redOp.getRegion().getBlocks().front();
      rewriter.setInsertionPointToEnd(redBlock);
      Operation *newRed = rewriter.clone(*redExp);
      // Replaces arguments of the reduction expression by using the block
      // arguments from scf.reduce.
      rewriter.updateRootInPlace(
          newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
      // Erases the out-dated reduction expression.
      rewriter.eraseOp(redExp);
      rewriter.setInsertionPointToEnd(redBlock);
      rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
    }
    rewriter.setInsertionPointAfter(parOp);
    // In-place update reduction variables.
    for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
      reduc[i] = parOp.getResult(i);
  }

  // Finished iterating a tensor, clean up.
  // We only do the clean up on for loops as while loops do not necessarily
  // finish the iteration on a sparse tensor.
  for (auto [tid, dim] : llvm::zip(tids, dims)) {
    // Reset to null.
    coord[tid][dim] = Value();
    pidxs[tid][dim] = Value();
    // Dense dimension, high is fixed.
    if (!isDenseDLT(dimTypes[tid][dim]))
      highs[tid][dim] = Value();
  }
}

void LoopEmitter::exitCoIterationLoop(OpBuilder &builder, Location loc,
                                      MutableArrayRef<Value> reduc) {
  const LoopLevelInfo &loopInfo = loopStack.back();
  auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop);
  builder.setInsertionPointToEnd(loopInfo.userCodeBlock);
  auto &dims = loopInfo.dims;
  auto &tids = loopInfo.tids;
  Value iv = loopInfo.iv;
  // Finalize the induction. Note that the induction could be performed
  // in the individual if-branches to avoid re-evaluating the conditions.
  // However, that would result in a rather elaborate forest of yield
  // instructions during code generation. Moreover, performing the induction
  // after the if-statements more closely resembles code generated by TACO.
  unsigned o = 0;
  SmallVector<Value> operands;
  Value one = constantIndex(builder, loc, 1);
  for (auto [tid, dim] : llvm::zip(tids, dims)) {
    if (isCompressedDLT(dimTypes[tid][dim]) ||
        isSingletonDLT(dimTypes[tid][dim])) {
      const auto reassoc = getCollapseReassociation(tid, dim);
      assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
      for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
        const auto lvl = reassoc[i];
        if (!isUniqueDLT(dimTypes[tid][lvl])) {
          operands.push_back(segHi[tid][lvl]);
          o++;
        }
      }
      Value op1 = coord[tid][dim];
      Value op3 = pidxs[tid][dim];
      Value cmp = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                                op1, iv);
      // If the loop contains a coiteration with a non-unique level, we fast
      // forward all the duplicated coords by setting the position to the
      // segment high.
      // If this is a collapsed dim, we forward pidx based on the last level in
      // the collapsed level set.
      Value add = !isUniqueDLT(dimTypes[tid][reassoc.back()])
                      ? segHi[tid][reassoc.back()]
                      : builder.create<arith::AddIOp>(loc, op3, one);

      operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, op3));
      // Following loops continue iteration from the break point of the
      // current while loop.
      Value pos = whileOp->getResult(o++);
      const auto t = tid;
      llvm::for_each(reassoc, [this, t, pos](Level l) { pidxs[t][l] = pos; });
      // The coordinates are invalid now.
      coord[tid][dim] = nullptr;
      // The segment high is invalid now.
      segHi[tid][dim] = nullptr;
      // highs remains unchanged.
    }
  }

  // Reduction value from users.
  for (auto &i : reduc) {
    operands.push_back(i);
    // In place update reduction variable.
    i = whileOp->getResult(o++);
  }

  // An (optional) universal index.
  if (operands.size() < whileOp.getNumResults()) {
    assert(operands.size() + 1 == whileOp.getNumResults());
    // The last one is the universal index.
    operands.push_back(builder.create<arith::AddIOp>(loc, iv, one));
    // Update the loop starting point of the current loop sequence.
    loopSeqStack.back() = whileOp->getResult(o++);
  }

  assert(o == operands.size());
  builder.create<scf::YieldOp>(loc, operands);
  builder.setInsertionPointAfter(whileOp);
}

void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc,
                                  MutableArrayRef<Value> reduc) {
  // Clean up the values; it would help us discover potential bugs at an
  // earlier stage (instead of silently using a wrong value).
  LoopLevelInfo &loopInfo = loopStack.back();
  assert(loopInfo.tids.size() == loopInfo.dims.size());
  SmallVector<Value> red;
  if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
    exitCoIterationLoop(rewriter, loc, reduc);
  } else {
    exitForLoop(rewriter, loc, reduc);
  }

  assert(loopStack.size() == loopSeqStack.size());
  loopStack.pop_back();
}