//===- LoopEmitter.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "LoopEmitter.h"

#include "CodegenUtils.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"

using namespace mlir;
using namespace mlir::sparse_tensor;

//===----------------------------------------------------------------------===//
// File local helper functions.
//===----------------------------------------------------------------------===//

/// Generates a pointer/index load from the sparse storage scheme. Narrower
/// data types need to be zero extended before casting the value into the
/// index type used for looping and indexing.
static Value genIndexLoad(OpBuilder &builder, Location loc, Value ptr,
                          Value s) {
  // For the scalar case, we simply zero extend narrower indices into 64-bit
  // values before casting to index without a performance penalty. Here too,
  // however, indices that already are 64-bit, in theory, cannot express the
  // full range as explained above.
  Value load = builder.create<memref::LoadOp>(loc, ptr, s);
  if (!load.getType().isa<IndexType>()) {
    if (load.getType().getIntOrFloatBitWidth() < 64)
      load = builder.create<arith::ExtUIOp>(loc, builder.getI64Type(), load);
    load =
        builder.create<arith::IndexCastOp>(loc, builder.getIndexType(), load);
  }
  return load;
}

//===----------------------------------------------------------------------===//
// Sparse tensor loop emitter class implementations
//===----------------------------------------------------------------------===//

Value LoopEmitter::genAddress(OpBuilder &builder, Location loc, size_t tid,
                              size_t dim, Value iv) {
  Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1];
  Value mul = builder.create<arith::MulIOp>(loc, highs[tid][dim], p);
  Value add = builder.create<arith::AddIOp>(loc, mul, iv);
  return add;
}
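
// NOTE (illustrative): genAddress linearizes the address for a dense level by
// folding the parent position into the current level, i.e., roughly
//   address = pidxs[tid][dim - 1] * highs[tid][dim] + iv
// (with the parent position taken as 0 at the outermost level). A schematic
// sketch of the generated IR, using placeholder value names:
//   %mul = arith.muli %size, %parent_pidx : index
//   %add = arith.addi %mul, %iv : index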
LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput,
                         bool isSparseOut, ArrayRef<unsigned> topSort) {
  initialize(tensors, loopTag, hasOutput, isSparseOut, topSort);
}

void LoopEmitter::initialize(ValueRange tensors, StringAttr loopTag,
                             bool hasOutput, bool isSparseOut,
                             ArrayRef<unsigned> topSort) {
  // First initializes fields.
  this->loopTag = loopTag;
  this->hasOutput = hasOutput;
  this->isSparseOut = isSparseOut;
  this->tensors.assign(tensors.begin(), tensors.end());
  this->dimTypes.assign(tensors.size(), std::vector<DimLevelType>());
  this->pidxs.assign(tensors.size(), std::vector<Value>());
  this->coord.assign(tensors.size(), std::vector<Value>());
  this->highs.assign(tensors.size(), std::vector<Value>());
  this->ptrBuffer.assign(tensors.size(), std::vector<Value>());
  this->idxBuffer.assign(tensors.size(), std::vector<Value>());
  this->valBuffer.assign(tensors.size(), nullptr);
  this->loopStack.reserve(topSort.size());
  this->sparsiferLoopLvlMap.assign(topSort.size(), 0);

  for (size_t tid = 0, e = tensors.size(); tid < e; tid++) {
    auto t = tensors[tid];
    // A scalar or 0-dimension tensor.
    if (isZeroRankedTensorOrScalar(t.getType()))
      continue;
    auto rtp = t.getType().cast<RankedTensorType>();
    auto rank = static_cast<size_t>(rtp.getRank());
    auto enc = getSparseTensorEncoding(rtp);
    // We always treat the sparse output tensor as dense so that we always
    // iterate it based on dim size.
    if (enc && !(isOutputTensor(tid) && isSparseOut))
      for (auto dimTp : enc.getDimLevelType())
        dimTypes[tid].push_back(dimTp);
    else
      dimTypes[tid].assign(rank, DimLevelType::Dense);

    // Initialize using empty value.
    pidxs[tid].assign(rank, Value());
    coord[tid].assign(rank, Value());
    highs[tid].assign(rank, Value());
    ptrBuffer[tid].assign(rank, Value());
    idxBuffer[tid].assign(rank, Value());
  }

  // FIXME: This map should be maintained outside loop emitter.
  for (unsigned i = 0, e = topSort.size(); i < e; i++) {
    // This is an inverse map of the topologically sorted loop index from
    // sparsifier. This is needed to map the AffineDimExpr back to the
    // loopStack index used in loop emitter.
    sparsiferLoopLvlMap[topSort[i]] = i;
  }
}

void LoopEmitter::initializeLoopEmit(OpBuilder &builder, Location loc,
                                     LoopEmitter::OutputUpdater updater) {
  // For every tensor, find lower and upper bound on dimensions, set the
  // same bounds on loop indices, and obtain dense or sparse buffer(s).
  for (size_t t = 0, e = tensors.size(); t < e; t++) {
    auto tensor = tensors[t];
    auto rtp = tensor.getType().dyn_cast<RankedTensorType>();
    if (!rtp)
      // Skips only scalars; zero-ranked tensors still need to be bufferized
      // and (probably) filled with zeros by users.
      continue;
    auto rank = rtp.getRank();
    auto shape = rtp.getShape();
    auto enc = getSparseTensorEncoding(rtp);
    uint64_t cooStart = enc ? getCOOStart(enc) : rank;
    // Scan all dimensions of current tensor.
    for (int64_t d = 0; d < rank; d++) {
      // This should be called only once at beginning.
      assert(!ptrBuffer[t][d] && !idxBuffer[t][d] && !highs[t][d]);

      // Handle sparse storage schemes.
      if (isCompressedDLT(dimTypes[t][d])) {
        // Generate sparse primitives to obtain pointers and indices.
        ptrBuffer[t][d] = genToPointers(builder, loc, tensor, d);
        idxBuffer[t][d] = genToIndices(builder, loc, tensor, d, cooStart);
      } else if (isSingletonDLT(dimTypes[t][d])) {
        // Singleton dimension, fetch indices.
        idxBuffer[t][d] = genToIndices(builder, loc, tensor, d, cooStart);
      } else {
        // Dense dimension, nothing to fetch.
        assert(isDenseDLT(dimTypes[t][d]));
      }

      // Find upper bound in current dimension.
      unsigned p = toOrigDim(enc, d);
      Value up = mlir::linalg::createOrFoldDimOp(builder, loc, tensor, p);
      highs[t][d] = up;
    }

    // Perform the required bufferization. Dense inputs materialize from the
    // input tensors. Sparse inputs use sparse primitives to obtain the values.
    // Delegates extra output initialization to clients.
    bool isOutput = isOutputTensor(t);
    Type elementType = rtp.getElementType();
    if (!enc) {
      // Non-annotated dense tensors.
      auto denseTp = MemRefType::get(shape, elementType);
      Value denseVal =
          builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
      // Dense outputs need special handling.
      if (isOutput && updater)
        denseVal = updater(builder, loc, denseVal, tensor);

      valBuffer[t] = denseVal;
    } else {
      // Annotated sparse tensors.
      // We also need the value buffer for an annotated all-dense `sparse`
      // tensor.
      auto dynShape = {ShapedType::kDynamic};
      auto sparseTp = MemRefType::get(dynShape, elementType);
      valBuffer[t] = builder.create<ToValuesOp>(loc, sparseTp, tensor);
    }
    // NOTE: we can also prepare for 0 dim here in advance. This would hoist
    // some loop preparation out of tensor iteration, but would also
    // (undesirably) hoist the code outside if conditions.
  }
}
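
// NOTE (illustrative): after initializeLoopEmit, each dimension owns exactly
// the buffers its level type requires. As a rough sketch for a 2-D CSR-like
// tensor t (dense outer, compressed inner):
//   dim 0 (dense)     : highs[t][0] holds the dimension size
//   dim 1 (compressed): ptrBuffer[t][1] and idxBuffer[t][1] hold the
//                       pointers/indices arrays
//   valBuffer[t]      : one buffer with all stored values
// Non-annotated dense tensors instead get a single bufferized memref.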
void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
                                  ArrayRef<size_t> tids,
                                  ArrayRef<size_t> dims) {
  assert(loopSeqStack.size() == loopStack.size());
  // Universal index starts from 0.
  loopSeqStack.emplace_back(constantIndex(builder, loc, 0));
  // Prepares for all the tensors used in the current loop sequence.
  for (auto [tid, dim] : llvm::zip(tids, dims))
    prepareLoopOverTensorAtDim(builder, loc, tid, dim);
}

Value LoopEmitter::genAffine(OpBuilder &builder, AffineExpr a, Location loc) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    unsigned idx = a.cast<AffineDimExpr>().getPosition();
    return loopStack[sparsiferLoopLvlMap[idx]].iv;
  }
  case AffineExprKind::Add: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return builder.create<arith::AddIOp>(
        loc, genAffine(builder, binOp.getLHS(), loc),
        genAffine(builder, binOp.getRHS(), loc));
  }
  case AffineExprKind::Mul: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return builder.create<arith::MulIOp>(
        loc, genAffine(builder, binOp.getLHS(), loc),
        genAffine(builder, binOp.getRHS(), loc));
  }
  case AffineExprKind::Constant: {
    int64_t c = a.cast<AffineConstantExpr>().getValue();
    return constantIndex(builder, loc, c);
  }
  default:
    llvm_unreachable("unexpected affine subscript");
  }
}

Operation *LoopEmitter::enterLoopOverTensorAtDim(
    OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
    ArrayRef<size_t> dims, MutableArrayRef<Value> reduc, bool isParallel) {
  // TODO: support multiple return on parallel for?
  assert(!isParallel || reduc.size() <= 1);

  bool isSparseInput = false;
  size_t tid = tids.front(), dim = dims.front();
  for (auto [t, d] : llvm::zip(tids, dims)) {
    assert(dimTypes[t].size() > d); // Must be a valid tid, dim pair.
    assert(!coord[t][d]);           // We cannot re-enter the same level.
    auto dimType = dimTypes[t][d];
    // Must be a recognizable DLT.
    assert(isDenseDLT(dimType) || isCompressedDLT(dimType) ||
           isSingletonDLT(dimType));
    bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType);
    // We can have at most one sparse input; otherwise, a while loop is
    // required to co-iterate multiple sparse tensors.
    assert(!isSparseInput || !isSparse);
    if (isSparse) {
      tid = t;
      dim = d;
    }
    isSparseInput = isSparseInput || isSparse;
  }

  Value step = constantIndex(builder, loc, 1);
  Value lo = isSparseInput ? pidxs[tid][dim]      // current offset
                           : loopSeqStack.back(); // universal index
  Value hi = highs[tid][dim];

  Operation *loop = nullptr;
  Value iv;
  if (isParallel) {
    scf::ParallelOp parOp =
        builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(parOp.getBody());
    assert(parOp.getNumReductions() == reduc.size());
    iv = parOp.getInductionVars()[0];

    // In-place update on the reduction variable vector.
    // Note that the init vals are not the actual reduction variables but are
    // instead used as a `special handle` to (temporarily) represent them. The
    // expression on the init vals will be moved into scf.reduce and replaced
    // with the block arguments when exiting the loop (see exitForLoop). This
    // is needed because we can not build the actual reduction block and get
    // the actual reduction variable before users fill the parallel loop body.
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = parOp.getInitVals()[i];
    loop = parOp;
  } else {
    scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(forOp.getBody());
    iv = forOp.getInductionVar();

    // In-place update on the reduction variable vector.
    assert(forOp.getNumRegionIterArgs() == reduc.size());
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = forOp.getRegionIterArg(i);
    loop = forOp;
  }
  assert(loop && iv);

  if (isSparseInput) {
    pidxs[tid][dim] = iv;
    // Generating a load on the indices array yields the coordinate.
    Value ptr = idxBuffer[tid][dim];
    coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv);
  } else {
    // Dense tensor, the coordinate is the induction variable.
    coord[tid][dim] = iv;
  }
  // NOTE: we can also prepare for the next dim here in advance.
  // Push the loop onto the stack.
  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), loop,
                         coord[tid][dim], loopTag);
  // Emit extra locals.
  emitExtraLocalsForTensorsAtDenseDims(builder, loc, tids, dims);

  return loop;
}
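
// NOTE (illustrative): for a single compressed dimension, the loop built by
// enterLoopOverTensorAtDim above roughly takes the form (schematic, not
// verbatim output):
//   scf.for %p = %pidx_lo to %pidx_hi step %c1 {
//     %coord = memref.load %idxBuffer[%p]  // coordinate at this position
//     ... user-generated body ...
//   }
// For a dense dimension, the induction variable itself is the coordinate and
// no load from an indices array is emitted.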
Operation *LoopEmitter::enterFilterLoopOverTensorAtDim(
    OpBuilder &builder, Location loc, size_t tid, size_t dim,
    AffineExpr affine, MutableArrayRef<Value> reduc) {
  assert(!affine.isa<AffineDimExpr>() && !isDenseDLT(dimTypes[tid][dim]));
  assert(dimTypes[tid].size() > dim);
  // We can not re-enter the same level.
  assert(!coord[tid][dim]);
  Value step = constantIndex(builder, loc, 1);
  Value lo = pidxs[tid][dim];
  Value hi = highs[tid][dim];

  // TODO: We should instead use a whileOp for the filter loop to allow early
  // break when exceeding (for ordered dimensions).
  // TODO: There are many other potential opportunities that we might apply in
  // the future. E.g., we could use binary search to locate the pointer index.
  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);

  // In-place update on the reduction variable vector.
  assert(forOp.getNumRegionIterArgs() == reduc.size());
  for (int i = 0, e = reduc.size(); i < e; i++)
    reduc[i] = forOp.getRegionIterArg(i);

  builder.setInsertionPointToStart(forOp.getBody());
  Value iv = forOp.getInductionVar();

  pidxs[tid][dim] = iv;
  // Generating a load on the indices array yields the coordinate.
  Value ptr = idxBuffer[tid][dim];
  coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv);

  // Generate an if condition to filter out indices that are not equal to the
  // result of the affine expression.
  Value expected = genAffine(builder, affine, loc);
  auto pred = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                            coord[tid][dim], expected);
  SmallVector<Type> types;
  for (Value red : reduc) {
    types.push_back(red.getType());
  }

  bool hasReduc = !types.empty();
  scf::IfOp ifOp =
      builder.create<scf::IfOp>(loc, types, pred, /*else*/ hasReduc);
  if (hasReduc) {
    // scf.for (a) -> v
    //  %s = scf.if (a) -> v
    //    user-generated code.
    //  else
    //    yield a
    //  yield %s
    builder.create<scf::YieldOp>(loc, ifOp.getResults());
    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
    // On mismatch.
    builder.create<scf::YieldOp>(loc, reduc);
  }
  // Set the insertion point to the matched branch.
  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());

  // NOTE: we can also prepare for the next dim here in advance.
  // Push the loop onto the stack.
  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), forOp,
                         coord[tid][dim], nullptr);
  return forOp;
}

void LoopEmitter::genDenseAffineAddressAtCurLevel(OpBuilder &builder,
                                                  Location loc, size_t tid,
                                                  size_t dim,
                                                  AffineExpr affine) {
  Value affineV = genAffine(builder, affine, loc);
  pidxs[tid][dim] = genAddress(builder, loc, tid, dim, affineV);
}
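
// NOTE (illustrative): co-iteration over multiple sparse tensors is emitted as
// a single scf.while loop. A rough sketch of its shape (schematic only):
//   scf.while (%p0 = ..., %p1 = ..., [%univ = ...]) {
//     %c = arith.andi (%p0 < %hi0), (%p1 < %hi1)   // conjunction of bounds
//     scf.condition(%c) ...
//   } do {
//     // Load the coordinate for each sparse tensor, take their minimum
//     // (or the universal index) as the loop induction value, then run
//     // the user-generated body.
//   }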
Operation *LoopEmitter::enterCoIterationOverTensorsAtDims(
    OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
    ArrayRef<size_t> dims, bool needsUniv, MutableArrayRef<Value> reduc) {
  assert(tids.size() == dims.size());
  SmallVector<Type> types;
  SmallVector<Value> operands;
  // Construct the while-loop with a parameter for each index.
  Type indexType = builder.getIndexType();
  for (auto [tid, dim] : llvm::zip(tids, dims)) {
    if (isCompressedDLT(dimTypes[tid][dim]) ||
        isSingletonDLT(dimTypes[tid][dim])) {
      assert(pidxs[tid][dim]);
      types.push_back(indexType);
      operands.push_back(pidxs[tid][dim]);
    }
  }
  // The position where the user-supplied reduction variables start.
  for (Value rec : reduc) {
    types.push_back(rec.getType());
    operands.push_back(rec);
  }
  if (needsUniv) {
    types.push_back(indexType);
    // Update universal index.
    operands.push_back(loopSeqStack.back());
  }
  assert(types.size() == operands.size());
  scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);

  SmallVector<Location> locs(types.size(), loc);
  Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
  Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);

  // Build the "before" region, which effectively consists
  // of a conjunction of "i < upper" tests on all induction variables.
  builder.setInsertionPointToStart(&whileOp.getBefore().front());
  Value cond;
  unsigned o = 0;
  for (auto [tid, dim] : llvm::zip(tids, dims)) {
    if (isCompressedDLT(dimTypes[tid][dim]) ||
        isSingletonDLT(dimTypes[tid][dim])) {
      Value op1 = before->getArgument(o);
      Value op2 = highs[tid][dim];
      Value opc = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
                                                op1, op2);
      cond = cond ? builder.create<arith::AndIOp>(loc, cond, opc) : opc;
      // Update the position with the "after" region block argument.
      pidxs[tid][dim] = after->getArgument(o++);
    }
  }
  builder.create<scf::ConditionOp>(loc, cond, before->getArguments());

  // Generates while body.
  builder.setInsertionPointToStart(&whileOp.getAfter().front());
  Value min;
  for (auto [tid, dim] : llvm::zip(tids, dims)) {
    // Prepares for next level.
    if (isCompressedDLT(dimTypes[tid][dim]) ||
        isSingletonDLT(dimTypes[tid][dim])) {
      Value ptr = idxBuffer[tid][dim];
      Value s = pidxs[tid][dim];
      Value load = genIndexLoad(builder, loc, ptr, s);
      coord[tid][dim] = load;
      if (!needsUniv) {
        if (min) {
          Value cmp = builder.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::ult, load, min);
          min = builder.create<arith::SelectOp>(loc, cmp, load, min);
        } else {
          min = load;
        }
      }
    }
  }

  if (needsUniv) {
    assert(!min);
    // Otherwise, the universal index is the minimal pidx.
    min = after->getArguments().back();
  }

  // Sets up the loop stack.
  loopStack.emplace_back(tids, dims, whileOp, min, loopTag);
  assert(loopStack.size() == loopSeqStack.size());

  // Emits extra locals.
  emitExtraLocalsForTensorsAtDenseDims(builder, loc, tids, dims);

  // Updates reduction variables.
  assert(after->getNumArguments() == o + reduc.size() + (needsUniv ? 1 : 0));
  // In-place update on reduction variable.
  for (unsigned i = 0, e = reduc.size(); i < e; i++)
    reduc[i] = after->getArgument(o + i);

  return whileOp;
}
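
// NOTE (illustrative): for a compressed level, prepareLoopOverTensorAtDim
// below reads the half-open position range from the pointers array, roughly
//   pidxs[tid][dim] = ptrBuffer[pLo]      // start of the segment
//   highs[tid][dim] = ptrBuffer[pLo + 1]  // end of the segment
// e.g., in CSR, row r owns positions [pointers[r], pointers[r + 1]). A
// singleton level instead covers the single position inherited from its
// parent.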
void LoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc,
                                             size_t tid, size_t dim) {
  assert(dimTypes[tid].size() > dim);
  auto dimType = dimTypes[tid][dim];

  if (isDenseDLT(dimType))
    return;

  // Either the first dimension, or the previous dimension has been set.
  assert(dim == 0 || pidxs[tid][dim - 1]);
  Value c0 = constantIndex(builder, loc, 0);
  Value c1 = constantIndex(builder, loc, 1);
  if (isCompressedDLT(dimType)) {
    Value ptr = ptrBuffer[tid][dim];

    Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1];
    pidxs[tid][dim] = genIndexLoad(builder, loc, ptr, pLo);

    Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
    highs[tid][dim] = genIndexLoad(builder, loc, ptr, pHi);
    return;
  }
  if (isSingletonDLT(dimType)) {
    Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1];
    Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);

    pidxs[tid][dim] = pLo;
    highs[tid][dim] = pHi;
    return;
  }

  llvm_unreachable("Unrecognizable dimension type!");
}

void LoopEmitter::emitExtraLocalsForTensorsAtDenseDims(OpBuilder &builder,
                                                       Location loc,
                                                       ArrayRef<size_t> tids,
                                                       ArrayRef<size_t> dims) {
  // Initialize dense positions. Note that we generate dense indices of the
  // output tensor unconditionally, since they may not appear in the lattice,
  // but may be needed for linearized codegen.
  for (auto [tid, dim] : llvm::zip(tids, dims)) {
    if (isDenseDLT(dimTypes[tid][dim])) {
      auto enc = getSparseTensorEncoding(tensors[tid].getType());
      if (enc && !isSparseOutput(tid)) {
        bool validPidx = dim == 0 || pidxs[tid][dim - 1];
        if (!validPidx) {
          // We might not find the pidx for the sparse output tensor as it is
          // unconditionally required by the sparsification.
          assert(isOutputTensor(tid));
          continue;
        }
        pidxs[tid][dim] =
            genAddress(builder, loc, tid, dim, loopStack.back().iv);
        // NOTE: we can also prepare for the next dim here in advance.
      }
    }
  }
}

void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
                              MutableArrayRef<Value> reduc) {
  LoopLevelInfo &loopInfo = loopStack.back();
  auto &dims = loopStack.back().dims;
  auto &tids = loopStack.back().tids;
  auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop);
  if (forOp) {
    if (!reduc.empty()) {
      assert(reduc.size() == forOp.getNumResults());
      rewriter.create<scf::YieldOp>(loc, reduc);
    }
    // Exit the loop.
    rewriter.setInsertionPointAfter(forOp);
    // In-place update reduction variables.
    for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
      reduc[i] = forOp.getResult(i);
  } else {
    auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
    if (!reduc.empty()) {
      assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
      Operation *redExp = reduc.front().getDefiningOp();
      // Reduction expression should have no use.
      assert(redExp->getUses().empty());
      // This must be a binary operation.
      // NOTE: It is the user's responsibility to ensure the operation is
      // commutative.
      assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);

      Value redVal = parOp.getInitVals().front();
      Value curVal;
      if (redExp->getOperand(0) == redVal)
        curVal = redExp->getOperand(1);
      else if (redExp->getOperand(1) == redVal)
        curVal = redExp->getOperand(0);
      // One of the operands must be the init value (which is also the
      // previous reduction value).
      assert(curVal);
      // The reduction expression should be the only user of the reduction
      // value inside the parallel for.
      unsigned numUsers = 0;
      for (Operation *op : redVal.getUsers()) {
        if (op->getParentOp() == parOp)
          numUsers++;
      }
      assert(numUsers == 1);
      (void)numUsers; // to silence unused variable warning in release build

      rewriter.setInsertionPointAfter(redExp);
      auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
      // Attach to the reduction op.
      Block *redBlock = &redOp.getRegion().getBlocks().front();
      rewriter.setInsertionPointToEnd(redBlock);
      Operation *newRed = rewriter.clone(*redExp);
      // Replaces arguments of the reduction expression by using the block
      // arguments from scf.reduce.
      rewriter.updateRootInPlace(
          newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
      // Erases the out-dated reduction expression.
      rewriter.eraseOp(redExp);
      rewriter.setInsertionPointToEnd(redBlock);
      rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
    }
    rewriter.setInsertionPointAfter(parOp);
    // In-place update reduction variables.
    for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
      reduc[i] = parOp.getResult(i);
  }

  // Finished iterating a tensor; clean up.
  // We only do the clean-up on for loops, as while loops do not necessarily
  // finish the iteration on a sparse tensor.
  for (auto [tid, dim] : llvm::zip(tids, dims)) {
    // Reset to null.
    coord[tid][dim] = Value();
    pidxs[tid][dim] = Value();
    // Dense dimension, high is fixed.
    if (!isDenseDLT(dimTypes[tid][dim]))
      highs[tid][dim] = Value();
  }
}
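
// NOTE (illustrative): when exiting a co-iteration loop, each sparse tensor
// advances its position only if its coordinate matched the selected induction
// value. A rough sketch of the yielded position (schematic, not verbatim):
//   %cmp = arith.cmpi eq, %coord, %iv
//   %add = arith.addi %pidx, %c1
//   %new_pidx = arith.select %cmp, %add, %pidx
// The (optional) universal index is unconditionally incremented by one.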
void LoopEmitter::exitCoIterationLoop(OpBuilder &builder, Location loc,
                                      MutableArrayRef<Value> reduc) {
  auto whileOp = llvm::cast<scf::WhileOp>(loopStack.back().loop);
  auto &dims = loopStack.back().dims;
  auto &tids = loopStack.back().tids;
  Value iv = loopStack.back().iv;
  // Generate the while-loop induction at the end.
  builder.setInsertionPointToEnd(&whileOp.getAfter().front());
  // Finalize the induction. Note that the induction could be performed
  // in the individual if-branches to avoid re-evaluating the conditions.
  // However, that would result in a rather elaborate forest of yield
  // instructions during code generation. Moreover, performing the induction
  // after the if-statements more closely resembles code generated by TACO.
  unsigned o = 0;
  SmallVector<Value> operands;
  Value one = constantIndex(builder, loc, 1);
  for (auto [tid, dim] : llvm::zip(tids, dims)) {
    if (isCompressedDLT(dimTypes[tid][dim]) ||
        isSingletonDLT(dimTypes[tid][dim])) {
      Value op1 = coord[tid][dim];
      Value op3 = pidxs[tid][dim];
      Value cmp = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                                op1, iv);
      Value add = builder.create<arith::AddIOp>(loc, op3, one);
      operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, op3));
      // Following loops continue iteration from the break point of the
      // current while loop.
      pidxs[tid][dim] = whileOp->getResult(o++);
      // The coordinates are invalid now.
      coord[tid][dim] = nullptr;
      // highs remains unchanged.
    }
  }

  // Reduction value from users.
  for (auto &i : reduc) {
    operands.push_back(i);
    // In-place update reduction variable.
    i = whileOp->getResult(o++);
  }

  // An (optional) universal index.
  if (operands.size() < whileOp.getNumResults()) {
    assert(operands.size() + 1 == whileOp.getNumResults());
    // The last one is the universal index.
    operands.push_back(builder.create<arith::AddIOp>(loc, iv, one));
    // Update the loop starting point of the current loop sequence.
    loopSeqStack.back() = whileOp->getResult(o++);
  }

  assert(o == operands.size());
  builder.create<scf::YieldOp>(loc, operands);
  builder.setInsertionPointAfter(whileOp);
}

void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc,
                                  MutableArrayRef<Value> reduc) {
  // Clean up the values; this helps us discover potential bugs at an earlier
  // stage (instead of silently using a wrong value).
  LoopLevelInfo &loopInfo = loopStack.back();
  assert(loopInfo.tids.size() == loopInfo.dims.size());
  SmallVector<Value> red;
  if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
    exitCoIterationLoop(rewriter, loc, reduc);
  } else {
    exitForLoop(rewriter, loc, reduc);
  }

  assert(loopStack.size() == loopSeqStack.size());
  loopStack.pop_back();
}