//===- Transforms.cpp - Linalg transformations as patterns ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements logic and helpers to expose Linalg transforms as rewrite
// patterns.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <type_traits>
#include <utility>

#define DEBUG_TYPE "linalg-transforms"

using namespace mlir;
using namespace mlir::linalg;

#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")

//===----------------------------------------------------------------------===//
// Transformations exposed as rewrite patterns.
//===----------------------------------------------------------------------===// LinalgTilingOptions & mlir::linalg::LinalgTilingOptions::setTileSizes(ArrayRef ts) { assert(!tileSizeComputationFunction && "tile sizes already set"); SmallVector tileSizes(ts.begin(), ts.end()); tileSizeComputationFunction = [tileSizes](OpBuilder &b, Operation *op) { OpBuilder::InsertionGuard guard(b); b.setInsertionPointToStart( &op->getParentOfType().getBody().front()); return llvm::to_vector<4>(map_range(tileSizes, [&](int64_t s) { Value v = b.create(op->getLoc(), s); return v; })); }; return *this; } /// Pad the `opOperand` in the `paddingDimensions` using the padding value and /// the nofold flag found in `paddingValues` and `packPaddings`, respectively. /// Exit early and return the `opOperand` value if the shape dimensions that /// match `paddingDimensions` have a static size and the nofold flag is not set. /// Otherwise, try to pad the shape dimensions that match the iterator /// dimensions `paddingDimensions` and return the tensor::PadOp result if /// padding succeeds or failure otherwise. static FailureOr padOperandToSmallestStaticBoundingBox( OpBuilder &b, linalg::LinalgOp opToPad, OpOperand *opOperand, ArrayRef paddingDimensions, ArrayRef paddingValues, ArrayRef packPaddings) { AffineMap indexingMap = opToPad.getMatchingIndexingMap(opOperand); ArrayRef shape = opToPad.getShape(opOperand); // Collect the shape dimension that are a function of the `paddingDimensions`. llvm::SmallDenseSet shapeDimsToPad; for (int64_t dim : paddingDimensions) for (const auto &en : enumerate(indexingMap.getResults())) if (en.value().isFunctionOfDim(dim)) shapeDimsToPad.insert(en.index()); // Return the unpadded operand if padding to a static shape is not needed and // if the nofold flag is not set. bool nofold = opOperand->getOperandNumber() < packPaddings.size() ? 
packPaddings[opOperand->getOperandNumber()] : false; bool hasStaticShape = llvm::none_of(shapeDimsToPad, [&](int64_t dim) { return ShapedType::isDynamic(shape[dim]); }); if (!nofold && hasStaticShape) return opOperand->get(); // Fail if `paddingValues` specifies no padding value. if (opOperand->getOperandNumber() >= paddingValues.size()) return failure(); Attribute paddingAttr = paddingValues[opOperand->getOperandNumber()]; Type paddingType = b.getType(); if (auto typedAttr = paddingAttr.dyn_cast()) paddingType = typedAttr.getType(); Value paddingValue = b.create(opToPad.getLoc(), paddingType, paddingAttr); // Follow the use-def chain if `currOpOperand` is defined by a LinalgOp. OpOperand *currOpOperand = opOperand; while (auto linalgOp = currOpOperand->get().getDefiningOp()) { OpResult result = currOpOperand->get().cast(); currOpOperand = linalgOp.getDpsInitOperand(result.getResultNumber()); } // Fail if `currOpOperand` is not defined by an ExtractSliceOp. auto sliceOp = currOpOperand->get().getDefiningOp(); if (!sliceOp) return failure(); // Compute the dropped dimensions if `sliceOp` is ranke-reducing. llvm::SmallBitVector droppedDims = sliceOp.getDroppedDims(); OffsetSizeAndStrideOpInterface shapedOp = sliceOp; // Upper bound the `sliceOp` sizes to obtain a static bounding box. SmallVector paddedShape(shape.begin(), shape.end()); int64_t shapeIdx = 0; for (const auto &en : enumerate(shapedOp.getMixedSizes())) { // Skip dropped dimensions. if (droppedDims.test(en.index())) continue; // Skip dimensions that do not require padding. if (!shapeDimsToPad.contains(shapeIdx)) { shapeIdx++; continue; } // If the size is an attribute add it directly to `paddedShape`. if (en.value().is()) { paddedShape[shapeIdx++] = en.value().get().dyn_cast().getInt(); continue; } // Otherwise, try to compute a constant upper bound for the size value. 
FailureOr upperBound = getConstantUpperBoundForIndex(en.value().get()); if (failed(upperBound)) { LLVM_DEBUG(DBGS() << "No constant bounding box can be found for padding"); return failure(); } paddedShape[shapeIdx++] = *upperBound; } assert(shapeIdx == static_cast(shape.size()) && "expect the dynamic and static ranks to match"); // Pad the operand to the bounding box defined by `paddedShape`. auto paddedTensorType = RankedTensorType::get( paddedShape, getElementTypeOrSelf(opOperand->get())); return makeComposedPadHighOp(b, opToPad->getLoc(), paddedTensorType, opOperand->get(), paddingValue, nofold); } FailureOr> linalg::rewriteAsPaddedOp(OpBuilder &b, LinalgOp opToPad, ArrayRef paddingDimensions, ArrayRef paddingValues, ArrayRef packPaddings, LinalgOp &paddedOp) { Location loc = opToPad->getLoc(); // TODO: there are cases where we may still want to pad to larger sizes. assert(opToPad.hasTensorSemantics() && "expected operation to have tensor semantics"); OpBuilder::InsertionGuard g(b); // Set IP after op because we also take the dims of the original output. b.setInsertionPointAfter(opToPad); // Make a copy of the shaped operands and update it. SmallVector newOperands; newOperands.reserve(opToPad->getNumOperands()); for (OpOperand &opOperand : opToPad->getOpOperands()) { FailureOr paddedOperand = padOperandToSmallestStaticBoundingBox( b, opToPad, &opOperand, paddingDimensions, paddingValues, packPaddings); // Exit if `paddingDimensions` cannot be bounded statically. if (failed(paddedOperand)) return failure(); newOperands.push_back(*paddedOperand); } SmallVector> reifiedResultShapes; if (failed(cast(opToPad.getOperation()) .reifyResultShapes(b, reifiedResultShapes))) return failure(); assert(reifiedResultShapes.size() == opToPad->getNumResults() && "expected same number of results"); // Clone `opToPad` to operate on the statically padded shapes. 
auto resultTensorTypes = ValueRange(newOperands).take_back(opToPad.getNumDpsInits()).getTypes(); paddedOp = clone(b, opToPad, resultTensorTypes, newOperands); // Recover the slice out of the new static results. This keeps the original // linalg op around because it uses the dims of the original results. SmallVector paddedSubviewResults; paddedSubviewResults.reserve(opToPad->getNumResults()); for (const auto &en : llvm::enumerate(paddedOp->getResults())) { Value paddedResult = en.value(); int64_t resultNumber = en.index(); int64_t rank = paddedResult.getType().cast().getRank(); SmallVector offsets(rank, b.getIndexAttr(0)); SmallVector sizes; for (Value v : reifiedResultShapes[resultNumber]) sizes.push_back(getAsOpFoldResult(v)); SmallVector strides(rank, b.getIndexAttr(1)); paddedSubviewResults.push_back(b.create( loc, paddedResult, offsets, sizes, strides)); } return paddedSubviewResults; } /// Try to peel a loop `op` and return the new result. // TODO: Add support for scf.parallel and affine.for loops. SmallVector mlir::linalg::peelLoop(RewriterBase &rewriter, Operation *op) { return llvm::TypeSwitch>(op) .Case([&](scf::ForOp forOp) { scf::ForOp partialIteration; if (succeeded(scf::peelAndCanonicalizeForLoop(rewriter, forOp, partialIteration))) return partialIteration->getResults(); assert(!partialIteration && "expected that loop was not peeled"); return forOp->getResults(); }) .Default([&](Operation *op) { return op->getResults(); }); } /// Peel and canonicalize 'loops'. void mlir::linalg::peelLoops(RewriterBase &rewriter, ArrayRef loops) { for (auto loopOp : loops) peelLoop(rewriter, loopOp); } /// Linalg padding pattern. 
mlir::linalg::LinalgPaddingPattern::LinalgPaddingPattern( MLIRContext *context, LinalgPaddingOptions options, PatternBenefit benefit) : OpInterfaceRewritePattern(context, benefit), options(std::move(options)) {} FailureOr mlir::linalg::LinalgPaddingPattern::returningMatchAndRewrite( LinalgOp linalgOp, PatternRewriter &rewriter) const { if (!linalgOp.hasTensorSemantics()) return failure(); // Pad the operation. LinalgOp paddedOp; FailureOr> newResults = rewriteAsPaddedOp(rewriter, linalgOp, options.paddingDimensions, options.paddingValues, options.packPaddings, paddedOp); if (failed(newResults)) return failure(); // Hoist the padding. for (const auto &en : enumerate(options.hoistPaddings)) { if (static_cast(en.index()) >= paddedOp->getNumOperands()) break; OpOperand &opOperand = paddedOp->getOpOperand(en.index()); auto padOp = opOperand.get().getDefiningOp(); if (!padOp || en.value() == 0) continue; // Fail hoisting if the operand shape is not fully static. if (llvm::any_of(paddedOp.getShape(&opOperand), ShapedType::isDynamic)) return failure(); tensor::PadOp hoistedOp; SmallVector transposeOps; SmallVector transposeVector = en.index() < options.transposePaddings.size() ? options.transposePaddings[en.index()] : SmallVector{}; FailureOr newResult = hoistPaddingOnTensors( padOp, en.value(), transposeVector, hoistedOp, transposeOps); if (failed(newResult)) continue; rewriter.replaceOp(padOp, *newResult); } // Replace the original operation to pad. rewriter.replaceOp(linalgOp, *newResults); return paddedOp; } LogicalResult mlir::linalg::CopyVectorizationPattern::matchAndRewrite( memref::CopyOp copyOp, PatternRewriter &rewriter) const { return vectorizeCopy(rewriter, copyOp); } static SmallVector getNParallelLoopsAttrs(unsigned nParallelLoops) { return SmallVector(nParallelLoops, utils::IteratorType::parallel); } /// Rewrite a tensor::PadOp into a sequence of EmptyOp, FillOp (to /// initialize with pad_val) and GenericOp (to copy contents). 
///
/// Matches only when both the source and the result type have a static shape
/// and the yielded padding value is defined outside the body block of `padOp`;
/// returns failure otherwise.
LogicalResult
PadOpTransformationPattern::matchAndRewrite(tensor::PadOp padOp,
                                            PatternRewriter &rewriter) const {
  // NOTE(review): this function was recovered from a copy of the file in which
  // line breaks and all `<...>` template arguments had been lost. The template
  // arguments below were reconstructed - confirm them against the upstream
  // revision before landing.
  auto inputShapedType = padOp.getSource().getType().cast<ShapedType>();
  auto resultShapedType = padOp.getResult().getType().cast<ShapedType>();

  // Bail on non-static shapes.
  if (!inputShapedType.hasStaticShape())
    return failure();
  if (!resultShapedType.hasStaticShape())
    return failure();

  // Only support padding with a constant for now, i.e. either:
  //   1. A BBarg from a different block.
  //   2. A value defined outside of the current block.
  Block &block = padOp.getRegion().front();
  auto yieldOp = cast<tensor::YieldOp>(block.getTerminator());
  Value padValue = yieldOp.getValue();
  Operation *definingOp = padValue.getDefiningOp();
  if (definingOp && definingOp->getBlock() == &block)
    return failure();
  if (!definingOp && padValue.cast<BlockArgument>().getOwner() == &block)
    return failure();

  // Create tensor with the padded shape.
  // The previous revision also built an `indices` vector here that was never
  // read; it only left a dead index constant behind, so it has been removed.
  Location loc = padOp.getLoc();
  Value emptyTensor = rewriter.create<tensor::EmptyOp>(
      loc, resultShapedType.getShape(), resultShapedType.getElementType());

  // Initialize tensor with the pad value
  Value tmpTensor = rewriter
                        .create<linalg::FillOp>(loc, ValueRange{padValue},
                                                ValueRange{emptyTensor})
                        .result();

  // Copy original contents into new tensor
  // Uses linalg.generic, but could be done with tensor.insert_slice
  const int64_t resultRank = resultShapedType.getRank();
  SmallVector<AffineExpr, 4> outputExprs;
  outputExprs.reserve(resultRank);
  // Each source element lands at its own index shifted by the static low pad.
  for (int64_t i = 0; i < resultRank; ++i) {
    outputExprs.push_back(getAffineDimExpr(i, rewriter.getContext()) +
                          padOp.getStaticLow()[i]);
  }

  SmallVector<AffineMap, 2> transferMaps = {
      rewriter.getMultiDimIdentityMap(inputShapedType.getRank()),
      AffineMap::get(resultRank,
                     /*symbolCount=*/0, outputExprs, rewriter.getContext())};

  rewriter.replaceOpWithNewOp<linalg::GenericOp>(
      padOp, resultShapedType, padOp.getSource(), tmpTensor, transferMaps,
      getNParallelLoopsAttrs(resultRank),
      [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) {
        // Forward the source element unchanged.
        nestedBuilder.create<linalg::YieldOp>(nestedLoc, args[0]);
      });

  return success();
}
/// Filling `dest` using FillOp constant padding value if possible. /// Otherwise, generate a tensor::GenerateOp. Value GeneralizePadOpPattern::createFillOrGenerateOp( PatternRewriter &rewriter, tensor::PadOp padOp, Value dest, const SmallVector &dynSizes) const { auto padValue = padOp.getConstantPaddingValue(); if (padValue) return rewriter.create(padOp.getLoc(), padValue, dest).result(); // Fill could not be optimized: Lower to tensor::GenerateOp with region. auto generateOp = rewriter.create( padOp.getLoc(), padOp.getResultType(), dynSizes); // Copy region to new op. BlockAndValueMapping bvm; padOp.getRegion().cloneInto(&generateOp.getRegion(), bvm); return generateOp; } LogicalResult GeneralizePadOpPattern::matchAndRewrite(tensor::PadOp padOp, PatternRewriter &rewriter) const { // Given an OpFoldResult, return an index-typed value. auto getIdxValue = [&](OpFoldResult ofr) { if (auto val = ofr.dyn_cast()) return val; return rewriter .create( padOp.getLoc(), ofr.get().cast().getInt()) .getResult(); }; auto resultType = padOp.getResultType(); // Compute size of EmptyOp. Any combination of static/dynamic is supported. SmallVector dynSizes; SmallVector staticSizes; for (unsigned dim = 0; dim < resultType.getRank(); ++dim) { if (resultType.isDynamicDim(dim)) { auto srcSize = rewriter.createOrFold( padOp.getLoc(), padOp.getSource(), dim); // Add low and high padding value. auto plusLow = rewriter.createOrFold( padOp.getLoc(), srcSize, getIdxValue(padOp.getMixedLowPad()[dim])); auto plusHigh = rewriter.createOrFold( padOp.getLoc(), plusLow, getIdxValue(padOp.getMixedHighPad()[dim])); dynSizes.push_back(plusHigh); } staticSizes.push_back(resultType.getDimSize(dim)); } // Init tensor and fill it with padding. Value emptyTensor = rewriter.create( padOp.getLoc(), staticSizes, resultType.getElementType(), dynSizes); Value fill = createFillOrGenerateOp(rewriter, padOp, emptyTensor, dynSizes); // Try optimize the copy of source. 
if (optimizeCopyFn && optimizeCopyFn(rewriter, padOp, fill).succeeded()) return success(); // tensor::PadOps cannot be optimized. Generate a InsertSliceOp instead // for copying the PadOp source. auto sourceType = padOp.getSourceType(); // Compute size of source of tensor::PadOp. SmallVector srcSizes; for (unsigned dim = 0; dim < sourceType.getRank(); ++dim) { if (sourceType.isDynamicDim(dim)) { srcSizes.push_back(rewriter.createOrFold( padOp.getLoc(), padOp.getSource(), dim)); } else { srcSizes.push_back(rewriter.getIndexAttr(sourceType.getDimSize(dim))); } } // Strides of InsertSliceOp are all 1. SmallVector strides(sourceType.getRank(), rewriter.getIndexAttr(1)); rewriter.replaceOpWithNewOp( padOp, padOp.getSource(), fill, padOp.getMixedLowPad(), srcSizes, strides); return success(); } LogicalResult ExtractSliceOfPadTensorSwapPattern::matchAndRewrite( tensor::ExtractSliceOp sliceOp, PatternRewriter &rewriter) const { if (!sliceOp.hasUnitStride()) return failure(); auto padOp = sliceOp.getSource().getDefiningOp(); if (!padOp) return failure(); bool zeroSliceGuard = true; if (controlFn) { if (Optional control = controlFn(sliceOp)) zeroSliceGuard = *control; else return failure(); } Operation *tiledPadOp = tensor::bubbleUpPadSlice(rewriter, padOp, sliceOp.getMixedOffsets(), sliceOp.getMixedSizes(), zeroSliceGuard); // All shapes are static and the data source is actually used. Rewrite into // pad(extract_slice(x)). rewriter.replaceOp(sliceOp, tiledPadOp->getResults()); return success(); } // The following are patterns for downscaling convolution ops with size-1 // window dimensions. // // Note that we'd eventually want to write such transformations in a generic // way, e.g., converting to linalg.generic, removing the size-1 dimensions, // and then turning back to named ops. But for now it's fine to have a few // patterns matching special ops to get started. 
template FailureOr DownscaleSizeOneWindowed2DConvolution:: returningMatchAndRewrite(Conv2DOp convOp, PatternRewriter &rewriter) const { if (convOp.hasBufferSemantics()) return failure(); // To be implemented. Value input = convOp.getInputs().front(); Value kernel = convOp.getInputs().back(); Value output = convOp.getOutputs().front(); auto inputType = input.getType().dyn_cast(); auto kernelType = kernel.getType().dyn_cast(); auto outputType = output.getType().dyn_cast(); auto kernelShape = kernelType.getShape(); auto outputShape = outputType.getShape(); // Get domain indices based on conv2D layout. int khIndex, kwIndex, ohIndex, owIndex; TypeSwitch(convOp) .Case([&](linalg::Conv2DNhwcHwcfOp op) { khIndex = 0; kwIndex = 1; ohIndex = 1; owIndex = 2; }) .Case([&](linalg::Conv2DNchwFchwOp op) { khIndex = 2; kwIndex = 3; ohIndex = 2; owIndex = 3; }) .Default([&](Operation *op) { llvm_unreachable("unexpected conv2d operation."); }); // Only handle the case where at least one of the window dimensions is // of size 1. Other cases can rely on tiling to reduce to such cases. int64_t khSize = kernelShape[khIndex], kwSize = kernelShape[kwIndex]; int64_t ohSize = outputShape[ohIndex], owSize = outputShape[owIndex]; bool removeH = (khSize == 1 && ohSize == 1); bool removeW = (kwSize == 1 && owSize == 1); if (!removeH && !removeW) return failure(); // Get new shapes and types for all operands by removing the size-1 // dimension. using RTTBuilder = RankedTensorType::Builder; RankedTensorType newInputType = RTTBuilder(inputType).dropDim((removeH ? ohIndex : owIndex)); RankedTensorType newKernelType = RTTBuilder(kernelType).dropDim((removeH ? khIndex : kwIndex)); RankedTensorType newOutputType = RTTBuilder(outputType).dropDim((removeH ? ohIndex : owIndex)); // Rank-reduce operands. 
Location loc = convOp.getLoc(); Value newInput = tensor::createCanonicalRankReducingExtractSliceOp( rewriter, loc, input, newInputType); Value newKernel = tensor::createCanonicalRankReducingExtractSliceOp( rewriter, loc, kernel, newKernelType); Value newOutput = tensor::createCanonicalRankReducingExtractSliceOp( rewriter, loc, output, newOutputType); // Rank-reduce strides and dilations too. // TODO: dropDim 1-liner helper. auto strides = llvm::to_vector<4>(convOp.getStrides().template getValues()); strides.erase(strides.begin() + (removeH ? 0 : 1)); auto stridesAttr = rewriter.getI64VectorAttr(strides); auto dilations = llvm::to_vector<4>(convOp.getDilations().template getValues()); dilations.erase(dilations.begin() + (removeH ? 0 : 1)); auto dilationsAttr = rewriter.getI64VectorAttr(dilations); auto conv1DOp = rewriter.create( loc, newOutputType, ValueRange{newInput, newKernel}, ValueRange{newOutput}, stridesAttr, dilationsAttr); // Insert back. Value inserted = tensor::createCanonicalRankReducingInsertSliceOp( rewriter, loc, conv1DOp.getResult(0), output); rewriter.replaceOp(convOp, inserted); return conv1DOp; } template struct linalg::DownscaleSizeOneWindowed2DConvolution; template struct linalg::DownscaleSizeOneWindowed2DConvolution; FailureOr DownscaleDepthwiseConv2DNhwcHwcOp::returningMatchAndRewrite( DepthwiseConv2DNhwcHwcOp convOp, PatternRewriter &rewriter) const { if (convOp.hasBufferSemantics()) return failure(); // To be implemented. Value input = convOp.getInputs().front(); Value kernel = convOp.getInputs().back(); Value output = convOp.getOutputs().front(); auto inputType = input.getType().dyn_cast(); auto kernelType = kernel.getType().dyn_cast(); auto outputType = output.getType().dyn_cast(); auto kernelShape = kernelType.getShape(); auto outputShape = outputType.getShape(); // Only handle the case where at least one of the window dimensions is // of size 1. Other cases can rely on tiling to reduce to such cases. 
int64_t khSize = kernelShape[0], kwSize = kernelShape[1]; int64_t ohSize = outputShape[1], owSize = outputShape[2]; bool removeH = (khSize == 1 && ohSize == 1); bool removeW = (kwSize == 1 && owSize == 1); if (!removeH && !removeW) return failure(); // Get new shapes and types for all operands by removing the size-1 // dimension. using RTTBuilder = RankedTensorType::Builder; RankedTensorType newInputType = RTTBuilder(inputType).dropDim((removeH ? 1 : 2)); RankedTensorType newKernelType = RTTBuilder(kernelType).dropDim((removeH ? 0 : 1)); RankedTensorType newOutputType = RTTBuilder(outputType).dropDim(removeH ? 1 : 2); // Rank-reduce operands. Location loc = convOp.getLoc(); Value newInput = tensor::createCanonicalRankReducingExtractSliceOp( rewriter, loc, input, newInputType); Value newKernel = tensor::createCanonicalRankReducingExtractSliceOp( rewriter, loc, kernel, newKernelType); Value newOutput = tensor::createCanonicalRankReducingExtractSliceOp( rewriter, loc, output, newOutputType); // Rank-reduce strides and dilations too. // TODO: dropDim 1-liner helper. auto strides = llvm::to_vector<4>(convOp.getStrides().getValues()); strides.erase(strides.begin() + (removeH ? 0 : 1)); auto stridesAttr = rewriter.getI64VectorAttr(strides); auto dilations = llvm::to_vector<4>(convOp.getDilations().getValues()); dilations.erase(dilations.begin() + (removeH ? 0 : 1)); auto dilationsAttr = rewriter.getI64VectorAttr(dilations); auto conv1DOp = rewriter.create( loc, newOutputType, ValueRange{newInput, newKernel}, ValueRange{newOutput}, stridesAttr, dilationsAttr); // Insert back. Value inserted = tensor::createCanonicalRankReducingInsertSliceOp( rewriter, loc, conv1DOp.getResult(0), output); rewriter.replaceOp(convOp, inserted); return conv1DOp; } void linalg::populateDecomposeConvolutionPatterns(RewritePatternSet &patterns, PatternBenefit benefit) { patterns.add, DownscaleSizeOneWindowed2DConvolution, DownscaleDepthwiseConv2DNhwcHwcOp>(patterns.getContext(), benefit); }