//===- Vectorize.cpp - Vectorize Pass Impl ----------------------*- C++ -*-===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements vectorization of loops, operations and data types to
// a target-independent, n-D virtual vector abstraction.
//
//===----------------------------------------------------------------------===//

#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/MLFunctionMatcher.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLValue.h"
#include "mlir/IR/SSAValue.h"
#include "mlir/IR/Types.h"
#include "mlir/Pass.h"
#include "mlir/StandardOps/StandardOps.h"
#include "mlir/Support/Functional.h"
#include "mlir/Transforms/Passes.h"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace mlir;

/// This pass implements a high-level vectorization strategy at the MLFunction
/// level. This is implemented by:
///   1. matching arbitrarily nested loop patterns that are vectorizable;
///   2. analyzing those patterns for profitability;
///   3. applying those patterns iteratively by coarsening the loops and
///      inserting a single explicit vector element AllocOp and an unaligned
///      load/store operation. The full semantics of this unaligned load/store
///      are still TBD.
///
/// Loop transformation:
/// ====================
/// The choice of loop transformation to apply for coarsening vectorized loops
/// is still subject to exploratory tradeoffs. In particular, say we want to
/// vectorize by a factor of 128; we want to transform the following input:
///   for %i = %M to %N {
///     %a = load(f(i))
///
/// Traditionally, one would vectorize late (after scheduling, tiling, memory
/// promotion, etc.), say after stripmining (and potentially unrolling in the
/// case of LLVM's SLP vectorizer):
///   for %i = floor(%M, 128) to ceil(%N, 128) {
///     for %ii = max(%M, 128 * %i) to min(%N, 128 * %i + 127) {
///       load/store(f(ii))
///
/// We seek to vectorize early and freeze vector types before scheduling, so
/// we want to generate a pattern that resembles:
///   for %i = ? to ? step ? {
///     unaligned_load/unaligned_store(g(i))
///
/// Two alternatives were considered for expressing this coarsening:
///   i. simply dividing the lower / upper bounds by 128 creates issues
///      with representing expressions such as ii + 1 because now we only
///      have access to original values that have been divided. Additional
///      information is needed to specify accesses at below-128 granularity;
///   ii. another alternative is to coarsen the loop step, but this may have
///      consequences on dependence analysis and fusability of loops: fusable
///      loops probably need to have the same step (because we don't want to
///      stripmine/unroll to enable fusion).
/// As a consequence, we choose to represent the coarsening using the loop
/// step for now and reevaluate in the future. Note that we can renormalize
/// loop steps later if/when we have evidence that they are problematic.
///
/// For the simple strawman example above, vectorizing for a 1-D vector
/// abstraction of size 128 returns code similar to:
///   %c0 = constant 0 : index
///   for %i = %M to %N + 127 step 128 {
///     %a = alloc() : memref<1xvector<128xf32>>
///     %r = "n_d_unaligned_load"(%tensor, %i, %a, %c0)
///     %16 = load %a[%c0] : memref<1xvector<128xf32>>
///
/// Note this is still work in progress and not yet functional.

#define DEBUG_TYPE "early-vect"

static cl::list<int> clVirtualVectorSize(
    "virtual-vector-size",
    cl::desc("Specify n-D virtual vector size for vectorization"),
    cl::ZeroOrMore);

static cl::list<int> clFastestVaryingPattern(
    "test-fastest-varying",
    cl::desc("Specify a 1-D, 2-D or 3-D pattern of fastest varying memory "
             "dimensions to match. See defaultPatterns in Vectorize.cpp for a "
             "description and examples. This is used for testing purposes"),
    cl::ZeroOrMore);

/// Forward declaration.
static FilterFunctionType
isVectorizableLoopPtrFactory(unsigned fastestVaryingMemRefDimension);

/// Builds a bunch of predetermined patterns that will be traversed in order.
/// Due to the recursive nature of MLFunctionMatchers, this captures
/// arbitrarily nested pairs of loops at any position in the tree.
/// Note that this currently only matches 2 nested loops and will be extended.
// TODO(ntv): support 3-D loop patterns with a common reduction loop that can
// be matched to GEMMs.
static std::vector<MLFunctionMatcher> defaultPatterns() {
  using matcher::For;
  return std::vector<MLFunctionMatcher>{
      // 3-D patterns
      For(isVectorizableLoopPtrFactory(2),
          For(isVectorizableLoopPtrFactory(1),
              For(isVectorizableLoopPtrFactory(0)))),
      // for i { for j { A[??f(not i, not j), f(i, not j), f(not i, j)];}}
      // test independently with:
      //   --test-fastest-varying=1 --test-fastest-varying=0
      For(isVectorizableLoopPtrFactory(1),
          For(isVectorizableLoopPtrFactory(0))),
      // for i { for j { A[??f(not i, not j), f(i, not j), ?, f(not i, j)];}}
      // test independently with:
      //   --test-fastest-varying=2 --test-fastest-varying=0
      For(isVectorizableLoopPtrFactory(2),
          For(isVectorizableLoopPtrFactory(0))),
      // for i { for j { A[??f(not i, not j), f(i, not j), ?, ?, f(not i, j)];}}
      // test independently with:
      //   --test-fastest-varying=3 --test-fastest-varying=0
      For(isVectorizableLoopPtrFactory(3),
          For(isVectorizableLoopPtrFactory(0))),
      // for i { for j { A[??f(not i, not j), f(not i, j), f(i, not j)];}}
      // test independently with:
      //   --test-fastest-varying=0 --test-fastest-varying=1
      For(isVectorizableLoopPtrFactory(0),
          For(isVectorizableLoopPtrFactory(1))),
      // for i { for j { A[??f(not i, not j), f(not i, j), ?, f(i, not j)];}}
      // test independently with:
      //   --test-fastest-varying=0 --test-fastest-varying=2
      For(isVectorizableLoopPtrFactory(0),
          For(isVectorizableLoopPtrFactory(2))),
      // for i { for j { A[??f(not i, not j), f(not i, j), ?, ?, f(i, not j)];}}
      // test independently with:
      //   --test-fastest-varying=0 --test-fastest-varying=3
      For(isVectorizableLoopPtrFactory(0),
          For(isVectorizableLoopPtrFactory(3))),
      // for i { A[??f(not i), f(i)];}
      // test independently with: --test-fastest-varying=0
      For(isVectorizableLoopPtrFactory(0)),
      // for i { A[??f(not i), f(i), ?];}
      // test independently with: --test-fastest-varying=1
      For(isVectorizableLoopPtrFactory(1)),
      // for i { A[??f(not i), f(i), ?, ?];}
      // test independently with: --test-fastest-varying=2
      For(isVectorizableLoopPtrFactory(2)),
      // for i { A[??f(not i), f(i), ?, ?, ?];}
      // test independently with: --test-fastest-varying=3
      For(isVectorizableLoopPtrFactory(3))};
}

static std::vector<MLFunctionMatcher> makePatterns() {
  using matcher::For;
  if (clFastestVaryingPattern.empty()) {
    return defaultPatterns();
  }
  switch (clFastestVaryingPattern.size()) {
  case 1:
    return {For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]))};
  case 2:
    return {For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]),
                For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[1])))};
  case 3:
    return {For(
        isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]),
        For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[1]),
            For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[2]))))};
  default:
    assert(false && "Only up to 3-D fastest varying pattern supported atm");
  }
  return std::vector<MLFunctionMatcher>();
}

namespace {

struct Vectorize : public FunctionPass {
  Vectorize() : FunctionPass(&Vectorize::passID) {}

  PassResult runOnMLFunction(MLFunction *f) override;

  // Thread-safe RAII context local to pass, BumpPtrAllocator freed on exit.
  MLFunctionMatcherContext MLContext;

  static char passID;
};

} // end anonymous namespace

char Vectorize::passID = 0;

/////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate. //////
namespace {

struct Strategy {
  DenseMap<ForStmt *, int> loopToVectorDim;
};

} // end anonymous namespace

/// Implements a simple strawman strategy for vectorization.
/// Given a matched pattern `matches` of depth `patternDepth`, this strategy
/// greedily assigns the fastest varying dimension **of the vector** to the
/// innermost loop in the pattern.
/// When coupled with a pattern that looks for the fastest varying dimension
/// in load/store MemRefs, this creates a generic vectorization strategy that
/// works for any loop in a hierarchy (outermost, innermost or intermediate).
///
/// TODO(ntv): In the future we should additionally increase the power of the
/// profitability analysis along 3 directions:
///   1. account for loop extents (both static and parametric + annotations);
///   2. account for data layout permutations;
///   3. account for impact of vectorization on maximal loop fusion.
/// Then we can quantify the above to build a cost model and search over
/// strategies.
static bool analyzeProfitability(MLFunctionMatches matches,
                                 unsigned depthInPattern,
                                 unsigned patternDepth, Strategy *strategy) {
  for (auto m : matches) {
    auto *loop = cast<ForStmt>(m.first);
    LLVM_DEBUG(dbgs() << "[early-vect][profitability] patternDepth: "
                      << patternDepth << " depthInPattern: " << depthInPattern
                      << " loop ");
    LLVM_DEBUG(loop->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    bool fail = analyzeProfitability(m.second, depthInPattern + 1, patternDepth,
                                     strategy);
    if (fail) {
      return fail;
    }
    assert(patternDepth > depthInPattern);
    if (patternDepth - depthInPattern <= clVirtualVectorSize.size()) {
      strategy->loopToVectorDim[loop] =
          clVirtualVectorSize.size() - (patternDepth - depthInPattern);
      LLVM_DEBUG(dbgs() << "[early-vect][profitability] vectorize @ "
                        << strategy->loopToVectorDim[loop] << " loop ");
      LLVM_DEBUG(loop->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
    } else {
      // Don't vectorize.
      strategy->loopToVectorDim[loop] = -1;
    }
  }
  return false;
}

///// end TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate /////

////// TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. ////

/// Gets a MemRefType of 1 vector with the same elemental type as `tmpl` and
/// sizes specified by `vectorSizes`.
/// The MemRef lives in the same memory space as `tmpl`. The MemRef should be
/// promoted to a closer memory address space in a later pass.
static MemRefType getVectorizedMemRefType(MemRefType tmpl,
                                          ArrayRef<int> vectorSizes) {
  auto elementType = tmpl.getElementType();
  assert(!elementType.dyn_cast<VectorType>() &&
         "Can't vectorize an already vector type");
  assert(tmpl.getAffineMaps().empty() &&
         "Unsupported non-implicit identity map");
  return MemRefType::get({1}, VectorType::get(vectorSizes, elementType), {},
                         tmpl.getMemorySpace());
}

/// Creates an unaligned load with the following semantics:
///   1. TODO(ntv): apply a `srcMap` to a `srcIndex` to represent a `srcMemRef`
///      slice + permutations for loading from non-fastest varying dimensions.
///      Note that, in general, the fastest varying dimension should be part of
///      the map, otherwise global layout changes are likely needed to obtain
///      an efficient load. This is an orthogonal cost model consideration;
///   2. load from the `srcMemRef` resulting from 1.;
///   3. store into a `dstMemRef` starting at offset `dstIndex`;
///   4. copy sizeof(dstMemRef) bytes with adjustments for boundaries;
///   5. TODO(ntv): broadcast along `broadcastMap` inside the `dstMemRef` to
///      support patterns like scalar-to-vector and N-k dim MemRef slice.
/// The copy may overflow on the src side but not on the dst side. If the copy
/// overflows on the src side, the `dstMemRef` will be padded with enough
/// values to fill it completely.
///
/// Usage:
/// This n_d_unaligned_load op will be implemented as a PseudoOp for different
/// backends. In its current form it is only used to load into a <1xvector>,
/// where the vector may have any shape that is some multiple of the
/// hardware-specific vector size used to implement the PseudoOp efficiently.
/// This is used to implement "non-effecting padding" for early vectorization
/// and allows higher-level passes in the codegen to not worry about
/// hardware-specific implementation details.
///
/// TODO(ntv):
///   1. implement this end-to-end for some backend;
///   2. support operation-specific padding values to properly implement
///      "non-effecting padding";
///   3. support input map for on-the-fly transpositions (point 1 above);
///   4. support broadcast map (point 5 above).
///
/// TODO(andydavis,bondhugula,ntv):
///   1. generalize to support padding semantics and offsets within vector
///      type.
static void createUnalignedLoad(MLFuncBuilder *b, Location loc,
                                SSAValue *srcMemRef,
                                ArrayRef<SSAValue *> srcIndices,
                                SSAValue *dstMemRef,
                                ArrayRef<SSAValue *> dstIndices) {
  SmallVector<SSAValue *, 8> operands;
  operands.reserve(1 + srcIndices.size() + 1 + dstIndices.size());
  operands.insert(operands.end(), srcMemRef);
  operands.insert(operands.end(), srcIndices.begin(), srcIndices.end());
  operands.insert(operands.end(), dstMemRef);
  operands.insert(operands.end(), dstIndices.begin(), dstIndices.end());
  using functional::map;
  std::function<Type(SSAValue *)> getType = [](SSAValue *v) -> Type {
    return v->getType();
  };
  auto types = map(getType, operands);
  OperationState opState(b->getContext(), loc, "n_d_unaligned_load", operands,
                         types);
  b->createOperation(opState);
}

/// Creates an unaligned store with the following semantics:
///   1. TODO(ntv): apply a `srcMap` to a `srcIndex` to represent a `srcMemRef`
///      slice to support patterns like vector-to-scalar and N-k dim MemRef
///      slice. This is used as the counterpart to the broadcast map in the
///      UnalignedLoad;
///   2. load from the `srcMemRef` resulting from 1.;
///   3. store into a `dstMemRef` starting at offset `dstIndex`;
///   4. TODO(ntv): apply a `dstMap` to a `dstIndex` to represent a `dstMemRef`
///      slice + permutations for storing into non-fastest varying dimensions.
///      Note that, in general, the fastest varying dimension should be part of
///      the map, otherwise global layout changes are likely needed to obtain
///      an efficient store. This is an orthogonal cost model consideration;
///   5. copy sizeof(srcMemRef) bytes with adjustments for boundaries.
/// The copy may overflow on the dst side but not on the src side. If the copy
/// overflows on the dst side, the underlying implementation needs to resolve
/// potential races.
///
/// Usage:
/// This n_d_unaligned_store op will be implemented as a PseudoOp for
/// different backends. In its current form it is only used to store from a
/// <1xvector>, where the vector may have any shape that is some multiple of
/// the hardware-specific vector size used to implement the PseudoOp
/// efficiently. This is used to implement "non-effecting padding" for early
/// vectorization and allows higher-level passes in the codegen to not worry
/// about hardware-specific implementation details.
///
/// TODO(ntv):
///   1. implement this end-to-end for some backend;
///   2. support write-back in the presence of races;
///   3. support input map for the counterpart of broadcast (point 1 above);
///   4. support dstMap for writing back in non-contiguous memory regions
///      (point 4 above).
static void createUnalignedStore(MLFuncBuilder *b, Location loc,
                                 SSAValue *srcMemRef,
                                 ArrayRef<SSAValue *> srcIndices,
                                 SSAValue *dstMemRef,
                                 ArrayRef<SSAValue *> dstIndices) {
  SmallVector<SSAValue *, 8> operands;
  operands.reserve(1 + srcIndices.size() + 1 + dstIndices.size());
  operands.insert(operands.end(), srcMemRef);
  operands.insert(operands.end(), srcIndices.begin(), srcIndices.end());
  operands.insert(operands.end(), dstMemRef);
  operands.insert(operands.end(), dstIndices.begin(), dstIndices.end());
  using functional::map;
  std::function<Type(SSAValue *)> getType = [](SSAValue *v) -> Type {
    return v->getType();
  };
  auto types = map(getType, operands);
  OperationState opState(b->getContext(), loc, "n_d_unaligned_store", operands,
                         types);
  b->createOperation(opState);
}

/// The current implementation of vectorization materializes an AllocOp of
/// MemRef<1 x vector_type> + a custom unaligned load/store pseudoop.
/// The vector load/store accessing this MemRef always accesses element 0, so
/// we just memoize a single 0 SSAValue, once upon function entry, to avoid
/// clutter. We create one such SSAValue per function.
static SSAValue *insertZeroIndex(MLFunction *f) {
  static thread_local DenseMap<MLFunction *, SSAValue *> zeros;
  if (zeros.find(f) == zeros.end()) {
    MLFuncBuilder b(f);
    auto zero = b.create<ConstantIndexOp>(b.getUnknownLoc(), 0);
    zeros.insert(std::make_pair(f, zero->getResult()));
  }
  return zeros.lookup(f);
}

/// Unwraps a pointer type to another type (possibly the same).
/// Used in particular to allow easier compositions of llvm::iterator_range
/// types.
template <typename T, typename ToType = T>
static std::function<ToType *(T *)> unwrapPtr() {
  return [](T *val) { return dyn_cast<ToType>(val); };
}

/// Materializes the n-D vector into an unpromoted temporary storage and
/// explicitly copies into it. Materialization occurs in a MemRef containing 1
/// vector that lives in the same memory space as the base MemRef. Later passes
/// should make the decision to promote this materialization to a faster
/// address space.
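///
/// For illustration only (a rough sketch, not normative: the exact operand
/// order is whatever createUnalignedLoad / createUnalignedStore above emit),
/// materializing a 1-D virtual vector of size 128 for a scalar load such as:
///   %v = load %A[%i] : memref<?xf32>
/// is expected to produce something resembling:
///   %a = alloc() : memref<1xvector<128xf32>>
///   "n_d_unaligned_load"(%A, %i, %a, %c0)
/// where %c0 is the per-function zero index memoized by insertZeroIndex; the
/// store case is symmetric, copying from the temporary back into %A.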
template <typename LoadOrStoreOpPointer>
static MLValue *materializeVector(MLValue *iv, LoadOrStoreOpPointer memoryOp,
                                  ArrayRef<int> vectorSize) {
  auto memRefType =
      memoryOp->getMemRef()->getType().template cast<MemRefType>();
  auto vectorMemRefType = getVectorizedMemRefType(memRefType, vectorSize);

  // Materialize a MemRef with 1 vector.
  auto *opStmt = cast<OperationStmt>(memoryOp->getOperation());
  MLFuncBuilder b(opStmt);
  // Create an AllocOp to apply the new shape.
  auto allocOp = b.create<AllocOp>(opStmt->getLoc(), vectorMemRefType,
                                   ArrayRef<SSAValue *>{});
  auto *allocMemRef = memoryOp->getMemRef();
  using namespace functional;
  if (opStmt->template isa<LoadOp>()) {
    createUnalignedLoad(&b, opStmt->getLoc(), allocMemRef,
                        map(unwrapPtr<SSAValue>(), memoryOp->getIndices()),
                        allocOp->getResult(),
                        {insertZeroIndex(iv->getFunction())});
  } else {
    createUnalignedStore(&b, opStmt->getLoc(), allocOp->getResult(),
                         {insertZeroIndex(iv->getFunction())}, allocMemRef,
                         map(unwrapPtr<SSAValue>(), memoryOp->getIndices()));
  }
  return cast<MLValue>(allocOp->getResult());
}

/// end TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. ///

namespace {

struct VectorizationState {
  // `vectorizedByThisPattern` keeps track of statements that have already been
  // vectorized by this pattern. This allows distinguishing between the
  // original statements that still need to be vectorized and the new
  // statements introduced while applying this pattern.
  DenseSet<const OperationStmt *> vectorizedByThisPattern;
  DenseSet<ForStmt *> vectorized;
  const Strategy *strategy;
};

} // end anonymous namespace

/// Terminal template function for creating a LoadOp.
static OpPointer<LoadOp> createLoad(MLFuncBuilder *b, Location loc,
                                    MLValue *memRef) {
  return b->create<LoadOp>(
      loc, memRef,
      ArrayRef<SSAValue *>{insertZeroIndex(memRef->getFunction())});
}

/// Terminal template function for creating a StoreOp.
static OpPointer<StoreOp> createStore(MLFuncBuilder *b, Location loc,
                                      MLValue *memRef,
                                      OpPointer<StoreOp> store) {
  return b->create<StoreOp>(
      loc, store->getValueToStore(), memRef,
      ArrayRef<SSAValue *>{insertZeroIndex(memRef->getFunction())});
}

/// Vectorizes the `memoryOp` of type LoadOp or StoreOp along loop `iv` by
/// factor `vectorSize`.
/// In a first implementation, this triggers materialization of a vector Alloc.
// TODO(ntv): this could be a view that changes the underlying element type.
// Materialization of this view may or may not happen before legalization.
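//
// As an illustrative sketch only (mirroring the strawman at the top of this
// file), once materializeVector has introduced the temporary and the unaligned
// copy, the original scalar access
//   %v = load %A[%i] : memref<?xf32>
// is replaced by a load of element 0 of the temporary:
//   %v = load %a[%c0] : memref<1xvector<128xf32>>
// and the original scalar load/store is then erased by vectorizeForStmt.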
template <typename LoadOrStoreOpPointer>
static bool vectorize(MLValue *iv, LoadOrStoreOpPointer memoryOp,
                      ArrayRef<int> vectorSize, VectorizationState *state) {
  auto *materializedMemRef = materializeVector(iv, memoryOp, vectorSize);
  auto *opStmt = cast<OperationStmt>(memoryOp->getOperation());
  MLFuncBuilder b(opStmt);
  Operation *resultOperation;
  if (auto load = opStmt->template dyn_cast<LoadOp>()) {
    auto res = createLoad(&b, opStmt->getLoc(), materializedMemRef);
    resultOperation = res->getOperation();
  } else {
    auto store = opStmt->template dyn_cast<StoreOp>();
    auto res = createStore(&b, opStmt->getLoc(), materializedMemRef, store);
    resultOperation = res->getOperation();
  }
  state->vectorizedByThisPattern.insert(cast<OperationStmt>(resultOperation));
  return false;
}

// result == true => failure. TODO(ntv): use a proper Status enum.
static bool vectorizeForStmt(ForStmt *loop, AffineMap upperBound,
                             ArrayRef<int> vectorSize, int64_t step,
                             VectorizationState *state) {
  LLVM_DEBUG(dbgs() << "[early-vect] vectorize loop ");
  LLVM_DEBUG(loop->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");
  using namespace functional;
  loop->setUpperBound(map(unwrapPtr<MLValue>(), loop->getUpperBoundOperands()),
                      upperBound);
  loop->setStep(step);

  FilterFunctionType notVectorizedThisRound = [state](const Statement &stmt) {
    if (!matcher::isLoadOrStore(stmt)) {
      return false;
    }
    return state->vectorizedByThisPattern.count(cast<OperationStmt>(&stmt)) ==
           0;
  };
  auto loadAndStores = matcher::Op(notVectorizedThisRound);
  auto matches = loadAndStores.match(loop);
  for (auto ls : matches) {
    auto *opStmt = cast<OperationStmt>(ls.first);
    auto load = opStmt->dyn_cast<LoadOp>();
    auto store = opStmt->dyn_cast<StoreOp>();
    LLVM_DEBUG(dbgs() << "[early-vect] vectorize op: ");
    LLVM_DEBUG(opStmt->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    bool vectorizationFails = load ? vectorize(loop, load, vectorSize, state)
                                   : vectorize(loop, store, vectorSize, state);
    LLVM_DEBUG(dbgs() << "fail: " << vectorizationFails << "\n");
    if (vectorizationFails) {
      // Early exit and trigger RAII cleanups at the root.
      return true;
    }
    // Erase the original op.
    opStmt->erase();
  }
  return false;
}

/// Returns a FilterFunctionType that can be used in MLFunctionMatcher to
/// match a loop whose underlying load/store accesses are all varying along the
/// `fastestVaryingMemRefDimension`.
/// TODO(ntv): In the future, allow more interesting mixed layout permutations
/// once we understand better the performance implications and we are confident
/// we can build a cost model and a search procedure.
static FilterFunctionType
isVectorizableLoopPtrFactory(unsigned fastestVaryingMemRefDimension) {
  return [fastestVaryingMemRefDimension](const Statement &forStmt) {
    const auto &loop = cast<ForStmt>(forStmt);
    return isVectorizableLoopAlongFastestVaryingMemRefDim(
        loop, fastestVaryingMemRefDimension);
  };
}

/// Forward-declaration.
static bool vectorizeNonRoot(MLFunctionMatches matches,
                             VectorizationState *state);

/// Applies vectorization of `loop` according to `state`. This is only
/// triggered if all vectorizations in `childrenMatches` have already succeeded
/// recursively in DFS post-order.
static bool doVectorize(ForStmt *loop, MLFunctionMatches childrenMatches,
                        VectorizationState *state) {
  // 1. DFS postorder recursion: if any of my children fails, I fail too.
  auto fail = vectorizeNonRoot(childrenMatches, state);
  if (fail) {
    // Early exit and trigger RAII cleanups at the root.
    return true;
  }

  // 2. This loop may have been omitted from vectorization for various reasons
  // (e.g. due to the performance model or pattern depth > vector size).
  assert(state->strategy->loopToVectorDim.count(loop));
  assert(state->strategy->loopToVectorDim.find(loop) !=
             state->strategy->loopToVectorDim.end() &&
         "Key not found");
  int vectorDim = state->strategy->loopToVectorDim.lookup(loop);
  if (vectorDim < 0) {
    return false;
  }

  // 3. Actual post-order transformation.
  assert(vectorDim < clVirtualVectorSize.size() && "vector dim overflow");
  //   a. get the actual vector size
  auto vectorSize = clVirtualVectorSize[vectorDim];
  //   b. loop transformation for early vectorization is still subject to
  //      exploratory tradeoffs (see top of the file).
  auto ubMap = loop->getUpperBoundMap();
  assert(ubMap.getRangeSizes().empty());
  //   c. apply coarsening, i.e.:
  //        | ub   -> ub + vectorSize - 1
  //        | step -> step * vectorSize
  std::function<AffineExpr(AffineExpr)> coarsenUb =
      [vectorSize](AffineExpr expr) { return expr + vectorSize - 1; };
  auto newUbs = functional::map(coarsenUb, ubMap.getResults());
  //   d. recurse
  return vectorizeForStmt(
      loop,
      AffineMap::get(ubMap.getNumDims(), ubMap.getNumSymbols(), newUbs, {}),
      clVirtualVectorSize, loop->getStep() * vectorSize, state);
}

/// Non-root pattern iterates over the matches at this level, calls doVectorize
/// and exits early if anything below fails.
static bool vectorizeNonRoot(MLFunctionMatches matches,
                             VectorizationState *state) {
  for (auto m : matches) {
    auto fail = doVectorize(cast<ForStmt>(m.first), m.second, state);
    if (fail) {
      // Early exit and trigger RAII cleanups at the root.
      return true;
    }
  }
  return false;
}

/// Sets up error handling for this root loop.
/// Vectorization is a recursive procedure where anything below can fail.
/// The root match thus needs to maintain a clone for handling failure.
/// Each root may succeed independently but will otherwise clean after itself
/// if anything below it fails.
static bool vectorizeRoot(MLFunctionMatches matches,
                          VectorizationState *state) {
  for (auto m : matches) {
    auto *loop = cast<ForStmt>(m.first);
    // Since patterns are recursive, they can very well intersect.
    // Since we do not want a fully greedy strategy in general, we decouple
    // pattern matching from profitability analysis and from application.
    // As a consequence we must check that each root pattern is still
    // vectorizable. If a pattern is not vectorizable anymore, we just skip it.
    // TODO(ntv): implement a non-greedy profitability analysis that keeps only
    // non-intersecting patterns.
    if (!isVectorizableLoop(*loop)) {
      continue;
    }
    DenseMap<const MLValue *, MLValue *> nomap;
    MLFuncBuilder builder(loop->getFunction());
    ForStmt *clonedLoop = cast<ForStmt>(builder.clone(*loop, nomap));
    auto fail = doVectorize(loop, m.second, state);
    if (!fail) {
      LLVM_DEBUG(dbgs() << "[early-vect] success vectorizing loop: ");
      LLVM_DEBUG(loop->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
    }
    fail ? loop->erase() : clonedLoop->erase();
  }
  return false;
}

/// Applies vectorization to the current MLFunction by searching over a bunch
/// of predetermined patterns.
PassResult Vectorize::runOnMLFunction(MLFunction *f) {
  // Build a zero at the entry of the function to avoid clutter in every single
  // vectorized loop.
  insertZeroIndex(f);

  for (auto pat : makePatterns()) {
    LLVM_DEBUG(dbgs() << "\n[early-vect] Input function is now:\n");
    LLVM_DEBUG(f->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n[early-vect] match:\n");
    auto matches = pat.match(f);
    Strategy strategy;
    auto fail = analyzeProfitability(matches, 0, pat.getDepth(), &strategy);
    assert(!fail);
    VectorizationState state;
    state.strategy = &strategy;
    fail = vectorizeRoot(matches, &state);
    assert(!fail);
  }
  return PassResult::Success;
}

FunctionPass *mlir::createVectorizePass() { return new Vectorize(); }

static PassRegistration<Vectorize>
    pass("vectorize",
         "Vectorize to a target-independent n-D vector abstraction");
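
// A usage sketch, assuming the standard mlir-opt driver (the pass and flag
// names are the ones registered above; the exact invocation is illustrative
// only):
//   mlir-opt -vectorize --virtual-vector-size=128 input.mlir
// and, to exercise a specific fastest-varying pattern with a 2-D virtual
// vector:
//   mlir-opt -vectorize --virtual-vector-size=32 --virtual-vector-size=256 \
//     --test-fastest-varying=1 --test-fastest-varying=0 input.mlir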