Trying to activate both LLVM and MLIR passes in mlir-cpu-runner exposed name collisions when registering pass names.
One way to disambiguate, which also works across dialects, is to prepend the dialect name to the passes that specifically operate on that dialect.
With this CL, mlir-cpu-runner tests still run when both LLVM and MLIR passes are registered.
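For illustration, the disambiguation amounts to registering dialect-prefixed pass names; a minimal sketch based on the registration at the bottom of this file (the pre-rename spelling in the comment is hypothetical):

```c++
// Before (hypothetical): a bare pass name that can collide with a pass
// registered by another dialect.
// static PassRegistration<MaterializeVectorsPass>
//     pass("materialize-vectors", ...);

// After: the pass name carries the dialect it operates on.
static PassRegistration<MaterializeVectorsPass>
    pass("affine-materialize-vectors",
         "Materializes super-vectors to vectors of the "
         "proper size for the hardware");
```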
--
PiperOrigin-RevId: 246539917
//===- MaterializeVectors.cpp - MaterializeVectors Pass Impl ----*- C++ -*-===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements target-dependent materialization of super-vectors to
// vectors of the proper size for the hardware.
//
//===----------------------------------------------------------------------===//
#include "mlir/AffineOps/AffineOps.h"
|
|
#include "mlir/Analysis/AffineAnalysis.h"
|
|
#include "mlir/Analysis/Dominance.h"
|
|
#include "mlir/Analysis/LoopAnalysis.h"
|
|
#include "mlir/Analysis/NestedMatcher.h"
|
|
#include "mlir/Analysis/SliceAnalysis.h"
|
|
#include "mlir/Analysis/Utils.h"
|
|
#include "mlir/Analysis/VectorAnalysis.h"
|
|
#include "mlir/IR/AffineExpr.h"
|
|
#include "mlir/IR/AffineMap.h"
|
|
#include "mlir/IR/Attributes.h"
|
|
#include "mlir/IR/Builders.h"
|
|
#include "mlir/IR/Location.h"
|
|
#include "mlir/IR/OperationSupport.h"
|
|
#include "mlir/IR/Types.h"
|
|
#include "mlir/Pass/Pass.h"
|
|
#include "mlir/StandardOps/Ops.h"
|
|
#include "mlir/Support/Functional.h"
|
|
#include "mlir/Support/LLVM.h"
|
|
#include "mlir/Transforms/Passes.h"
|
|
#include "mlir/VectorOps/VectorOps.h"
|
|
|
|
#include "llvm/Support/CommandLine.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|

///
/// Implements target-dependent materialization of virtual super-vectors to
/// vectors of the proper size for the hardware.
///
/// While the physical vector size is target-dependent, the pass is written in
/// a target-independent way: the target vector size is specified as a
/// parameter to the pass. This pass is thus a partial lowering that opens the
/// "greybox" that is the super-vector abstraction. In particular, this pass
/// can turn the vector.transfer_read and vector.transfer_write ops into
/// either:
///   1. a loop nest with scalar and/or vector load/store operations; or
///   2. a loop-nest with DmaStartOp / DmaWaitOp; or
///   3. a pre-existing blackbox library call that can be written manually or
///      synthesized using search and superoptimization.
/// An important feature that each of these 3 target lowering abstractions must
/// handle is the insertion of "non-effecting" padding with the proper neutral
/// element, in order to guarantee that all "partial tiles" are actually
/// "full tiles" in practice.
///
/// In particular, this pass is an MLIR-to-MLIR rewriting and does not concern
/// itself with target-specific instruction selection and register allocation.
/// These will happen downstream in LLVM.
///
/// In this sense, despite performing lowering to a target-dependent size, this
/// pass is still target-agnostic.
///
/// Implementation details
/// ======================
/// The current decisions made by the super-vectorization pass guarantee that
/// use-def chains do not escape an enclosing vectorized AffineForOp. In other
/// words, this pass operates on a scoped program slice. Furthermore, since we
/// do not vectorize in the presence of conditionals for now, sliced chains are
/// guaranteed not to escape the innermost scope, which has to be either the
/// top Function scope or the innermost loop scope, by construction. As a
/// consequence, the implementation just starts from vector.transfer_write
/// operations and builds the slice scoped at the innermost loop enclosing the
/// current vector.transfer_write. These assumptions and the implementation
/// details are subject to revision in the future.
///
/// Example
/// =======
/// In the following, the single vector.transfer_write op operates on a
/// vector<4x4x4xf32>. Let's assume the HW supports vector<4x4xf32>.
/// Materialization is achieved by instantiating each occurrence of the leading
/// dimension of vector<4x4x4xf32> into a vector<4x4xf32>.
/// The program transformation that implements this instantiation is a
/// multi-loop unroll-and-jam (it can be partial or full depending on the ratio
/// of super-vector shape to HW-vector shape).
///
/// As a simple case, the following:
///
/// ```mlir
/// mlfunc @materialize(%M : index, %N : index, %O : index, %P : index) {
///   %A = alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32>
///   %f1 = constant splat<vector<4x4x4xf32>, 1.000000e+00> :
///            vector<4x4x4xf32>
///   affine.for %i0 = 0 to %M step 4 {
///     affine.for %i1 = 0 to %N step 4 {
///       affine.for %i2 = 0 to %O {
///         affine.for %i3 = 0 to %P step 4 {
///           vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3]
///             {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} :
///             vector<4x4x4xf32>, memref<?x?x?x?xf32>
///   }}}}
///   return
/// }
/// ```
///
/// is instantiated by unroll-and-jam (just unroll in this case) into:
///
/// ```mlir
/// mlfunc @materialize(%M : index, %N : index, %O : index, %P : index) {
///   %A = alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
///   %f1 = constant splat<vector<4x4xf32>, 1.000000e+00> : vector<4x4xf32>
///   affine.for %i0 = 0 to %arg0 step 4 {
///     affine.for %i1 = 0 to %arg1 step 4 {
///       affine.for %i2 = 0 to %arg2 {
///         affine.for %i3 = 0 to %arg3 step 4 {
///           vector.transfer_write %f1, %0[%i0, %i1, %i2, %i3]
///             {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} :
///             vector<4x4xf32>, memref<?x?x?x?xf32>
///           %i3p1 = affine.apply (d0) -> (d0 + 1)(%i3)
///           vector.transfer_write {{.*}}, %0[%i0, %i1, %i2, %i3p1]
///             {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} :
///             vector<4x4xf32>, memref<?x?x?x?xf32>
///           %i3p2 = affine.apply (d0) -> (d0 + 2)(%i3)
///           vector.transfer_write {{.*}}, %0[%i0, %i1, %i2, %i3p2]
///             {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} :
///             vector<4x4xf32>, memref<?x?x?x?xf32>
///           %i3p3 = affine.apply (d0) -> (d0 + 3)(%i3)
///           vector.transfer_write {{.*}}, %0[%i0, %i1, %i2, %i3p3]
///             {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} :
///             vector<4x4xf32>, memref<?x?x?x?xf32>
///   }}}}
///   return
/// }
/// ```

using llvm::dbgs;
using llvm::DenseSet;
using llvm::SetVector;

using namespace mlir;

using functional::makePtrDynCaster;
using functional::map;

static llvm::cl::list<int>
    clVectorSize("vector-size",
                 llvm::cl::desc("Specify the HW vector size for vectorization"),
                 llvm::cl::ZeroOrMore);

#define DEBUG_TYPE "materialize-vect"

namespace {
struct MaterializationState {
  /// In practice, the determination of the HW-specific vector type to use
  /// when lowering a super-vector type must be based on the elemental type.
  /// The elemental type must be retrieved from the super-vector type. In the
  /// future, information about the hardware vector type for a particular
  /// elemental type will be part of the contract between MLIR and the backend.
  ///
  /// For example, 8xf32 has the same size as 16xf16 but the targeted HW itself
  /// may exhibit the following properties:
  ///   1. have a special unit for a 128xf16 datapath;
  ///   2. have no F16 FPU support on the regular 8xf32/16xf16 vector datapath.
  ///
  /// For now, we just assume hwVectorSize has the proper information
  /// regardless of the type and we assert everything is f32.
  /// TODO(ntv): relax the assumptions on admissible element type once a
  /// contract exists.
  MaterializationState(SmallVector<int64_t, 8> sizes) : hwVectorSize(sizes) {}

  SmallVector<int64_t, 8> hwVectorSize;
  VectorType superVectorType;
  VectorType hwVectorType;
  SmallVector<unsigned, 8> hwVectorInstance;
  DenseMap<Value *, Value *> *substitutionsMap;
};
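
// Illustrative note: for the header example, hwVectorSize = {4, 4} gives
// hwVectorType = vector<4x4xf32>; together with
// superVectorType = vector<4x4x4xf32>, the shape ratio is {4, 1, 1}, so the
// hwVectorInstance values enumerate {0, 0, 0} ... {3, 0, 0}.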

/// Base state for the vector materialization pass.
/// Command line arguments are preempted by non-empty pass arguments.
struct MaterializeVectorsPass : public FunctionPass<MaterializeVectorsPass> {
  MaterializeVectorsPass()
      : hwVectorSize(clVectorSize.begin(), clVectorSize.end()) {}
  MaterializeVectorsPass(ArrayRef<int64_t> hwVectorSize)
      : MaterializeVectorsPass() {
    if (!hwVectorSize.empty())
      this->hwVectorSize.assign(hwVectorSize.begin(), hwVectorSize.end());
  }

  SmallVector<int64_t, 8> hwVectorSize;
  void runOnFunction() override;
};

} // end anonymous namespace

/// Given a shape with sizes greater than 0 along all dimensions, returns the
/// distance, in number of elements, between a slice in a dimension and the
/// next slice in the same dimension.
///   e.g. shape[3, 4, 5] -> strides[20, 5, 1]
static SmallVector<unsigned, 8> makeStrides(ArrayRef<unsigned> shape) {
  SmallVector<unsigned, 8> tmp;
  tmp.reserve(shape.size());
  unsigned running = 1;
  for (auto rit = shape.rbegin(), reit = shape.rend(); rit != reit; ++rit) {
    assert(*rit > 0 && "size must be greater than 0 along all dimensions of "
                       "shape");
    tmp.push_back(running);
    running *= *rit;
  }
  return SmallVector<unsigned, 8>(tmp.rbegin(), tmp.rend());
}

/// Given a shape with sizes greater than 0 along all dimensions, returns the
/// delinearized components of linearIndex along shape.
static SmallVector<unsigned, 8> delinearize(unsigned linearIndex,
                                            ArrayRef<unsigned> shape) {
  SmallVector<unsigned, 8> res;
  res.reserve(shape.size());
  auto strides = makeStrides(shape);
  for (unsigned idx = 0; idx < strides.size(); ++idx) {
    assert(strides[idx] > 0);
    auto val = linearIndex / strides[idx];
    res.push_back(val);
    assert(val < shape[idx] && "delinearization is out of bounds");
    linearIndex %= strides[idx];
  }
  // Sanity check.
  assert(linearIndex == 0 && "linear index constructed from shape must "
                             "have 0 remainder after delinearization");
  return res;
}
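
// A small worked example of the two helpers above (illustrative values only):
//   makeStrides({3, 4, 5})     == {20, 5, 1}
//   delinearize(23, {3, 4, 5}) == {1, 0, 3}   // 23 == 1 * 20 + 0 * 5 + 3 * 1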

static Operation *instantiate(FuncBuilder *b, Operation *opInst,
                              VectorType hwVectorType,
                              DenseMap<Value *, Value *> *substitutionsMap);

/// Not all Values belong to a program slice scoped within the immediately
/// enclosing loop.
/// One simple example is constants defined outside the innermost loop scope.
/// For such cases the substitutionsMap has no entry and we allow an additional
/// insertion.
/// For now, this is limited to ConstantOp because we do not vectorize loop
/// indices; this will need to be extended in the future.
///
/// If substitution fails, returns nullptr.
static Value *substitute(Value *v, VectorType hwVectorType,
                         DenseMap<Value *, Value *> *substitutionsMap) {
  auto it = substitutionsMap->find(v);
  if (it == substitutionsMap->end()) {
    auto *opInst = v->getDefiningOp();
    if (opInst->isa<ConstantOp>()) {
      FuncBuilder b(opInst);
      auto *op = instantiate(&b, opInst, hwVectorType, substitutionsMap);
      auto res = substitutionsMap->insert(std::make_pair(v, op->getResult(0)));
      assert(res.second && "Insertion failed");
      return res.first->second;
    }
    v->getDefiningOp()->emitError("Missing substitution");
    return nullptr;
  }
  return it->second;
}

/// Returns a list of single result AffineApplyOps that reindex the
/// `memrefIndices` by the multi-dimensional `hwVectorInstance`. This is used
/// by the function that materializes a vector.transfer operation to use
/// hardware vector types instead of super-vector types.
///
/// The general problem this function solves is as follows:
/// Assume a vector.transfer operation at the super-vector granularity that has
/// `l` enclosing loops (AffineForOp). Assume the vector transfer operation
/// operates on a MemRef of rank `r`, a super-vector of rank `s` and a hardware
/// vector of rank `h`. For the purpose of illustration assume l==4, r==3,
/// s==2, h==1 and that the super-vector is vector<3x32xf32> and the hardware
/// vector is vector<8xf32>. Assume the following MLIR snippet after
/// super-vectorization has been applied:
///
/// ```mlir
/// affine.for %i0 = 0 to %M {
///   affine.for %i1 = 0 to %N step 3 {
///     affine.for %i2 = 0 to %O {
///       affine.for %i3 = 0 to %P step 32 {
///         %r = vector.transfer_read(%A, map0(%i..), map1(%i..), map2(%i..)) :
///              vector<3x32xf32>, memref<?x?x?xf32>
///         ...
/// }}}}
/// ```
///
/// where map denotes an AffineMap operating on enclosing loops with properties
/// compatible for vectorization (i.e. some contiguity left unspecified here).
/// Note that the vectorized loops are %i1 and %i3.
/// This function translates the vector.transfer_read operation into multiple
/// instances of vector.transfer_read that operate on vector<8xf32>.
///
/// Without loss of generality, we assume hwVectorInstance is: {2, 1}.
/// The only constraint on hwVectorInstance is that it belongs to:
///   [0, 2] x [0, 3], which is the span of the ratio of super-vector shape to
///   hardware vector shape in our example.
///
/// This function instantiates the iteration <2, 1> of vector.transfer_read
/// into the set of operations in pseudo-MLIR:
///
/// ```mlir
/// #map2 = (d0, d1, d2, d3) -> (d0, d1 + 2, d2, d3 + 1 * 8)
/// #map3 = #map o #map2 // where o denotes composition
/// %aff0 = affine.apply #map3.0(%i..)
/// %aff1 = affine.apply #map3.1(%i..)
/// %aff2 = affine.apply #map3.2(%i..)
/// %r = vector.transfer_read(%A, %aff0, %aff1, %aff2) :
///      vector<8xf32>, memref<?x?x?xf32>
/// ```
///
/// Practical considerations
/// ========================
/// For now, `map` is assumed to be the identity map and the indices are
/// specified just as vector.transfer_read %A[%i0, %i1, %i2, %i3]. This will be
/// extended in the future once we have a proper Op for vector transfers.
/// Additionally, the example above is specified in pseudo-MLIR form; once we
/// have proper support for generic maps we can generate the code and show
/// actual MLIR.
///
/// TODO(ntv): support a concrete AffineMap and compose with it.
/// TODO(ntv): these implementation details should be captured in a
/// vectorization trait at the op level directly.
static SmallVector<mlir::Value *, 8>
reindexAffineIndices(FuncBuilder *b, VectorType hwVectorType,
                     ArrayRef<unsigned> hwVectorInstance,
                     ArrayRef<Value *> memrefIndices) {
  auto vectorShape = hwVectorType.getShape();
  assert(hwVectorInstance.size() >= vectorShape.size());

  unsigned numIndices = memrefIndices.size();
  auto numMemRefIndices = numIndices - hwVectorInstance.size();
  auto numVectorIndices = hwVectorInstance.size() - vectorShape.size();

  SmallVector<AffineExpr, 8> affineExprs;
  // TODO(ntv): support a concrete map and composition.
  unsigned i = 0;
  // The first numMemRefIndices correspond to AffineForOps that have not been
  // vectorized; the transformation is the identity on those.
  for (i = 0; i < numMemRefIndices; ++i) {
    auto d_i = b->getAffineDimExpr(i);
    affineExprs.push_back(d_i);
  }
  // The next numVectorIndices correspond to super-vector dimensions that
  // do not have a hardware vector dimension counterpart. For those we only
  // need to increment the index by the corresponding hwVectorInstance.
  for (i = numMemRefIndices; i < numMemRefIndices + numVectorIndices; ++i) {
    auto d_i = b->getAffineDimExpr(i);
    auto offset = hwVectorInstance[i - numMemRefIndices];
    affineExprs.push_back(d_i + offset);
  }
  // The remaining indices correspond to super-vector dimensions that
  // have a hardware vector dimension counterpart. For those we increment the
  // index by "hwVectorInstance" multiples of the corresponding hardware
  // vector size.
  for (; i < numIndices; ++i) {
    auto d_i = b->getAffineDimExpr(i);
    auto offset = hwVectorInstance[i - numMemRefIndices];
    auto stride = vectorShape[i - numMemRefIndices - numVectorIndices];
    affineExprs.push_back(d_i + offset * stride);
  }

  // Create a bunch of single result AffineApplyOps.
  SmallVector<mlir::Value *, 8> res;
  res.reserve(affineExprs.size());
  for (auto expr : affineExprs) {
    auto map = AffineMap::get(numIndices, 0, expr, {});
    res.push_back(makeComposedAffineApply(b, b->getInsertionPoint()->getLoc(),
                                          map, memrefIndices));
  }
  return res;
}
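
// Worked example for reindexAffineIndices (hypothetical values): with
// memrefIndices = {%i0, %i1, %i2}, hwVectorInstance = {2, 1} and a hardware
// vector shape of {8}, we get numMemRefIndices == 1 and numVectorIndices == 1,
// so the function emits single-result affine.apply ops for (d0), (d1 + 2) and
// (d2 + 1 * 8), each applied to (%i0, %i1, %i2).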

/// Returns attributes with the following substitutions applied:
///   - constant splat is replaced by constant splat of `hwVectorType`.
/// TODO(ntv): add more substitutions on a per-need basis.
static SmallVector<NamedAttribute, 1>
materializeAttributes(Operation *opInst, VectorType hwVectorType) {
  SmallVector<NamedAttribute, 1> res;
  for (auto a : opInst->getAttrs()) {
    if (auto splat = a.second.dyn_cast<SplatElementsAttr>()) {
      auto attr = SplatElementsAttr::get(hwVectorType, splat.getValue());
      res.push_back(NamedAttribute(a.first, attr));
    } else {
      res.push_back(a);
    }
  }
  return res;
}

/// Creates an instantiated version of `opInst`.
/// Ops other than VectorTransferReadOp/VectorTransferWriteOp require no
/// affine reindexing. Just substitute their Value operands and be done. For
/// this case the actual instance is irrelevant. Just use the values in
/// substitutionsMap.
///
/// If the underlying substitution fails, this fails too and returns nullptr.
static Operation *instantiate(FuncBuilder *b, Operation *opInst,
                              VectorType hwVectorType,
                              DenseMap<Value *, Value *> *substitutionsMap) {
  assert(!opInst->isa<VectorTransferReadOp>() &&
         "Should call the function specialized for VectorTransferReadOp");
  assert(!opInst->isa<VectorTransferWriteOp>() &&
         "Should call the function specialized for VectorTransferWriteOp");
  if (opInst->getNumRegions() != 0)
    return nullptr;

  bool fail = false;
  auto operands = map(
      [hwVectorType, substitutionsMap, &fail](Value *v) -> Value * {
        auto *res =
            fail ? nullptr : substitute(v, hwVectorType, substitutionsMap);
        fail |= !res;
        return res;
      },
      opInst->getOperands());
  if (fail)
    return nullptr;

  auto attrs = materializeAttributes(opInst, hwVectorType);

  OperationState state(b->getContext(), opInst->getLoc(),
                       opInst->getName().getStringRef(), operands,
                       {hwVectorType}, attrs);
  return b->createOperation(state);
}

/// Computes the permutationMap required for a VectorTransferOp from the memref
/// to the `hwVectorType`.
/// This is achieved by returning the projection of the permutationMap along
/// the dimensions of the super-vector type that remain in the hwVectorType.
/// In particular, if a dimension is fully instantiated (i.e. unrolled) then it
/// is projected out in the final result.
template <typename VectorTransferOpTy>
static AffineMap projectedPermutationMap(VectorTransferOpTy transfer,
                                         VectorType hwVectorType) {
  static_assert(
      std::is_same<VectorTransferOpTy, VectorTransferReadOp>::value ||
          std::is_same<VectorTransferOpTy, VectorTransferWriteOp>::value,
      "Must be called on a VectorTransferOp");
  auto superVectorType = transfer.getVectorType();
  auto optionalRatio = shapeRatio(superVectorType, hwVectorType);
  assert(optionalRatio &&
         (optionalRatio->size() == superVectorType.getShape().size()) &&
         "Shape and ratio not of the same size");
  unsigned dim = 0;
  SmallVector<AffineExpr, 4> keep;
  MLIRContext *context = transfer.getContext();
  functional::zipApply(
      [&dim, &keep, context](int64_t shape, int64_t ratio) {
        assert(shape >= ratio && "shape dim must be greater than ratio dim");
        if (shape != ratio) {
          // The HW vector is not fully instantiated along this dim, keep it.
          keep.push_back(getAffineDimExpr(dim, context));
        }
        ++dim;
      },
      superVectorType.getShape(), *optionalRatio);
  auto permutationMap = transfer.getPermutationMap();
  LLVM_DEBUG(permutationMap.print(dbgs() << "\npermutationMap: "));
  if (keep.empty()) {
    return permutationMap;
  }
  auto projectionMap = AffineMap::get(optionalRatio->size(), 0, keep, {});
  LLVM_DEBUG(projectionMap.print(dbgs() << "\nprojectionMap: "));
  return simplifyAffineMap(projectionMap.compose(permutationMap));
}
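
// Illustrative note: continuing the vector<3x32xf32> / vector<8xf32> example
// from reindexAffineIndices, shapeRatio yields {3, 4}; dimension 0 is fully
// unrolled (shape == ratio == 3) and is projected out, while dimension 1 is
// kept, so projectionMap is (d0, d1) -> (d1) and gets composed with the
// transfer's permutation map.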

/// Creates an instantiated version of `read` for the instance of
/// `hwVectorInstance` when lowering from a super-vector type to
/// `hwVectorType`. `hwVectorInstance` represents one particular instance of
/// `hwVectorType` in the covering of the super-vector type. For a more
/// detailed description of the problem, see the description of
/// reindexAffineIndices.
static Operation *instantiate(FuncBuilder *b, VectorTransferReadOp read,
                              VectorType hwVectorType,
                              ArrayRef<unsigned> hwVectorInstance,
                              DenseMap<Value *, Value *> *substitutionsMap) {
  SmallVector<Value *, 8> indices =
      map(makePtrDynCaster<Value>(), read.getIndices());
  auto affineIndices =
      reindexAffineIndices(b, hwVectorType, hwVectorInstance, indices);
  auto map = projectedPermutationMap(read, hwVectorType);
  if (!map) {
    return nullptr;
  }
  auto cloned = b->create<VectorTransferReadOp>(read.getLoc(), hwVectorType,
                                                read.getMemRef(), affineIndices,
                                                map, read.getPaddingValue());
  return cloned.getOperation();
}

/// Creates an instantiated version of `write` for the instance of
/// `hwVectorInstance` when lowering from a super-vector type to
/// `hwVectorType`. `hwVectorInstance` represents one particular instance of
/// `hwVectorType` in the covering of the super-vector type. For a more
/// detailed description of the problem, see the description of
/// reindexAffineIndices.
static Operation *instantiate(FuncBuilder *b, VectorTransferWriteOp write,
                              VectorType hwVectorType,
                              ArrayRef<unsigned> hwVectorInstance,
                              DenseMap<Value *, Value *> *substitutionsMap) {
  SmallVector<Value *, 8> indices =
      map(makePtrDynCaster<Value>(), write.getIndices());
  auto affineIndices =
      reindexAffineIndices(b, hwVectorType, hwVectorInstance, indices);
  auto cloned = b->create<VectorTransferWriteOp>(
      write.getLoc(),
      substitute(write.getVector(), hwVectorType, substitutionsMap),
      write.getMemRef(), affineIndices,
      projectedPermutationMap(write, hwVectorType));
  return cloned.getOperation();
}

/// Returns `false` if the op instance is properly cloned and inserted, `true`
/// otherwise.
/// The multi-dimensional `hwVectorInstance` belongs to the shapeRatio of
/// super-vector type to hw vector type.
/// A cloned instance of `op` is formed as follows:
///   1. vector.transfer_read: the return `superVectorType` is replaced by
///      `hwVectorType`. Additionally, affine indices are reindexed with
///      `reindexAffineIndices` using `hwVectorInstance` and vector type
///      information;
///   2. vector.transfer_write: the `valueToStore` type is simply substituted.
///      Since we operate on a topologically sorted slice, a substitution must
///      have been registered for non-constant ops. Additionally, affine
///      indices are reindexed in the same way as for vector.transfer_read;
///   3. constant ops are splats of the super-vector type by construction.
///      They are cloned to a splat on the hw vector type with the same value;
///   4. remaining ops are cloned to a version of the op that returns a hw
///      vector type, with all operands substituted according to
///      `substitutionsMap`. Thanks to the topological order of a slice, the
///      substitution is always possible.
///
/// Returns true on failure.
static bool instantiateMaterialization(Operation *op,
                                       MaterializationState *state) {
  LLVM_DEBUG(dbgs() << "\ninstantiate: " << *op);

  // Create a builder here for unroll-and-jam effects.
  FuncBuilder b(op);
  // AffineApplyOps are ignored: instantiating the proper vector op will take
  // care of AffineApplyOps by composing them properly.
  if (op->isa<AffineApplyOp>()) {
    return false;
  }
  if (op->getNumRegions() != 0)
    return op->emitError("NYI path Op with region"), true;

  if (auto write = op->dyn_cast<VectorTransferWriteOp>()) {
    auto *clone = instantiate(&b, write, state->hwVectorType,
                              state->hwVectorInstance, state->substitutionsMap);
    return clone == nullptr;
  }
  if (auto read = op->dyn_cast<VectorTransferReadOp>()) {
    auto *clone = instantiate(&b, read, state->hwVectorType,
                              state->hwVectorInstance, state->substitutionsMap);
    if (!clone) {
      return true;
    }
    state->substitutionsMap->insert(
        std::make_pair(read.getResult(), clone->getResult(0)));
    return false;
  }
  // The only ops with 0 results reaching this point must, by construction, be
  // VectorTransferWriteOps and have been caught above. Ops with >= 2 results
  // are not yet supported. So just support 1 result.
  if (op->getNumResults() != 1) {
    return op->emitError("NYI: ops with != 1 results"), true;
  }
  if (op->getResult(0)->getType() != state->superVectorType) {
    return op->emitError("Op does not return a supervector."), true;
  }
  auto *clone =
      instantiate(&b, op, state->hwVectorType, state->substitutionsMap);
  if (!clone) {
    return true;
  }
  state->substitutionsMap->insert(
      std::make_pair(op->getResult(0), clone->getResult(0)));
  return false;
}

/// Takes a slice and rewrites the operations in it so that occurrences
/// of `superVectorType` are replaced by `hwVectorType`.
///
/// Implementation
/// ==============
///   1. computes the shape ratio of super-vector to HW vector shapes. This
///      gives, for each op in the slice, how many instantiations are required
///      in each dimension;
///   2. performs the concrete materialization. Note that in a first
///      implementation we use full unrolling because it pragmatically removes
///      the need to explicitly materialize an AllocOp. Thanks to the
///      properties of super-vectors, this unrolling is always possible and
///      simple: vectorizing to a super-vector abstraction already achieved
///      the equivalent of loop strip-mining + loop sinking and encoded this in
///      the vector type.
///
/// Returns true on failure.
///
/// TODO(ntv): materialized allocs.
/// TODO(ntv): full loops + materialized allocs.
/// TODO(ntv): partial unrolling + materialized allocs.
static bool emitSlice(MaterializationState *state,
                      SetVector<Operation *> *slice) {
  auto ratio = shapeRatio(state->superVectorType, state->hwVectorType);
  assert(ratio.hasValue() &&
         "ratio of super-vector to HW-vector shape is not integral");
  // The number of integer points in a hyperrectangular region is:
  //   shape[0] * strides[0].
  auto numValueToUnroll = (*ratio)[0] * makeStrides(*ratio)[0];
  // Full unrolling to hardware vectors in a first approximation.
  for (unsigned idx = 0; idx < numValueToUnroll; ++idx) {
    // Fresh RAII instanceIndices and substitutionsMap.
    MaterializationState scopedState = *state;
    scopedState.hwVectorInstance = delinearize(idx, *ratio);
    DenseMap<Value *, Value *> substitutionMap;
    scopedState.substitutionsMap = &substitutionMap;
    // The slice is topologically sorted; we can just clone the ops in order.
    for (auto *op : *slice) {
      auto fail = instantiateMaterialization(op, &scopedState);
      if (fail) {
        op->emitError("Unhandled super-vector materialization failure");
        return true;
      }
    }
  }

  LLVM_DEBUG(dbgs() << "\nMLFunction is now\n");
  LLVM_DEBUG((*slice)[0]->getFunction()->print(dbgs()));

  // The slice is topologically sorted; we can just erase the ops in reverse
  // order. A reverse iterator does not work simply with an operator*
  // dereference here, hence the index-based loop.
  for (int idx = slice->size() - 1; idx >= 0; --idx) {
    LLVM_DEBUG(dbgs() << "\nErase: ");
    LLVM_DEBUG((*slice)[idx]->print(dbgs()));
    (*slice)[idx]->erase();
  }
  return false;
}
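
// Illustrative note: with the vector<3x32xf32> / vector<8xf32> example used
// above, the ratio is {3, 4} and makeStrides({3, 4}) == {4, 1}, so emitSlice
// unrolls 3 * 4 == 12 instances whose hwVectorInstance values delinearize to
// {0, 0}, {0, 1}, ..., {2, 3}.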

/// Materializes super-vector types into concrete hw vector types as follows:
///   1. start from super-vector terminators (currently vector.transfer_write
///      ops);
///   2. collect all the operations that can be reached by transitive use-def
///      chains;
///   3. get the superVectorType for this particular terminator and the
///      corresponding hardware vector type (for now limited to F32);
///      TODO(ntv): be more general than F32.
///   4. emit the transitive useDef set to operate on the finer-grain vector
///      types.
///
/// Notes
/// =====
/// The `slice` is sorted in topological order by construction.
/// Additionally, this set is limited to operations in the same lexical scope
/// because we currently disallow vectorization of defs that come from another
/// scope.
/// TODO(ntv): please document return value.
static bool materialize(Function *f, const SetVector<Operation *> &terminators,
                        MaterializationState *state) {
  DenseSet<Operation *> seen;
  DominanceInfo domInfo(f);
  for (auto *term : terminators) {
    // Short-circuit test: a given terminator may have been reached by some
    // other previous transitive use-def chains.
    if (seen.count(term) > 0) {
      continue;
    }

    auto terminator = term->cast<VectorTransferWriteOp>();
    LLVM_DEBUG(dbgs() << "\nFrom terminator:" << *term);

    // Get the transitive use-defs starting from terminator, limited to the
    // current enclosing scope of the terminator. See the Notes at the top of
    // this function for the justification of this restriction.
    // TODO(ntv): relax scoping constraints.
    auto *enclosingScope = term->getParentOp();
    auto keepIfInSameScope = [enclosingScope, &domInfo](Operation *op) {
      assert(op && "NULL op");
      if (!enclosingScope) {
        // By construction, everything is always under the top scope (null
        // scope).
        return true;
      }
      return domInfo.properlyDominates(enclosingScope, op);
    };
    SetVector<Operation *> slice =
        getSlice(term, keepIfInSameScope, keepIfInSameScope);
    assert(!slice.empty());

    // Sanity checks: the transitive slice must be completely disjoint from
    // what we have seen so far.
    LLVM_DEBUG(dbgs() << "\nTransitive use-defs:");
    for (auto *ud : slice) {
      LLVM_DEBUG(dbgs() << "\nud:" << *ud);
      assert(seen.count(ud) == 0 &&
             "Transitive use-defs not disjoint from already seen");
      seen.insert(ud);
    }

    // Emit the current slice.
    // Set scoped super-vector and corresponding hw vector types.
    state->superVectorType = terminator.getVectorType();
    assert((state->superVectorType.getElementType() ==
            FloatType::getF32(term->getContext())) &&
           "Only f32 supported for now");
    state->hwVectorType = VectorType::get(
        state->hwVectorSize, state->superVectorType.getElementType());
    auto fail = emitSlice(state, &slice);
    if (fail) {
      return true;
    }
    LLVM_DEBUG(dbgs() << "\nMLFunction is now\n");
    LLVM_DEBUG(f->print(dbgs()));
  }
  return false;
}

void MaterializeVectorsPass::runOnFunction() {
  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
  NestedPatternContext mlContext;

  // TODO(ntv): Check to see if this supports arbitrary top-level code.
  Function *f = &getFunction();
  if (f->getBlocks().size() != 1)
    return;

  using matcher::Op;
  LLVM_DEBUG(dbgs() << "\nMaterializeVectors on Function\n");
  LLVM_DEBUG(f->print(dbgs()));

  MaterializationState state(hwVectorSize);
  // Get the hardware vector type.
  // TODO(ntv): get elemental type from super-vector type rather than force
  // f32.
  auto subVectorType =
      VectorType::get(hwVectorSize, FloatType::getF32(&getContext()));

  // Capture terminators; i.e. vector.transfer_write ops involving a strict
  // super-vector of subVectorType.
  auto filter = [subVectorType](Operation &op) {
    if (!op.isa<VectorTransferWriteOp>()) {
      return false;
    }
    return matcher::operatesOnSuperVectorsOf(op, subVectorType);
  };
  auto pat = Op(filter);
  SmallVector<NestedMatch, 8> matches;
  pat.match(f, &matches);
  SetVector<Operation *> terminators;
  for (auto m : matches) {
    terminators.insert(m.getMatchedOperation());
  }

  if (materialize(f, terminators, &state))
    signalPassFailure();
}

FunctionPassBase *
mlir::createMaterializeVectorsPass(llvm::ArrayRef<int64_t> vectorSize) {
  return new MaterializeVectorsPass(vectorSize);
}

static PassRegistration<MaterializeVectorsPass>
    pass("affine-materialize-vectors",
         "Materializes super-vectors to vectors of the "
         "proper size for the hardware");

#undef DEBUG_TYPE
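
// Usage sketch (illustrative): createMaterializeVectorsPass({4, 4}) forces a
// 4x4 hardware vector shape, while createMaterializeVectorsPass({}) falls back
// to the -vector-size command-line option registered above.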