[mlir][linalg] Improve implementation of hoist padding.

Instead of relying on adhoc bounds calculations, use a projection-based
implementation. This simplifies the implementation and finds more static
constant sizes than previously.

Differential Revision: https://reviews.llvm.org/D106054
This commit is contained in:
Nicolas Vasilache
2021-07-15 09:56:50 +00:00
parent 5024fe9306
commit 01bdb0f75e
5 changed files with 335 additions and 125 deletions

View File

@@ -0,0 +1,67 @@
//===- ConstraintsSet.h - Extensions for FlatAffineConstraints --*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Linalg-specific constraints set extensions.
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_LINALG_ANALYSIS_CONSTRAINTS_SET_H_
#define MLIR_DIALECT_LINALG_ANALYSIS_CONSTRAINTS_SET_H_
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/IR/AffineMap.h"
namespace mlir {
class ValueRange;
/// Linalg-specific constraints set extensions.
/// Thin layer on top of FlatAffineConstraints that folds the semantics of
/// affine.min / affine.max / affine.apply results into the constraint system
/// without taking a dependence on the op definitions themselves.
class ConstraintsSet : public FlatAffineConstraints {
public:
ConstraintsSet() : FlatAffineConstraints() {}
/// Assuming `val` is defined by `val = affine.min map (operands)`, introduce
/// all the constraints `val <= expr_i(operands)`, where expr_i are all the
/// results of `map` (a min result is bounded above by each map result).
// This API avoids taking a dependence on the AffineMinOp definition.
LogicalResult composeMin(Value val, AffineMap map, ValueRange operands) {
return composeMinOrMaxMapAndOperands(val, map, operands, /*min=*/true);
}
/// Assuming `val` is defined by `val = affine.max map (operands)`, introduce
/// all the constraints `val >= expr_i(operands)`, where expr_i are all the
/// results of `map` (a max result is bounded below by each map result).
// This API avoids taking a dependence on the AffineMaxOp definition.
LogicalResult composeMax(Value val, AffineMap map, ValueRange operands) {
return composeMinOrMaxMapAndOperands(val, map, operands, /*min=*/false);
}
/// Assuming `val` is defined by `val = affine.apply map (operands)`, call
/// composeMap to fold the apply as an equality on `val`.
// This API avoids taking a dependence on the AffineApplyOp definition.
LogicalResult composeAffineApply(Value val, AffineMap map,
ValueRange operands);
/// Asserts the identifier `id` is in the constraints set and returns its
/// position.
unsigned lookupPos(Value id) const;
/// If v is not in the constraint set, insert it as a dim or symbol depending
/// on `asDim`.
/// Return success if v is of dim id type when `asDim` is true and of symbol
/// id type when `asDim` is false.
/// Return failure otherwise.
LogicalResult ensureIdOfType(Value v, bool asDim);
private:
/// Implementation detail for composeMin/Max: adds one inequality per map
/// result relating `val` to that result.
LogicalResult composeMinOrMaxMapAndOperands(Value val, AffineMap map,
ValueRange operands, bool min);
};
} // namespace mlir
#endif // MLIR_DIALECT_LINALG_ANALYSIS_CONSTRAINTS_SET_H_

View File

@@ -1,12 +1,15 @@
# Analysis library for the Linalg dialect: dependence analysis plus the
# FlatAffineConstraints-based ConstraintsSet extensions.
add_mlir_dialect_library(MLIRLinalgAnalysis
ConstraintsSet.cpp
DependenceAnalysis.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Linalg
LINK_LIBS PUBLIC
MLIRAnalysis
MLIRIR
MLIRLinalg
MLIRLoopAnalysis
MLIRMemRef
MLIRStandard
)

View File

@@ -0,0 +1,87 @@
//===- ConstraintsSet.cpp - Extensions for FlatAffineConstraints ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Linalg-specific constraints set extensions.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Linalg/Analysis/ConstraintsSet.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/IR/AffineMap.h"
using namespace mlir;
/// Return the position of identifier `id` in the constraint system.
/// `id` must already be present: a failed lookup is a programming error and
/// aborts after printing the offending value.
unsigned ConstraintsSet::lookupPos(Value id) const {
  unsigned position;
  if (findId(id, &position))
    return position;
  llvm::errs() << "Lookup failed: " << id << "\n";
  llvm_unreachable("Lookup failed");
}
/// Make sure `v` participates in the constraint system with the requested id
/// kind.
/// If `v` is not tracked yet, append it as a new dim (when `asDim` is true)
/// or symbol (when `asDim` is false) identifier and succeed.
/// If `v` is already tracked, succeed only when its existing position falls
/// in the segment matching the requested kind.
LogicalResult ConstraintsSet::ensureIdOfType(Value v, bool asDim) {
  if (!containsId(v)) {
    // Not present yet: append at the end of the dim (resp. symbol) segment.
    if (asDim)
      addDimId(getNumDimIds(), v);
    else
      addSymbolId(getNumSymbolIds(), v);
    return success();
  }
  // Already present: classify the existing position.
  unsigned pos = lookupPos(v);
  unsigned numDims = getNumDimIds();
  bool isDim = pos < numDims;
  bool isSymbol = !isDim && pos < numDims + getNumSymbolIds();
  return success(asDim ? isDim : isSymbol);
}
/// Fold the semantics of `val = affine.apply map(operands)` into the
/// constraint system by delegating to FlatAffineConstraints::composeMap.
LogicalResult ConstraintsSet::composeAffineApply(Value val, AffineMap map,
                                                 ValueRange operands) {
  AffineValueMap valueMap(map, operands, val);
  return composeMap(&valueMap);
}
/// Implementation detail for composeMin/composeMax: add one inequality per
/// result of `map` relating `val` to that result evaluated at `operands`.
/// When `min` is true the inequalities bound `val` from above
/// (`val <= expr_i`); otherwise they bound it from below (`val >= expr_i`).
/// `val` and all `operands` must already be identifiers in the set.
/// Return failure if `map` cannot be flattened or requires local ids.
LogicalResult ConstraintsSet::composeMinOrMaxMapAndOperands(Value val,
                                                            AffineMap map,
                                                            ValueRange operands,
                                                            bool min) {
  ConstraintsSet localCst;
  std::vector<SmallVector<int64_t, 8>> flatExprs;
  if (failed(getFlattenedAffineExprs(map, &flatExprs, &localCst)))
    return failure();
  assert(flatExprs.size() == map.getNumResults() &&
         "incorrect number of flattened expressions");
  // Local vars on a per-need basis.
  if (localCst.getNumLocalIds() != 0)
    return failure();
  // Add one inequality for each result connecting `val` to the other ids in
  // `operands`. For instance, if the expression is:
  //   `16 * i0 + i1` and `min` is true,
  // add:
  //   -d_val + 16 * i0 + i1 >= 0.
  for (const auto &flatExpr : flatExprs) {
    assert(flatExpr.size() >= operands.size() + 1);
    SmallVector<int64_t, 8> ineq(getNumCols(), 0);
    for (unsigned i = 0, e = operands.size(); i < e; i++)
      ineq[lookupPos(operands[i])] = min ? flatExpr[i] : -flatExpr[i];
    // Set the coefficient for `d_val`.
    ineq[lookupPos(val)] = min ? -1 : 1;
    // Set the constant term (upper bound in flatExpr is exclusive).
    // NOTE(review): the extra `- 1` in the `min` case makes the bound strict
    // (val <= expr - 1) — confirm this exclusive-bound convention is intended.
    ineq[getNumCols() - 1] = min ? flatExpr[flatExpr.size() - 1] - 1
                                 : -flatExpr[flatExpr.size() - 1];
    // Add the inequality connecting the result of the map to the rest.
    addInequality(ineq);
  }
  return success();
}

View File

@@ -12,8 +12,10 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Linalg/Analysis/ConstraintsSet.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
@@ -530,97 +532,6 @@ bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
}
/// Compute the tightest lower bound with quantities that are all defined
/// outside of `outer`.
/// Return null if such a bound cannot be computed.
Value computeLoopIndependentLowerBound(OpBuilder &b, scf::ForOp outer,
                                       Value v) {
  // Only values already independent of `outer` are accepted; no rewriting is
  // attempted for lower bounds.
  return isDefinedOutsideOrConstant(outer, v) ? v : Value();
}
/// Compute the tightest upper bound with quantities that are all defined
/// outside of `outer`.
/// Expects all ops in the backward slice of `v` up to `outer` to be either
/// scf.for, affine.min or affine.apply.
static Value computeLoopIndependentUpperBound(OpBuilder &b, scf::ForOp outer,
                                              Value v) {
  if (isDefinedOutsideOrConstant(outer, v))
    return v;
  LLVM_DEBUG(DBGS() << "Begin loopIndependentUpperBound for: " << v << "\n");
  bool ok =
      backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineMinOp, AffineApplyOp>(
          outer, v);
  // Message matches the checked op set (it previously omitted AffineApplyOp).
  assert(ok && "expected to only be defined by scf::ForOp, AffineMinOp and "
               "AffineApplyOp");
  (void)ok;
  // Compute a backward slice up to, but not including, `outer`.
  SetVector<Operation *> backwardSlice;
  getBackwardSlice(v, &backwardSlice,
                   [&](Operation *op) { return outer->isProperAncestor(op); });
  backwardSlice.insert(v.getDefiningOp());
  // Clone/rewrite the slice just before `outer`, mapping old values to the
  // hoisted clones via `bvm`.
  OpBuilder::InsertionGuard g(b);
  b.setInsertionPoint(outer);
  Value res = v;
  BlockAndValueMapping bvm;
  for (Operation *op : backwardSlice) {
    if (isa<scf::ForOp>(op))
      continue;
    if (isa<AffineApplyOp>(op)) {
      // affine.apply can be cloned verbatim; bvm remaps its operands.
      b.clone(*op, bvm);
      continue;
    }
    auto sliceMinOp = cast<AffineMinOp>(op);
    GetMinMaxExprFn getSCFMinMax = [&](Value value,
                                       SmallVectorImpl<Value> &dims,
                                       SmallVectorImpl<Value> &symbols) {
      return getSCFMinMaxExpr(value, dims, symbols, [&](Operation *op) {
        return outer->isAncestor(op);
      });
    };
    // Perform the substitution of the operands of AffineMinOp.
    auto mapAndOperands = substituteMin(sliceMinOp, getSCFMinMax);
    SmallVector<Value> resultOperands = mapAndOperands.dims;
    llvm::append_range(resultOperands, mapAndOperands.symbols);
    AffineMap map = mapAndOperands.map;
    canonicalizeMapAndOperands(&map, &resultOperands);
    map = simplifyAffineMap(map);
    // Rebuild the affine.min above `outer`, remapping operands through bvm.
    res = b.create<AffineMinOp>(
        outer->getLoc(), map,
        llvm::to_vector<4>(llvm::map_range(resultOperands, [&](Value operand) {
          return bvm.lookupOrDefault(operand);
        })));
    bvm.map(sliceMinOp, res);
  }
  LLVM_DEBUG(DBGS() << "End loopIndependentUpperBound with: " << res << "\n");
  return res;
}
/// Return the number of iterations in the loop (ub - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
static Value buildLoopTripCount(OpBuilder &b, scf::ForOp outer,
                                scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  // Symbolic form of the trip count: (ub - lb).ceilDiv(step).
  AffineExpr lb, ub, step;
  bindDims(ctx, lb, ub);
  bindSymbols(ctx, step);
  // Materialize loop-independent bound values; bail out when either bound
  // cannot be made independent of the enclosing loops.
  Value lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound());
  Value ubVal = computeLoopIndependentUpperBound(b, outer, forOp.upperBound());
  Value stepVal = forOp.step();
  if (!lbVal || !ubVal || !stepVal)
    return Value();
  return b.create<AffineApplyOp>(forOp->getLoc(), (ub - lb).ceilDiv(step),
                                 ValueRange{lbVal, ubVal, stepVal});
}
/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
@@ -631,14 +542,135 @@ static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
AffineExpr iv, lb, step;
bindDims(ctx, iv, lb);
bindSymbols(ctx, step);
Value ivVal = forOp.getInductionVar(),
lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
stepVal = forOp.step();
if (!ivVal || !lbVal || !stepVal)
if (!isDefinedOutsideOrConstant(outer, forOp.lowerBound()) ||
!isDefinedOutsideOrConstant(outer, forOp.step()))
return Value();
Value ivVal = forOp.getInductionVar(), lbVal = forOp.lowerBound(),
stepVal = forOp.step();
auto loc = forOp->getLoc();
return b.create<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
ValueRange{ivVal, lbVal, stepVal});
return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
ValueRange{ivVal, lbVal, stepVal});
}
/// Given a set of loops, assumed to be scf::ForOp, create a constraint set
/// containing the inequalities `iv - lb >= 0` and `-iv + ub >= 0` for each
/// loop.
/// Dim ids are laid out as [iv_0..iv_{n-1}, lb_0..lb_{n-1}, ub_0..ub_{n-1}];
/// the inequality coefficients below rely on this exact layout.
static ConstraintsSet initLoopIvsAndBounds(ArrayRef<Operation *> loops) {
ConstraintsSet constraints;
// Insert ids segment by segment: first all ivs, then all lbs, then all ubs.
for (Operation *op : loops)
constraints.addDimId(constraints.getNumDimIds(),
cast<scf::ForOp>(op).getInductionVar());
for (Operation *op : loops)
constraints.addDimId(constraints.getNumDimIds(),
cast<scf::ForOp>(op).lowerBound());
for (Operation *op : loops)
constraints.addDimId(constraints.getNumDimIds(),
cast<scf::ForOp>(op).upperBound());
unsigned numLoops = loops.size();
for (unsigned ivIdx = 0, e = numLoops; ivIdx < e; ++ivIdx) {
// iv - lb >= 0
SmallVector<int64_t, 8> ineqLb(constraints.getNumCols(), 0);
ineqLb[ivIdx] = 1;
ineqLb[ivIdx + numLoops] = -1;
// -iv + ub - 1 >= 0, i.e. iv <= ub - 1: scf.for upper bounds are exclusive,
// hence the -1 constant term set below.
SmallVector<int64_t, 8> ineqUb(constraints.getNumCols(), 0);
ineqUb[ivIdx] = -1;
ineqUb[ivIdx + 2 * numLoops] = 1;
ineqUb[constraints.getNumCols() - 1] = -1;
constraints.addInequality(ineqLb);
constraints.addInequality(ineqUb);
}
return constraints;
}
/// For each loop in `loops`, determine the ops involved in the construction of
/// its upper bound---up to the outerLimit loop--- and fold them as new
/// inequalities in the constraint set.
/// This is achieved by computing the backwardSlice of the loop's upper bound
/// and iteratively folding each op in reverse topological order to guarantee
/// use-def ordering.
/// As operations are folded in, their result is projected out of the
/// constraints set.
/// The following operations are supported:
/// - scf::ForOp are simply skipped.
/// - AffineApplyOp are composed to replace the result by an equality.
/// - AffineMinOp are composed by adding each entry as an upper bound.
/// If any other operation is met, return failure.
// TODO: extend on a per-need basis.
static LogicalResult
foldUpperBoundsIntoConstraintsSet(ConstraintsSet &constraints,
scf::ForOp outerLimit,
ArrayRef<Operation *> loops) {
SetVector<Value> toProjectOut;
for (Operation *loop : loops) {
auto ub = cast<scf::ForOp>(loop).upperBound();
// Upper bounds already independent of the loop nest need no folding.
if (isDefinedOutsideOrConstant(outerLimit, ub))
continue;
// Compute a backward slice up to, but not including, `outerLimit`.
SetVector<Operation *> backwardSlice;
getBackwardSlice(ub, &backwardSlice, [&](Operation *op) {
return outerLimit->isProperAncestor(op);
});
backwardSlice.insert(ub.getDefiningOp());
// Iterate over all ops in the slice and compose them in the constraints.
for (Operation *op : llvm::reverse(backwardSlice)) {
if (!isa<scf::ForOp, AffineApplyOp, AffineMinOp>(op))
return failure();
if (isa<scf::ForOp>(op))
continue;
// Ensure there is a dim id in the constraint set for every result and
// operand of `op` before composing it in.
auto ensureIdFailed = [&](Value v) {
return failed(constraints.ensureIdOfType(v, /*asDim=*/true));
};
// Ensure all ids exist and add results for later projection.
if (llvm::any_of(op->getResults(), ensureIdFailed) ||
llvm::any_of(op->getOperands(), ensureIdFailed))
return failure();
// All supported ops have 1 result.
// TODO: extend when needed.
toProjectOut.insert(op->getResult(0));
// Compose supported ops.
if (auto affineApplyOp = dyn_cast<AffineApplyOp>(op)) {
if (failed(constraints.composeAffineApply(affineApplyOp.getResult(),
affineApplyOp.getAffineMap(),
affineApplyOp.getOperands())))
return failure();
continue;
}
auto affineMinOp = cast<AffineMinOp>(op);
if (failed(constraints.composeMin(affineMinOp.getResult(),
affineMinOp.getAffineMap(),
affineMinOp.operands())))
return failure();
}
}
// Eliminate the folded op results so only loop ivs/bounds and values defined
// above `outerLimit` remain in the constraint system.
for (Value v : toProjectOut)
constraints.projectOut(v);
return success();
}
/// Compute dynamic tensor sizes, independent of any value defined inside
/// `outer` and such that every n-D iteration of the packingLoops has its own
/// space (so that each packed buffer has a storage location). This is achieved
/// by computing the extent for each of the packing loops.
static LogicalResult computeBounds(scf::ForOp outer,
                                   ArrayRef<Operation *> packingLoops,
                                   SmallVector<AffineMap> &lbs,
                                   SmallVector<AffineMap> &ubs) {
  // Packing loop IVs are introduced as the first positions.
  ConstraintsSet cstr = initLoopIvsAndBounds(packingLoops);
  // Fold each loop's upper-bound computation into the set; fail if any op in
  // a bound's backward slice is unsupported.
  if (failed(foldUpperBoundsIntoConstraintsSet(cstr, outer, packingLoops)))
    return failure();
  // Compute the bounds of the first positions, assuming the others are fixed.
  cstr.getSliceBounds(/*pos=*/0, /*num=*/packingLoops.size(),
                      outer->getContext(), &lbs, &ubs);
  return success();
}
/// Ensure prerequisites that guarantee pad op hoisting can occur.
@@ -725,28 +757,49 @@ hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels,
assert(outermostEnclosingForOp == backwardSlice.front());
scf::ForOp outer = cast<scf::ForOp>(outermostEnclosingForOp);
if (llvm::any_of(packingLoops, [&](Operation *op) {
scf::ForOp forOp = cast<scf::ForOp>(op);
Value lb = forOp.lowerBound(), ub = forOp.upperBound(),
step = forOp.step();
return !isDefinedOutsideOrConstant(outer, lb) ||
!(isDefinedOutsideOrConstant(outer, ub) ||
backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineMinOp,
AffineApplyOp>(outer, ub)) ||
!isDefinedOutsideOrConstant(outer, step);
}))
ConstraintsSet constraints = initLoopIvsAndBounds(packingLoops.getArrayRef());
if (failed(foldUpperBoundsIntoConstraintsSet(constraints, outer,
packingLoops.getArrayRef())))
return failure();
unsigned numLoops = packingLoops.size();
SmallVector<AffineMap> lbs(numLoops), ubs(numLoops);
if (failed(computeBounds(outer, packingLoops.getArrayRef(), lbs, ubs)))
return failure();
SmallVector<Value> allValues;
constraints.getAllIdValues(&allValues);
SmallVector<Value> allNonLoopValues(allValues.begin() + numLoops,
allValues.end());
// For each packingLoop, create the extent by (ub - lb).ceilDiv(step).
// IP just before the outermost loop considered that we hoist above.
OpBuilder b(outermostEnclosingForOp);
dynamicTensorSizes =
llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *op) {
return buildLoopTripCount(b, cast<scf::ForOp>(outermostEnclosingForOp),
cast<scf::ForOp>(op));
}));
// Assert all loop trip counts can be computed.
if (!llvm::all_of(dynamicTensorSizes, [](Value v) { return v; }))
llvm_unreachable("loop independence prerequisite not met");
ImplicitLocOpBuilder b(outer->getLoc(), outer);
assert(packingLoops.size() == lbs.size() && "expected matching lb sizes");
assert(packingLoops.size() == ubs.size() && "expected matching ub sizes");
for (auto it : llvm::zip(packingLoops, lbs, ubs)) {
scf::ForOp loop = cast<scf::ForOp>(std::get<0>(it));
AffineMap lbMap = std::get<1>(it);
AffineMap ubMap = std::get<2>(it);
SmallVector<Value> lbOperands(allNonLoopValues);
canonicalizeMapAndOperands(&lbMap, &lbOperands);
Value lbVal = b.createOrFold<AffineMaxOp>(lbMap, lbOperands);
SmallVector<Value> ubOperands(allNonLoopValues);
canonicalizeMapAndOperands(&ubMap, &ubOperands);
Value ubVal = b.createOrFold<AffineMinOp>(ubMap, ubOperands);
AffineExpr lb, ub, step;
bindDims(b.getContext(), lb, ub);
bindSymbols(b.getContext(), step);
Value res = b.createOrFold<AffineApplyOp>(
(ub - lb).ceilDiv(step),
ValueRange{lbVal, ubVal, cast<scf::ForOp>(loop).step()});
dynamicTensorSizes.push_back(res);
}
return success();
}

View File

@@ -141,8 +141,10 @@ func @matmul_tensors(
// -----
// CHECK-DAG: #[[$MIN_REST8:[0-9a-z]+]] = affine_map<(d0)[s0] -> (8, -d0 + s0)>
// CHECK-DAG: #[[$MIN_MOD4:[0-9a-z]+]] = affine_map<(d0) -> (4, d0 - ((d0 - 1) floordiv 4) * 4)>
// CHECK-DAG: #[[$MIN_REST4:[0-9a-z]+]] = affine_map<(d0, d1) -> (4, d0 - d1)>
// CHECK-DAG: #[[$MIN_REST2:[0-9a-z]+]] = affine_map<(d0, d1) -> (2, d0 - d1)>
// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
// CHECK-DAG: #[[$DIV2:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 2)>
#map0 = affine_map<(d0)[s0] -> (8, -d0 + s0)>
@@ -167,20 +169,18 @@ func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
//
// CHECK: %[[MR8:.*]] = affine.min #[[$MIN_REST8]](%[[I]])
// CHECK: %[[D0:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
// CHECK: %[[MM4:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
// CHECK: %[[D1:.*]] = affine.apply #[[$DIV2]](%[[MM4]])
// Init tensor and pack.
// CHECK: %[[INIT_PACKED_A:.*]] = linalg.init_tensor [%[[D0]], %[[D1]], 2] : tensor<?x?x2xf32>
// CHECK: %[[PACKED_A:.*]] = scf.for %[[II:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_A]]) -> (tensor<?x?x2xf32>) {
// CHECK: %[[INIT_PACKED_A:.*]] = linalg.init_tensor [%[[D0]], 2, 2] : tensor<?x2x2xf32>
// CHECK: %[[CAST_INIT_PACKED_A:.*]] = tensor.cast %[[INIT_PACKED_A]] : tensor<?x2x2xf32> to tensor<?x?x2xf32>
// CHECK: %[[PACKED_A:.*]] = scf.for %[[II:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[CAST_INIT_PACKED_A]]) -> (tensor<?x?x2xf32>) {
// CHECK: scf.for %[[III:[0-9a-z]+]] =
// CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
//
// CHECK: %[[D0_2:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
// CHECK: %[[MM4_2:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
// CHECK: %[[D1_2:.*]] = affine.apply #[[$DIV2]](%[[MM4_2]])
// Init tensor and pack.
// CHECK: %[[INIT_PACKED_B:.*]] = linalg.init_tensor [%[[D0_2]], %[[D1_2]], 2] : tensor<?x?x2xf32>
// CHECK: %[[PACKED_B:.*]] = scf.for %[[II_2:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_B]]) -> (tensor<?x?x2xf32>) {
// CHECK: %[[INIT_PACKED_B:.*]] = linalg.init_tensor [%[[D0_2]], 2, 2] : tensor<?x2x2xf32>
// CHECK: %[[CAST_INIT_PACKED_B:.*]] = tensor.cast %[[INIT_PACKED_B]] : tensor<?x2x2xf32> to tensor<?x?x2xf32>
// CHECK: %[[PACKED_B:.*]] = scf.for %[[II_2:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[CAST_INIT_PACKED_B]]) -> (tensor<?x?x2xf32>) {
// CHECK: scf.for %[[III_2:[0-9a-z]+]] =
// CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
// Compute.