[mlir][linalg] Improve implementation of hoist padding.

Instead of relying on adhoc bounds calculations, use a projection-based
implementation. This simplifies the implementation and finds more static
constant sizes than previously.

Differential Revision: https://reviews.llvm.org/D106054
This commit is contained in:
Nicolas Vasilache
2021-07-15 09:56:50 +00:00
parent 5024fe9306
commit 01bdb0f75e
5 changed files with 335 additions and 125 deletions

View File

@@ -0,0 +1,67 @@
//===- ConstraintsSet.h - Extensions for FlatAffineConstraints --*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Linalg-specific constraints set extensions.
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_LINALG_ANALYSIS_CONSTRAINTS_SET_H_
#define MLIR_DIALECT_LINALG_ANALYSIS_CONSTRAINTS_SET_H_
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/IR/AffineMap.h"
namespace mlir {
class ValueRange;
/// Linalg-specific constraints set extensions.
/// Thin layer on top of FlatAffineConstraints that folds the semantics of
/// affine.min / affine.max / affine.apply results into the constraint system
/// without taking a dependence on the op definitions themselves.
class ConstraintsSet : public FlatAffineConstraints {
public:
ConstraintsSet() : FlatAffineConstraints() {}
/// Assuming `val` is defined by `val = affine.min map (operands)`, introduce
/// all the constraints `val <= expr_i(operands)`, where expr_i are all the
/// results of `map` (a min result is bounded above by each map result).
// This API avoids taking a dependence on the AffineMinOp definition.
LogicalResult composeMin(Value val, AffineMap map, ValueRange operands) {
return composeMinOrMaxMapAndOperands(val, map, operands, /*min=*/true);
}
/// Assuming `val` is defined by `val = affine.max map (operands)`, introduce
/// all the constraints `val >= expr_i(operands)`, where expr_i are all the
/// results of `map` (a max result is bounded below by each map result).
// This API avoids taking a dependence on the AffineMaxOp definition.
LogicalResult composeMax(Value val, AffineMap map, ValueRange operands) {
return composeMinOrMaxMapAndOperands(val, map, operands, /*min=*/false);
}
/// Assuming `val` is defined by `val = affine.apply map (operands)`, call
/// composeMap to fold the apply as an equality on `val`.
// This API avoids taking a dependence on the AffineApplyOp definition.
LogicalResult composeAffineApply(Value val, AffineMap map,
ValueRange operands);
/// Asserts the identifier `id` is in the constraints set and returns its
/// position.
unsigned lookupPos(Value id) const;
/// If v is not in the constraint set, insert it as a dim or symbol depending
/// on `asDim`.
/// Return success if v is of dim id type when `asDim` is true and of symbol
/// id type when `asDim` is false.
/// Return failure otherwise.
LogicalResult ensureIdOfType(Value v, bool asDim);
private:
/// Implementation detail for composeMin/Max: adds one inequality per map
/// result relating `val` to that result.
LogicalResult composeMinOrMaxMapAndOperands(Value val, AffineMap map,
ValueRange operands, bool min);
};
} // namespace mlir
#endif // MLIR_DIALECT_LINALG_ANALYSIS_CONSTRAINTS_SET_H_

View File

@@ -1,12 +1,15 @@
# Analysis library for the Linalg dialect: dependence analysis plus the
# FlatAffineConstraints-based ConstraintsSet extensions.
add_mlir_dialect_library(MLIRLinalgAnalysis
ConstraintsSet.cpp
DependenceAnalysis.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Linalg
LINK_LIBS PUBLIC
MLIRAnalysis
MLIRIR
MLIRLinalg
MLIRLoopAnalysis
MLIRMemRef
MLIRStandard
)

View File

@@ -0,0 +1,87 @@
//===- ConstraintsSet.cpp - Extensions for FlatAffineConstraints ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Linalg-specific constraints set extensions.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Linalg/Analysis/ConstraintsSet.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/IR/AffineMap.h"
using namespace mlir;
/// Return the position of identifier `id` in the constraint system.
/// `id` must already be present: a failed lookup is a programming error and
/// aborts after printing the offending value.
unsigned ConstraintsSet::lookupPos(Value id) const {
  unsigned position;
  if (findId(id, &position))
    return position;
  llvm::errs() << "Lookup failed: " << id << "\n";
  llvm_unreachable("Lookup failed");
}
/// Make sure `v` participates in the constraint system with the requested id
/// kind.
/// If `v` is not tracked yet, append it as a new dim (when `asDim` is true)
/// or symbol (when `asDim` is false) identifier and succeed.
/// If `v` is already tracked, succeed only when its existing position falls
/// in the segment matching the requested kind.
LogicalResult ConstraintsSet::ensureIdOfType(Value v, bool asDim) {
  if (!containsId(v)) {
    // Not present yet: append at the end of the dim (resp. symbol) segment.
    if (asDim)
      addDimId(getNumDimIds(), v);
    else
      addSymbolId(getNumSymbolIds(), v);
    return success();
  }
  // Already present: classify the existing position.
  unsigned pos = lookupPos(v);
  unsigned numDims = getNumDimIds();
  bool isDim = pos < numDims;
  bool isSymbol = !isDim && pos < numDims + getNumSymbolIds();
  return success(asDim ? isDim : isSymbol);
}
/// Fold the semantics of `val = affine.apply map(operands)` into the
/// constraint system by delegating to FlatAffineConstraints::composeMap.
LogicalResult ConstraintsSet::composeAffineApply(Value val, AffineMap map,
                                                 ValueRange operands) {
  AffineValueMap valueMap(map, operands, val);
  return composeMap(&valueMap);
}
/// Implementation detail for composeMin/composeMax: add one inequality per
/// result of `map` relating `val` to that result evaluated at `operands`.
/// When `min` is true the inequalities bound `val` from above
/// (`val <= expr_i`); otherwise they bound it from below (`val >= expr_i`).
/// `val` and all `operands` must already be identifiers in the set.
/// Return failure if `map` cannot be flattened or requires local ids.
LogicalResult ConstraintsSet::composeMinOrMaxMapAndOperands(Value val,
                                                            AffineMap map,
                                                            ValueRange operands,
                                                            bool min) {
  ConstraintsSet localCst;
  std::vector<SmallVector<int64_t, 8>> flatExprs;
  if (failed(getFlattenedAffineExprs(map, &flatExprs, &localCst)))
    return failure();
  assert(flatExprs.size() == map.getNumResults() &&
         "incorrect number of flattened expressions");
  // Local vars on a per-need basis.
  if (localCst.getNumLocalIds() != 0)
    return failure();
  // Add one inequality for each result connecting `val` to the other ids in
  // `operands`. For instance, if the expression is:
  //   `16 * i0 + i1` and `min` is true,
  // add:
  //   -d_val + 16 * i0 + i1 >= 0.
  for (const auto &flatExpr : flatExprs) {
    assert(flatExpr.size() >= operands.size() + 1);
    SmallVector<int64_t, 8> ineq(getNumCols(), 0);
    for (unsigned i = 0, e = operands.size(); i < e; i++)
      ineq[lookupPos(operands[i])] = min ? flatExpr[i] : -flatExpr[i];
    // Set the coefficient for `d_val`.
    ineq[lookupPos(val)] = min ? -1 : 1;
    // Set the constant term (upper bound in flatExpr is exclusive).
    // NOTE(review): the extra `- 1` in the `min` case makes the bound strict
    // (val <= expr - 1) — confirm this exclusive-bound convention is intended.
    ineq[getNumCols() - 1] = min ? flatExpr[flatExpr.size() - 1] - 1
                                 : -flatExpr[flatExpr.size() - 1];
    // Add the inequality connecting the result of the map to the rest.
    addInequality(ineq);
  }
  return success();
}

View File

@@ -12,8 +12,10 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Linalg/Analysis/ConstraintsSet.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
@@ -530,97 +532,6 @@ bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
}
/// Compute the tightest lower bound with quantities that are all defined
/// outside of `outer`.
/// Return null if such a bound cannot be computed.
Value computeLoopIndependentLowerBound(OpBuilder &b, scf::ForOp outer,
                                       Value v) {
  // Only values already independent of `outer` are accepted; no rewriting is
  // attempted for lower bounds.
  return isDefinedOutsideOrConstant(outer, v) ? v : Value();
}
/// Compute the tightest upper bound with quantities that are all defined
/// outside of `outer`.
/// Expects all ops in the backward slice of `v` up to `outer` to be either
/// scf.for, affine.min or affine.apply.
static Value computeLoopIndependentUpperBound(OpBuilder &b, scf::ForOp outer,
                                              Value v) {
  if (isDefinedOutsideOrConstant(outer, v))
    return v;
  LLVM_DEBUG(DBGS() << "Begin loopIndependentUpperBound for: " << v << "\n");
  bool ok =
      backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineMinOp, AffineApplyOp>(
          outer, v);
  // Message matches the checked op set (it previously omitted AffineApplyOp).
  assert(ok && "expected to only be defined by scf::ForOp, AffineMinOp and "
               "AffineApplyOp");
  (void)ok;
  // Compute a backward slice up to, but not including, `outer`.
  SetVector<Operation *> backwardSlice;
  getBackwardSlice(v, &backwardSlice,
                   [&](Operation *op) { return outer->isProperAncestor(op); });
  backwardSlice.insert(v.getDefiningOp());
  // Clone/rewrite the slice just before `outer`, mapping old values to the
  // hoisted clones via `bvm`.
  OpBuilder::InsertionGuard g(b);
  b.setInsertionPoint(outer);
  Value res = v;
  BlockAndValueMapping bvm;
  for (Operation *op : backwardSlice) {
    if (isa<scf::ForOp>(op))
      continue;
    if (isa<AffineApplyOp>(op)) {
      // affine.apply can be cloned verbatim; bvm remaps its operands.
      b.clone(*op, bvm);
      continue;
    }
    auto sliceMinOp = cast<AffineMinOp>(op);
    GetMinMaxExprFn getSCFMinMax = [&](Value value,
                                       SmallVectorImpl<Value> &dims,
                                       SmallVectorImpl<Value> &symbols) {
      return getSCFMinMaxExpr(value, dims, symbols, [&](Operation *op) {
        return outer->isAncestor(op);
      });
    };
    // Perform the substitution of the operands of AffineMinOp.
    auto mapAndOperands = substituteMin(sliceMinOp, getSCFMinMax);
    SmallVector<Value> resultOperands = mapAndOperands.dims;
    llvm::append_range(resultOperands, mapAndOperands.symbols);
    AffineMap map = mapAndOperands.map;
    canonicalizeMapAndOperands(&map, &resultOperands);
    map = simplifyAffineMap(map);
    // Rebuild the affine.min above `outer`, remapping operands through bvm.
    res = b.create<AffineMinOp>(
        outer->getLoc(), map,
        llvm::to_vector<4>(llvm::map_range(resultOperands, [&](Value operand) {
          return bvm.lookupOrDefault(operand);
        })));
    bvm.map(sliceMinOp, res);
  }
  LLVM_DEBUG(DBGS() << "End loopIndependentUpperBound with: " << res << "\n");
  return res;
}
/// Return the number of iterations in the loop (ub - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
static Value buildLoopTripCount(OpBuilder &b, scf::ForOp outer,
                                scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  // Symbolic form of the trip count: (ub - lb).ceilDiv(step).
  AffineExpr lb, ub, step;
  bindDims(ctx, lb, ub);
  bindSymbols(ctx, step);
  // Materialize loop-independent bound values; bail out when either bound
  // cannot be made independent of the enclosing loops.
  Value lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound());
  Value ubVal = computeLoopIndependentUpperBound(b, outer, forOp.upperBound());
  Value stepVal = forOp.step();
  if (!lbVal || !ubVal || !stepVal)
    return Value();
  return b.create<AffineApplyOp>(forOp->getLoc(), (ub - lb).ceilDiv(step),
                                 ValueRange{lbVal, ubVal, stepVal});
}
/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
@@ -631,14 +542,135 @@ static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
AffineExpr iv, lb, step;
bindDims(ctx, iv, lb);
bindSymbols(ctx, step);
Value ivVal = forOp.getInductionVar(),
lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
stepVal = forOp.step();
if (!ivVal || !lbVal || !stepVal)
if (!isDefinedOutsideOrConstant(outer, forOp.lowerBound()) ||
!isDefinedOutsideOrConstant(outer, forOp.step()))
return Value();
Value ivVal = forOp.getInductionVar(), lbVal = forOp.lowerBound(),
stepVal = forOp.step();
auto loc = forOp->getLoc();
return b.create<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
ValueRange{ivVal, lbVal, stepVal});
return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
ValueRange{ivVal, lbVal, stepVal});
}
/// Given a set of loops, assumed to be scf::ForOp, create a constraint set
/// containing the inequalities `iv - lb >= 0` and `-iv + ub >= 0` for each
/// loop.
/// Dim ids are laid out as [iv_0..iv_{n-1}, lb_0..lb_{n-1}, ub_0..ub_{n-1}];
/// the inequality coefficients below rely on this exact layout.
static ConstraintsSet initLoopIvsAndBounds(ArrayRef<Operation *> loops) {
ConstraintsSet constraints;
// Insert ids segment by segment: first all ivs, then all lbs, then all ubs.
for (Operation *op : loops)
constraints.addDimId(constraints.getNumDimIds(),
cast<scf::ForOp>(op).getInductionVar());
for (Operation *op : loops)
constraints.addDimId(constraints.getNumDimIds(),
cast<scf::ForOp>(op).lowerBound());
for (Operation *op : loops)
constraints.addDimId(constraints.getNumDimIds(),
cast<scf::ForOp>(op).upperBound());
unsigned numLoops = loops.size();
for (unsigned ivIdx = 0, e = numLoops; ivIdx < e; ++ivIdx) {
// iv - lb >= 0
SmallVector<int64_t, 8> ineqLb(constraints.getNumCols(), 0);
ineqLb[ivIdx] = 1;
ineqLb[ivIdx + numLoops] = -1;
// -iv + ub - 1 >= 0, i.e. iv <= ub - 1: scf.for upper bounds are exclusive,
// hence the -1 constant term set below.
SmallVector<int64_t, 8> ineqUb(constraints.getNumCols(), 0);
ineqUb[ivIdx] = -1;
ineqUb[ivIdx + 2 * numLoops] = 1;
ineqUb[constraints.getNumCols() - 1] = -1;
constraints.addInequality(ineqLb);
constraints.addInequality(ineqUb);
}
return constraints;
}
/// For each loop in `loops`, determine the ops involved in the construction of
/// its upper bound---up to the outerLimit loop--- and fold them as new
/// inequalities in the constraint set.
/// This is achieved by computing the backwardSlice of the loop's upper bound
/// and iteratively folding each op in reverse topological order to guarantee
/// use-def ordering.
/// As operations are folded in, their result is projected out of the
/// constraints set.
/// The following operations are supported:
/// - scf::ForOp are simply skipped.
/// - AffineApplyOp are composed to replace the result by an equality.
/// - AffineMinOp are composed by adding each entry as an upper bound.
/// If any other operation is met, return failure.
// TODO: extend on a per-need basis.
static LogicalResult
foldUpperBoundsIntoConstraintsSet(ConstraintsSet &constraints,
scf::ForOp outerLimit,
ArrayRef<Operation *> loops) {
SetVector<Value> toProjectOut;
for (Operation *loop : loops) {
auto ub = cast<scf::ForOp>(loop).upperBound();
// Upper bounds already independent of the loop nest need no folding.
if (isDefinedOutsideOrConstant(outerLimit, ub))
continue;
// Compute a backward slice up to, but not including, `outerLimit`.
SetVector<Operation *> backwardSlice;
getBackwardSlice(ub, &backwardSlice, [&](Operation *op) {
return outerLimit->isProperAncestor(op);
});
backwardSlice.insert(ub.getDefiningOp());
// Iterate over all ops in the slice and compose them in the constraints.
for (Operation *op : llvm::reverse(backwardSlice)) {
if (!isa<scf::ForOp, AffineApplyOp, AffineMinOp>(op))
return failure();
if (isa<scf::ForOp>(op))
continue;
// Ensure there is a dim id in the constraint set for every result and
// operand of `op` before composing it in.
auto ensureIdFailed = [&](Value v) {
return failed(constraints.ensureIdOfType(v, /*asDim=*/true));
};
// Ensure all ids exist and add results for later projection.
if (llvm::any_of(op->getResults(), ensureIdFailed) ||
llvm::any_of(op->getOperands(), ensureIdFailed))
return failure();
// All supported ops have 1 result.
// TODO: extend when needed.
toProjectOut.insert(op->getResult(0));
// Compose supported ops.
if (auto affineApplyOp = dyn_cast<AffineApplyOp>(op)) {
if (failed(constraints.composeAffineApply(affineApplyOp.getResult(),
affineApplyOp.getAffineMap(),
affineApplyOp.getOperands())))
return failure();
continue;
}
auto affineMinOp = cast<AffineMinOp>(op);
if (failed(constraints.composeMin(affineMinOp.getResult(),
affineMinOp.getAffineMap(),
affineMinOp.operands())))
return failure();
}
}
// Eliminate the folded op results so only loop ivs/bounds and values defined
// above `outerLimit` remain in the constraint system.
for (Value v : toProjectOut)
constraints.projectOut(v);
return success();
}
/// Compute dynamic tensor sizes, independent of any value defined inside
/// `outer` and such that every n-D iteration of the packingLoops has its own
/// space (so that each packed buffer has a storage location). This is achieved
/// by computing the extent for each of the packing loops.
static LogicalResult computeBounds(scf::ForOp outer,
                                   ArrayRef<Operation *> packingLoops,
                                   SmallVector<AffineMap> &lbs,
                                   SmallVector<AffineMap> &ubs) {
  // Packing loop IVs are introduced as the first positions.
  ConstraintsSet cstr = initLoopIvsAndBounds(packingLoops);
  // Fold each loop's upper-bound computation into the set; fail if any op in
  // a bound's backward slice is unsupported.
  if (failed(foldUpperBoundsIntoConstraintsSet(cstr, outer, packingLoops)))
    return failure();
  // Compute the bounds of the first positions, assuming the others are fixed.
  cstr.getSliceBounds(/*pos=*/0, /*num=*/packingLoops.size(),
                      outer->getContext(), &lbs, &ubs);
  return success();
}
/// Ensure prerequisites that guarantee pad op hoisting can occur.
@@ -725,28 +757,49 @@ hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels,
assert(outermostEnclosingForOp == backwardSlice.front());
scf::ForOp outer = cast<scf::ForOp>(outermostEnclosingForOp);
if (llvm::any_of(packingLoops, [&](Operation *op) {
scf::ForOp forOp = cast<scf::ForOp>(op);
Value lb = forOp.lowerBound(), ub = forOp.upperBound(),
step = forOp.step();
return !isDefinedOutsideOrConstant(outer, lb) ||
!(isDefinedOutsideOrConstant(outer, ub) ||
backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineMinOp,
AffineApplyOp>(outer, ub)) ||
!isDefinedOutsideOrConstant(outer, step);
}))
ConstraintsSet constraints = initLoopIvsAndBounds(packingLoops.getArrayRef());
if (failed(foldUpperBoundsIntoConstraintsSet(constraints, outer,
packingLoops.getArrayRef())))
return failure();
unsigned numLoops = packingLoops.size();
SmallVector<AffineMap> lbs(numLoops), ubs(numLoops);
if (failed(computeBounds(outer, packingLoops.getArrayRef(), lbs, ubs)))
return failure();
SmallVector<Value> allValues;
constraints.getAllIdValues(&allValues);
SmallVector<Value> allNonLoopValues(allValues.begin() + numLoops,
allValues.end());
// For each packingLoop, create the extent by (ub - lb).ceilDiv(step).
// IP just before the outermost loop considered that we hoist above.
OpBuilder b(outermostEnclosingForOp);
dynamicTensorSizes =
llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *op) {
return buildLoopTripCount(b, cast<scf::ForOp>(outermostEnclosingForOp),
cast<scf::ForOp>(op));
}));
// Assert all loop trip counts can be computed.
if (!llvm::all_of(dynamicTensorSizes, [](Value v) { return v; }))
llvm_unreachable("loop independence prerequisite not met");
ImplicitLocOpBuilder b(outer->getLoc(), outer);
assert(packingLoops.size() == lbs.size() && "expected matching lb sizes");
assert(packingLoops.size() == ubs.size() && "expected matching ub sizes");
for (auto it : llvm::zip(packingLoops, lbs, ubs)) {
scf::ForOp loop = cast<scf::ForOp>(std::get<0>(it));
AffineMap lbMap = std::get<1>(it);
AffineMap ubMap = std::get<2>(it);
SmallVector<Value> lbOperands(allNonLoopValues);
canonicalizeMapAndOperands(&lbMap, &lbOperands);
Value lbVal = b.createOrFold<AffineMaxOp>(lbMap, lbOperands);
SmallVector<Value> ubOperands(allNonLoopValues);
canonicalizeMapAndOperands(&ubMap, &ubOperands);
Value ubVal = b.createOrFold<AffineMinOp>(ubMap, ubOperands);
AffineExpr lb, ub, step;
bindDims(b.getContext(), lb, ub);
bindSymbols(b.getContext(), step);
Value res = b.createOrFold<AffineApplyOp>(
(ub - lb).ceilDiv(step),
ValueRange{lbVal, ubVal, cast<scf::ForOp>(loop).step()});
dynamicTensorSizes.push_back(res);
}
return success();
}

View File

@@ -141,8 +141,10 @@ func @matmul_tensors(
// -----
// CHECK-DAG: #[[$MIN_REST8:[0-9a-z]+]] = affine_map<(d0)[s0] -> (8, -d0 + s0)>
// CHECK-DAG: #[[$MIN_MOD4:[0-9a-z]+]] = affine_map<(d0) -> (4, d0 - ((d0 - 1) floordiv 4) * 4)>
// CHECK-DAG: #[[$MIN_REST4:[0-9a-z]+]] = affine_map<(d0, d1) -> (4, d0 - d1)>
// CHECK-DAG: #[[$MIN_REST2:[0-9a-z]+]] = affine_map<(d0, d1) -> (2, d0 - d1)>
// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
// CHECK-DAG: #[[$DIV2:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 2)>
#map0 = affine_map<(d0)[s0] -> (8, -d0 + s0)>
@@ -167,20 +169,18 @@ func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
//
// CHECK: %[[MR8:.*]] = affine.min #[[$MIN_REST8]](%[[I]])
// CHECK: %[[D0:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
// CHECK: %[[MM4:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
// CHECK: %[[D1:.*]] = affine.apply #[[$DIV2]](%[[MM4]])
// Init tensor and pack.
// CHECK: %[[INIT_PACKED_A:.*]] = linalg.init_tensor [%[[D0]], %[[D1]], 2] : tensor<?x?x2xf32>
// CHECK: %[[PACKED_A:.*]] = scf.for %[[II:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_A]]) -> (tensor<?x?x2xf32>) {
// CHECK: %[[INIT_PACKED_A:.*]] = linalg.init_tensor [%[[D0]], 2, 2] : tensor<?x2x2xf32>
// CHECK: %[[CAST_INIT_PACKED_A:.*]] = tensor.cast %[[INIT_PACKED_A]] : tensor<?x2x2xf32> to tensor<?x?x2xf32>
// CHECK: %[[PACKED_A:.*]] = scf.for %[[II:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[CAST_INIT_PACKED_A]]) -> (tensor<?x?x2xf32>) {
// CHECK: scf.for %[[III:[0-9a-z]+]] =
// CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
//
// CHECK: %[[D0_2:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
// CHECK: %[[MM4_2:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
// CHECK: %[[D1_2:.*]] = affine.apply #[[$DIV2]](%[[MM4_2]])
// Init tensor and pack.
// CHECK: %[[INIT_PACKED_B:.*]] = linalg.init_tensor [%[[D0_2]], %[[D1_2]], 2] : tensor<?x?x2xf32>
// CHECK: %[[PACKED_B:.*]] = scf.for %[[II_2:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_B]]) -> (tensor<?x?x2xf32>) {
// CHECK: %[[INIT_PACKED_B:.*]] = linalg.init_tensor [%[[D0_2]], 2, 2] : tensor<?x2x2xf32>
// CHECK: %[[CAST_INIT_PACKED_B:.*]] = tensor.cast %[[INIT_PACKED_B]] : tensor<?x2x2xf32> to tensor<?x?x2xf32>
// CHECK: %[[PACKED_B:.*]] = scf.for %[[II_2:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[CAST_INIT_PACKED_B]]) -> (tensor<?x?x2xf32>) {
// CHECK: scf.for %[[III_2:[0-9a-z]+]] =
// CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
// Compute.