[mlir][transform] Plumb a simplified form of AffineMin folding into transform.pad-tiling-interface (#145170)

This revision introduces a simple variant of AffineMin folding in
makeComposedFoldedAffineApply and makes use of it in
transform.pad-tiling-interface. Since this version explicitly calls
ValueBoundsOpInterface, it may be too expensive and is therefore only
activated behind a flag.
It results in better foldings when mixing tiling and padding, including
with dynamic shapes.
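
To illustrate the intended effect, here is a minimal hand-written sketch (not taken from this patch's tests; %dim and the tile/pad size 16 are illustrative). The scf.for bounds are what let ValueBoundsOpInterface prove 0 < %sz, which gates the new folding:

```mlir
func.func @padded_tile_size(%dim: index) {
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  scf.for %iv = %c0 to %dim step %c16 {
    // Dynamic tile size produced by tiling: min(remaining iterations, 16).
    %sz = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 16)>(%iv)[%dim]
    // pad-tiling-interface builds the padded size (%sz ceildiv 16) * 16 through
    // makeComposedFoldedAffineApply. With composeAffineMin=true, composing
    // through the affine.min rewrites `s0 ceildiv 16` to 1, so the padded size
    // folds to the constant 16 and the pad result type becomes static.
    %padded = affine.apply affine_map<()[s0] -> ((s0 ceildiv 16) * 16)>()[%sz]
  }
  return
}
```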
Nicolas Vasilache
2025-06-23 12:23:24 +02:00
committed by GitHub
parent 02d2a1646a
commit d9a99afbfc
4 changed files with 273 additions and 49 deletions


@@ -410,9 +410,11 @@ void canonicalizeSetAndOperands(IntegerSet *set,
/// other AffineApplyOps supplying those operands. The operands of the resulting
/// AffineApplyOp do not change the length of AffineApplyOp chains.
AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map,
ArrayRef<OpFoldResult> operands);
ArrayRef<OpFoldResult> operands,
bool composeAffineMin = false);
AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineExpr e,
ArrayRef<OpFoldResult> operands);
ArrayRef<OpFoldResult> operands,
bool composeAffineMin = false);
/// Constructs an AffineApplyOp that applies `map` to `operands` after composing
/// the map with the maps of any other AffineApplyOp supplying the operands,
@@ -421,16 +423,19 @@ AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineExpr e,
/// map.
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
AffineMap map,
ArrayRef<OpFoldResult> operands);
ArrayRef<OpFoldResult> operands,
bool composeAffineMin = false);
/// Variant of `makeComposedFoldedAffineApply` that applies to an expression.
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
AffineExpr expr,
ArrayRef<OpFoldResult> operands);
ArrayRef<OpFoldResult> operands,
bool composeAffineMin = false);
/// Variant of `makeComposedFoldedAffineApply` suitable for multi-result maps.
/// Note that this may create as many affine.apply operations as the map has
/// results given that affine.apply must be single-result.
SmallVector<OpFoldResult> makeComposedFoldedMultiResultAffineApply(
OpBuilder &b, Location loc, AffineMap map, ArrayRef<OpFoldResult> operands);
OpBuilder &b, Location loc, AffineMap map, ArrayRef<OpFoldResult> operands,
bool composeAffineMin = false);
/// Returns an AffineMinOp obtained by composing `map` and `operands` with
/// AffineApplyOps supplying those operands.
@@ -459,7 +464,8 @@ OpFoldResult makeComposedFoldedAffineMax(OpBuilder &b, Location loc,
/// terminal symbol, i.e., a symbol defined at the top level or a block/function
/// argument.
void fullyComposeAffineMapAndOperands(AffineMap *map,
SmallVectorImpl<Value> *operands);
SmallVectorImpl<Value> *operands,
bool composeAffineMin = false);
} // namespace affine
} // namespace mlir


@@ -11,12 +11,14 @@
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/UB/IR/UBOps.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineExprVisitor.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/IntegerSet.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/Value.h"
#include "mlir/Interfaces/ShapedOpInterfaces.h"
#include "mlir/Interfaces/ValueBoundsOpInterface.h"
#include "mlir/Transforms/InliningUtils.h"
@@ -26,7 +28,9 @@
#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/MathExtras.h"
#include <limits>
#include <numeric>
#include <optional>
@@ -1042,6 +1046,62 @@ simplifyMapWithOperands(AffineMap &map, ArrayRef<Value> operands) {
map.getContext());
}
/// Assuming `dimOrSym` is a quantity in `map` that is defined by `minOp`.
/// Assuming that the quantity is of the form:
/// `affine_min(f(x, y), symbolic_cst)`.
/// This function checks that `0 < affine_min(f(x, y), symbolic_cst)` and
/// proceeds with replacing the patterns:
/// ```
/// dimOrSym.ceildiv(symbolic_cst)
/// (dimOrSym + symbolic_cst - 1).floordiv(symbolic_cst)
/// ```
/// by `1`.
///
/// Additionally, allows the caller to pass `affineMinKnownToBeNonNegative` to
/// inject static information that may not be statically discoverable.
///
/// Warning: ValueBoundsConstraintSet::computeConstantBound is needed to check
/// for the nonnegative case, if `affineMinKnownToBeNonNegative` is false.
static LogicalResult replaceAffineMinBoundingBoxExpression(
AffineMinOp minOp, AffineExpr dimOrSym, AffineMap *map,
bool affineMinKnownToBeNonNegative = false) {
auto affineMinMap = minOp.getAffineMap();
if (!affineMinKnownToBeNonNegative) {
ValueRange values = minOp->getOperands();
for (unsigned i = 0, e = affineMinMap.getNumResults(); i < e; ++i) {
AffineMap row = affineMinMap.getSubMap(ArrayRef<unsigned>{i});
FailureOr<int64_t> lowerBound =
ValueBoundsConstraintSet::computeConstantBound(
presburger::BoundType::LB, {row, values},
/*stopCondition=*/nullptr,
/*closedUB=*/true);
if (failed(lowerBound) || lowerBound.value() <= 0)
return failure();
}
}
AffineMap initialMap = *map;
for (unsigned i = 0, e = affineMinMap.getNumResults(); i != e; ++i) {
auto m = affineMinMap.getSubMap(ArrayRef<unsigned>{i});
AffineExpr expr = m.getResult(0);
if (!expr.isSymbolicOrConstant())
continue;
DenseMap<AffineExpr, AffineExpr> repl;
// dimOrSym.ceilDiv(expr) -> 1
repl[dimOrSym.ceilDiv(expr)] = getAffineConstantExpr(1, minOp.getContext());
// (dimOrSym + expr - 1).floorDiv(expr) -> 1
repl[(dimOrSym + expr - 1).floorDiv(expr)] =
getAffineConstantExpr(1, minOp.getContext());
auto newMap = map->replace(repl);
if (newMap == *map)
continue;
*map = newMap;
}
return success(*map != initialMap);
}
/// Replace all occurrences of AffineExpr at position `pos` in `map` by the
/// defining AffineApplyOp expression and operands.
/// When `dimOrSymbolPosition < dims.size()`, AffineDimExpr@[pos] is replaced.
@@ -1052,10 +1112,13 @@ simplifyMapWithOperands(AffineMap &map, ArrayRef<Value> operands) {
/// 2. `map` dim and symbols are gradually shifted to higher positions.
/// 3. Old `dim` and `sym` entries are replaced by nullptr
/// This avoids the need for any bookkeeping.
/// If `replaceAffineMin` is set to true, additionally triggers more expensive
/// replacements involving affine_min operations.
static LogicalResult replaceDimOrSym(AffineMap *map,
unsigned dimOrSymbolPosition,
SmallVectorImpl<Value> &dims,
SmallVectorImpl<Value> &syms) {
SmallVectorImpl<Value> &syms,
bool replaceAffineMin) {
MLIRContext *ctx = map->getContext();
bool isDimReplacement = (dimOrSymbolPosition < dims.size());
unsigned pos = isDimReplacement ? dimOrSymbolPosition
@@ -1064,6 +1127,13 @@ static LogicalResult replaceDimOrSym(AffineMap *map,
if (!v)
return failure();
auto minOp = v.getDefiningOp<AffineMinOp>();
if (minOp && replaceAffineMin) {
AffineExpr dimOrSym = isDimReplacement ? getAffineDimExpr(pos, ctx)
: getAffineSymbolExpr(pos, ctx);
return replaceAffineMinBoundingBoxExpression(minOp, dimOrSym, map);
}
auto affineApply = v.getDefiningOp<AffineApplyOp>();
if (!affineApply)
return failure();
@@ -1101,7 +1171,8 @@ static LogicalResult replaceDimOrSym(AffineMap *map,
/// iteratively. Perform canonicalization of map and operands as well as
/// AffineMap simplification. `map` and `operands` are mutated in place.
static void composeAffineMapAndOperands(AffineMap *map,
SmallVectorImpl<Value> *operands) {
SmallVectorImpl<Value> *operands,
bool composeAffineMin = false) {
if (map->getNumResults() == 0) {
canonicalizeMapAndOperands(map, operands);
*map = simplifyAffineMap(*map);
@@ -1122,7 +1193,8 @@ static void composeAffineMapAndOperands(AffineMap *map,
while (true) {
bool changed = false;
for (unsigned pos = 0; pos != dims.size() + syms.size(); ++pos)
if ((changed |= succeeded(replaceDimOrSym(map, pos, dims, syms))))
if ((changed |=
succeeded(replaceDimOrSym(map, pos, dims, syms, composeAffineMin))))
break;
if (!changed)
break;
@@ -1163,38 +1235,41 @@ static void composeAffineMapAndOperands(AffineMap *map,
}
void mlir::affine::fullyComposeAffineMapAndOperands(
AffineMap *map, SmallVectorImpl<Value> *operands) {
AffineMap *map, SmallVectorImpl<Value> *operands, bool composeAffineMin) {
while (llvm::any_of(*operands, [](Value v) {
return isa_and_nonnull<AffineApplyOp>(v.getDefiningOp());
})) {
composeAffineMapAndOperands(map, operands);
composeAffineMapAndOperands(map, operands, composeAffineMin);
}
}
AffineApplyOp
mlir::affine::makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map,
ArrayRef<OpFoldResult> operands) {
ArrayRef<OpFoldResult> operands,
bool composeAffineMin) {
SmallVector<Value> valueOperands;
map = foldAttributesIntoMap(b, map, operands, valueOperands);
composeAffineMapAndOperands(&map, &valueOperands);
composeAffineMapAndOperands(&map, &valueOperands, composeAffineMin);
assert(map);
return b.create<AffineApplyOp>(loc, map, valueOperands);
}
AffineApplyOp
mlir::affine::makeComposedAffineApply(OpBuilder &b, Location loc, AffineExpr e,
ArrayRef<OpFoldResult> operands) {
ArrayRef<OpFoldResult> operands,
bool composeAffineMin) {
return makeComposedAffineApply(
b, loc,
AffineMap::inferFromExprList(ArrayRef<AffineExpr>{e}, b.getContext())
.front(),
operands);
operands, composeAffineMin);
}
/// Composes the given affine map with the given list of operands, pulling in
/// the maps from any affine.apply operations that supply the operands.
static void composeMultiResultAffineMap(AffineMap &map,
SmallVectorImpl<Value> &operands) {
SmallVectorImpl<Value> &operands,
bool composeAffineMin = false) {
// Compose and canonicalize each expression in the map individually because
// composition only applies to single-result maps, collecting potentially
// duplicate operands in a single list with shifted dimensions and symbols.
@@ -1203,7 +1278,8 @@ static void composeMultiResultAffineMap(AffineMap &map,
for (unsigned i : llvm::seq<unsigned>(0, map.getNumResults())) {
SmallVector<Value> submapOperands(operands.begin(), operands.end());
AffineMap submap = map.getSubMap({i});
fullyComposeAffineMapAndOperands(&submap, &submapOperands);
fullyComposeAffineMapAndOperands(&submap, &submapOperands,
composeAffineMin);
canonicalizeMapAndOperands(&submap, &submapOperands);
unsigned numNewDims = submap.getNumDims();
submap = submap.shiftDims(dims.size()).shiftSymbols(symbols.size());
@@ -1221,10 +1297,9 @@ static void composeMultiResultAffineMap(AffineMap &map,
canonicalizeMapAndOperands(&map, &operands);
}
OpFoldResult
mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
AffineMap map,
ArrayRef<OpFoldResult> operands) {
OpFoldResult mlir::affine::makeComposedFoldedAffineApply(
OpBuilder &b, Location loc, AffineMap map, ArrayRef<OpFoldResult> operands,
bool composeAffineMin) {
assert(map.getNumResults() == 1 && "building affine.apply with !=1 result");
// Create new builder without a listener, so that no notification is
@@ -1236,7 +1311,7 @@ mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
// Create op.
AffineApplyOp applyOp =
makeComposedAffineApply(newBuilder, loc, map, operands);
makeComposedAffineApply(newBuilder, loc, map, operands, composeAffineMin);
// Get constant operands.
SmallVector<Attribute> constOperands(applyOp->getNumOperands());
@@ -1256,26 +1331,25 @@ mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
return llvm::getSingleElement(foldResults);
}
OpFoldResult
mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
AffineExpr expr,
ArrayRef<OpFoldResult> operands) {
OpFoldResult mlir::affine::makeComposedFoldedAffineApply(
OpBuilder &b, Location loc, AffineExpr expr,
ArrayRef<OpFoldResult> operands, bool composeAffineMin) {
return makeComposedFoldedAffineApply(
b, loc,
AffineMap::inferFromExprList(ArrayRef<AffineExpr>{expr}, b.getContext())
.front(),
operands);
operands, composeAffineMin);
}
SmallVector<OpFoldResult>
mlir::affine::makeComposedFoldedMultiResultAffineApply(
OpBuilder &b, Location loc, AffineMap map,
ArrayRef<OpFoldResult> operands) {
return llvm::map_to_vector(llvm::seq<unsigned>(0, map.getNumResults()),
[&](unsigned i) {
return makeComposedFoldedAffineApply(
b, loc, map.getSubMap({i}), operands);
});
OpBuilder &b, Location loc, AffineMap map, ArrayRef<OpFoldResult> operands,
bool composeAffineMin) {
return llvm::map_to_vector(
llvm::seq<unsigned>(0, map.getNumResults()), [&](unsigned i) {
return makeComposedFoldedAffineApply(b, loc, map.getSubMap({i}),
operands, composeAffineMin);
});
}
template <typename OpTy>
@@ -3024,7 +3098,8 @@ void AffineIfOp::build(OpBuilder &builder, OperationState &result,
/// `set` by composing the maps of such affine.apply ops with the integer
/// set constraints.
static void composeSetAndOperands(IntegerSet &set,
SmallVectorImpl<Value> &operands) {
SmallVectorImpl<Value> &operands,
bool composeAffineMin = false) {
// We will simply reuse the API of the map composition by viewing the LHSs of
// the equalities and inequalities of `set` as the affine exprs of an affine
// map. Convert to equivalent map, compose, and convert back to set.
@@ -3035,7 +3110,7 @@ static void composeSetAndOperands(IntegerSet &set,
[](Value v) { return v.getDefiningOp<AffineApplyOp>(); }))
return;
composeAffineMapAndOperands(&map, &operands);
composeAffineMapAndOperands(&map, &operands, composeAffineMin);
set = IntegerSet::get(map.getNumDims(), map.getNumSymbols(), map.getResults(),
set.getEqFlags());
}
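
For reference, an IR-level sketch of the two patterns that replaceAffineMinBoundingBoxExpression targets (hand-written; `tensor.dim` is used here only because its non-negativity lets the positivity precondition succeed without extra context):

```mlir
func.func @bounding_box_patterns(%t: tensor<?xf32>) -> (index, index) {
  %c0 = arith.constant 0 : index
  // tensor.dim is non-negative, so both affine.min results (s0 + 1 and 32)
  // have a provably positive lower bound.
  %d = tensor.dim %t, %c0 : tensor<?xf32>
  %sz = affine.min affine_map<()[s0] -> (s0 + 1, 32)>()[%d]
  // If the applies below are built via makeComposedFoldedAffineApply with
  // composeAffineMin=true, composing through the affine.min replaces
  //   s0 ceildiv 32             -> 1
  //   (s0 + 32 - 1) floordiv 32 -> 1
  // so both results fold to the constant 1.
  %a = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%sz]
  %b = affine.apply affine_map<()[s0] -> ((s0 + 31) floordiv 32)>()[%sz]
  return %a, %b : index, index
}
```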


@@ -84,7 +84,7 @@ SmallVector<OpFoldResult> linalg::computePaddedShape(
getDimsToSize(rewriter, indexingSizes, options);
// For each dimension in the operand's shape, iterate over indexingSizes and
// add
// add the various term contributions.
for (const auto &enResults : enumerate(indexingMap.getResults())) {
int64_t resultIndex = enResults.index();
AffineMap partialIndexingMap = indexingMap.getSubMap(
@@ -122,7 +122,8 @@ SmallVector<OpFoldResult> linalg::computePaddedShape(
AffineMap composedMap = projectedMap.compose(ceilMap);
OpFoldResult paddingDimOfr = affine::makeComposedFoldedAffineApply(
rewriter, loc, composedMap,
{indexingSizes[paddingDim], paddingSize});
{indexingSizes[paddingDim], paddingSize},
/*composeAffineMin=*/true);
terms.push_back(paddingDimOfr);
} else {
// Otherwise just set to paddingSize.
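
As a rough sketch of how these per-dimension terms behave (hypothetical values, not from the patch): a size that is an arbitrary dynamic SSA value keeps its symbolic ceildiv term, whereas a size defined by a suitable affine.min can fold to the multiple itself once composeAffineMin is enabled:

```mlir
// Hypothetical padded-shape terms for padding_sizes [8, 16] on a matmul LHS.
func.func @padded_shape_terms(%size_m: index, %size_k: index) -> (index, index) {
  // %size_m is an arbitrary dynamic value: the term stays (s0 ceildiv 8) * 8.
  %m_padded = affine.apply affine_map<()[s0] -> ((s0 ceildiv 8) * 8)>()[%size_m]
  // If %size_k were defined by `affine.min(..., 16)` with a provably positive
  // lower bound, composeAffineMin=true would fold this term to the constant 16.
  %k_padded = affine.apply affine_map<()[s0] -> ((s0 ceildiv 16) * 16)>()[%size_k]
  return %m_padded, %k_padded : index, index
}
```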


@@ -19,7 +19,7 @@ func.func @pad_lhs(
// CHECK: : tensor<?x25xf32> to tensor<?x25xf32>
// CHECK: linalg.matmul ins(%{{.*}}, %{{.*}} : tensor<?x12xf32>, tensor<12x25xf32>) outs(%{{.*}} : tensor<?x25xf32>) -> tensor<?x25xf32>
// CHECK: tensor.extract_slice %{{.*}}[0, 0] [%{{.*}}, 25] [1, 1]
// CHECK: : tensor<?x25xf32> to tensor<?x25xf32>
// CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0] [%{{.*}}, 25] [1, 1]
@@ -29,8 +29,8 @@ func.func @pad_lhs(
}
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
%matmul = transform.structured.match ops{["linalg.matmul"]} in %module_op
: (!transform.any_op) -> !transform.any_op
// Tile to 5 then pad to 8 (supposedly to better hit vector ops).
@@ -71,13 +71,13 @@ module {
return %0 : tensor<7x11x12xf32>
}
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %module_op : (!transform.any_op) -> !transform.any_op
%padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [3, 5] pad_to_multiple_of {
padding_dimensions = [0, 2],
padding_dimensions = [0, 2],
padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.yield
transform.yield
}
}
}
@@ -126,13 +126,155 @@ module {
return %0 : tensor<?x11x?xf32>
}
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %module_op : (!transform.any_op) -> !transform.any_op
%padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [3, 5] pad_to_multiple_of {
padding_dimensions = [0, 2],
padding_dimensions = [0, 2],
padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.yield
transform.yield
}
}
}
// -----
// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0] -> (-s0 + (s0 ceildiv 16) * 16)>
// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0, s1] -> (-s1 + (s0 ceildiv 16) * 16)>
// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> ((s0 ceildiv 16) * 16)>
// CHECK-LABEL: pad_lhs
func.func @pad_lhs(
%arg0: tensor<24x?xf32>, %arg1: tensor<?x25xf32>, %arg2: tensor<24x25xf32>)
-> tensor<24x25xf32>
{
// CHECK: %[[D0_0:.*]] = tensor.dim
// CHECK: %[[H0:.*]] = affine.apply #[[$MAP0]]()[%[[D0_0]]]
// CHECK: tensor.pad %{{.*}} low[0, 0] high[0, %[[H0]]]
// CHECK: : tensor<24x?xf32> to tensor<24x?xf32>
// CHECK: %[[D0_2:.*]] = tensor.dim
// CHECK: %[[H1:.*]] = affine.apply #[[$MAP1]]()[%[[D0_0]], %[[D0_2]]]
// CHECK: tensor.pad %{{.*}} low[0, 0] high[%[[H1]], 0]
// CHECK: : tensor<?x25xf32> to tensor<?x25xf32>
// CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>)
// CHECK: linalg.matmul ins(%{{.*}}, %{{.*}}: tensor<8x16xf32>, tensor<16x25xf32>) outs(%{{.*}} : tensor<8x25xf32>) -> tensor<8x25xf32>
// CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0] [8, 25] [1, 1]
// CHECK-SAME: : tensor<8x25xf32> into tensor<24x25xf32>
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x?xf32>, tensor<?x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
func.return %0 : tensor<24x25xf32>
}
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
%matmul = transform.structured.match ops{["linalg.matmul"]} in %module_op
: (!transform.any_op) -> !transform.any_op
// Pad then tile should produce static shapes.
%matmul_padded, %_ = transform.structured.pad_tiling_interface %matmul to padding_sizes [8, 16] pad_to_multiple_of {
padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
padding_dimensions=[0, 2]
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%m, %l0, %l1 = transform.structured.tile_using_for %matmul_padded tile_sizes [8, 0, 16]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
%func = transform.structured.match ops{["func.func"]} in %module_op
: (!transform.any_op) -> !transform.any_op
%func2 = transform.apply_registered_pass "resolve-shaped-type-result-dims" to %func
: (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func2 {
transform.apply_patterns.canonicalization
} {apply_cse} : !transform.any_op
%minmax = transform.structured.match ops{["affine.min", "affine.max"]} in %module_op
: (!transform.any_op) -> !transform.any_op
transform.affine.simplify_min_max_affine_ops %minmax : !transform.any_op
transform.apply_patterns to %func2 {
transform.apply_patterns.canonicalization
} {apply_cse} : !transform.any_op
transform.yield
}
}
// -----
// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0) -> (-d0 + 16)>
// CHECK-LABEL: pad_lhs
func.func @pad_lhs(
%arg0: tensor<24x?xf32>, %arg1: tensor<?x25xf32>, %arg2: tensor<24x25xf32>)
-> tensor<24x25xf32>
{
// CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>)
// CHECK: %[[MIN:.*]] = affine.min #[[$MAP0]](%{{.*}})
// CHECK: %[[H0:.*]] = affine.apply #[[$MAP1]](%[[MIN]])
// CHECK: tensor.pad %{{.*}} low[0, 0] high[0, %[[H0]]]
// CHECK: : tensor<8x?xf32> to tensor<8x16xf32>
// CHECK: %[[H1:.*]] = affine.apply #[[$MAP1]](%[[MIN]])
// CHECK: tensor.pad %{{.*}} low[0, 0] high[%[[H1]], 0]
// CHECK: : tensor<?x25xf32> to tensor<16x25xf32>
// CHECK: linalg.matmul ins(%{{.*}}, %{{.*}} : tensor<8x16xf32>, tensor<16x25xf32>) outs(%{{.*}} : tensor<8x25xf32>) -> tensor<8x25xf32>
// CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0] [8, 25] [1, 1]
// CHECK-SAME: : tensor<8x25xf32> into tensor<24x25xf32>
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x?xf32>, tensor<?x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
func.return %0 : tensor<24x25xf32>
}
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
%matmul = transform.structured.match ops{["linalg.matmul"]} in %module_op
: (!transform.any_op) -> !transform.any_op
// Tile then pad should produce static shapes.
%m, %l0, %l1 = transform.structured.tile_using_for %matmul tile_sizes [8, 0, 16]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
%matmul_padded, %_ = transform.structured.pad_tiling_interface %m to padding_sizes [8, 16] pad_to_multiple_of {
padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
padding_dimensions=[0, 2]
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.yield
}
}
// -----
// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0) -> (-d0 + 20, 8)>
// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
// CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0) -> (-d0 + 8)>
// CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0) -> (-d0 + 16)>
// CHECK-LABEL: pad_lhs
func.func @pad_lhs(
%arg0: tensor<20x?xf32>, %arg1: tensor<?x25xf32>, %arg2: tensor<20x25xf32>)
-> tensor<20x25xf32>
{
// CHECK: linalg.matmul ins(%{{.*}}, %{{.*}} : tensor<8x16xf32>, tensor<16x25xf32>) outs(%{{.*}} : tensor<8x25xf32>) -> tensor<8x25xf32>
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<20x?xf32>, tensor<?x25xf32>) outs(%arg2 : tensor<20x25xf32>) -> tensor<20x25xf32>
func.return %0 : tensor<20x25xf32>
}
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
%matmul = transform.structured.match ops{["linalg.matmul"]} in %module_op
: (!transform.any_op) -> !transform.any_op
// Tile then pad should produce static shapes.
%m, %l0, %l1 = transform.structured.tile_using_for %matmul tile_sizes [8, 0, 16]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
%matmul_padded, %_ = transform.structured.pad_tiling_interface %m to padding_sizes [8, 16] pad_to_multiple_of {
padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
padding_dimensions=[0, 2]
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.yield
}
}