Files
clang-p2996/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
Christopher Bate ced2fc7819 [mlir][bufferization] Fix OneShotBufferize when defaultMemorySpaceFn is used (#91524)
As described in issue llvm/llvm-project#91518, a previous PR
llvm/llvm-project#78484 introduced the `defaultMemorySpaceFn` into
bufferization options, allowing one to inform OneShotBufferize that it
should use a specified function to derive the memory space attribute
from the encoding attribute attached to tensor types.

However, introducing this feature exposed unhandled edge cases,
examples of which are introduced by this change in the new test under

`test/Dialect/Bufferization/Transforms/one-shot-bufferize-encodings.mlir`.

Fixing the inconsistencies introduced by `defaultMemorySpaceFn` is
pretty simple. This change:

- Updates the `bufferization.to_memref` and `bufferization.to_tensor`
  operations to explicitly include operand and destination types,
  whereas previously they relied on type inference to deduce the
  tensor types. Since the type inference cannot recover the correct
  tensor encoding/memory space, the operand and result types must be
  explicitly included. This is a small assembly format change, but it
  touches a large number of test files.

- Makes minor updates to other bufferization functions to handle the
  changes in building the above ops.

- Updates bufferization of `tensor.from_elements` to handle memory
  space.


Integration/upgrade guide:

In downstream projects, if you have tests or MLIR files that explicitly
use
`bufferization.to_tensor` or `bufferization.to_memref`, then update
them to the new assembly format as follows:

```
%1 = bufferization.to_memref %0 : memref<10xf32>
%2 = bufferization.to_tensor %1 : memref<10xf32>
```

becomes

```
%1 = bufferization.to_memref %0 : tensor<10xf32> to memref<10xf32>
%2 = bufferization.to_tensor %0 : memref<10xf32> to tensor<10xf32> 
```
2024-11-26 09:45:57 -07:00

444 lines
19 KiB
MLIR

// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries" -drop-equivalent-buffer-results -split-input-file | FileCheck %s
// Run fuzzer with different seeds.
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
// Test bufferization using memref types that have no layout map.
// RUN: mlir-opt %s -one-shot-bufferize="unknown-type-conversion=identity-layout-map bufferize-function-boundaries" -split-input-file -o /dev/null
// CHECK-LABEL: func @insert_slice_fun
// CHECK-SAME: %[[A0:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>,
// CHECK-SAME: %[[A1:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>,
// CHECK-SAME: %[[t0:[a-zA-Z0-9]*]]: memref<4xf32, strided<[?], offset: ?>>,
// CHECK-SAME: %[[t1:[a-zA-Z0-9]*]]: memref<4xf32, strided<[?], offset: ?>>
func.func @insert_slice_fun(
%A0 : tensor<?xf32> {bufferization.writable = false},
%A1 : tensor<?xf32> {bufferization.writable = true},
%t0 : tensor<4xf32> {bufferization.writable = false},
%t1 : tensor<4xf32> {bufferization.writable = true})
-> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
{
// Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
// CHECK: %[[REALLOC3:.*]] = memref.alloc
// CHECK: memref.copy %[[A0]], %[[REALLOC3]]
// CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC3]]
// CHECK: memref.copy %[[t0]], %[[SV_A0]]
%r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
// Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
// CHECK: %[[REALLOC2:.*]] = memref.alloc
// CHECK: memref.copy %[[A0]]
// CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC2]]
// CHECK: memref.copy %[[t1]], %[[SV_A0_2]]
%r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
// Still alloc the large tensor because %A1 is read after. Copy the tensor.extract_slice.
// CHECK: %[[REALLOC1:.*]] = memref.alloc
// CHECK: memref.copy %[[A1]]
// CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC1]]
// CHECK: memref.copy %[[t0]], %[[SV_A1]]
%r2 = tensor.insert_slice %t0 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
// Do not realloc the large tensor. Copy the tensor.extract_slice.
// CHECK-NOT: alloc
// CHECK: %[[SV_A1_2:.*]] = memref.subview %[[A1]]
// CHECK: memref.copy %[[t1]], %[[SV_A1_2]]
%r3 = tensor.insert_slice %t1 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
// CHECK: return %[[REALLOC3]], %[[REALLOC2]], %[[REALLOC1]] :
// CHECK-SAME: memref<?xf32>, memref<?xf32>, memref<?xf32>
return %r0, %r1, %r2, %r3: tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>
}
// -----
// CHECK-LABEL: func @insert_slice_fun
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>
// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, strided<[?], offset: ?>>
func.func @insert_slice_fun(
%A : tensor<?xf32> {bufferization.writable = true},
%t : tensor<4xf32> {bufferization.writable = false})
-> tensor<?xf32>
{
%f0 = arith.constant 0.0 : f32
// CHECK-NOT: alloc
// CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
// CHECK: memref.copy %[[t]], %[[SV_A]]
%r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
/// Overwrite A inplace.
// CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[A]]
%r1 = linalg.fill ins(%f0 : f32) outs(%r0 : tensor<?xf32>) -> tensor<?xf32>
// CHECK: return
// CHECK-NOT: tensor
return %r1: tensor<?xf32>
}
// -----
// CHECK-LABEL: func @insert_slice_fun
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>
// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, strided<[?], offset: ?>>
func.func @insert_slice_fun(
%A : tensor<?xf32> {bufferization.writable = true},
%t : tensor<4xf32> {bufferization.writable = false})
-> tensor<?xf32>
{
%f0 = arith.constant 0.0 : f32
// CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[A]]
%r0 = linalg.fill ins(%f0 : f32) outs(%A : tensor<?xf32>) -> tensor<?xf32>
// CHECK-NOT: alloc
// CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
/// Overwrite A inplace by copying into the subview.
// CHECK: memref.copy %[[t]], %[[SV_A]]
%r1 = tensor.insert_slice %t into %r0[0][4][1] : tensor<4xf32> into tensor<?xf32>
// CHECK: return
// CHECK-NOT: tensor
return %r1: tensor<?xf32>
}
// -----
// CHECK-LABEL: func @insert_slice_fun_not_inplace
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>
// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, strided<[?], offset: ?>>
func.func @insert_slice_fun_not_inplace(
%A : tensor<?xf32> {bufferization.writable = false},
%t : tensor<4xf32> {bufferization.writable = false})
-> tensor<?xf32>
{
// CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) {alignment = 64 : i64} : memref<?xf32>
// CHECK: memref.copy %[[A]], %[[ALLOC]] : memref<?xf32{{.*}} to memref<?xf32>
// CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32, strided<[1]>>
// CHECK: memref.copy %[[t]], %[[SV]] : memref<4xf32, strided{{.*}}> to memref<4xf32, strided<[1]>>
%r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
// CHECK: return %{{.*}} : memref<?xf32>
return %r0: tensor<?xf32>
}
// -----
// This test case could bufferize in-place with a better analysis. However, it
// is simpler to let the canonicalizer fold away the tensor.insert_slice.
// CHECK-LABEL: func @tensor_cast_not_in_place(
// CHECK-SAME: %[[A:.*]]: memref<?xf32{{.*}}>, %[[B:.*]]: memref<?xf32{{.*}}>
// CHECK: %[[casted:.*]] = memref.cast %[[A]] : memref<?xf32, strided<[?], offset: ?>> to memref<4xf32, strided<[?], offset: ?>>
// CHECK: %[[alloc:.*]] = memref.alloc
// CHECK: memref.copy %[[casted]], %[[alloc]]
// CHECK: %[[subview:.*]] = memref.subview %[[A]][{{.*}}] [4] [1] : {{.*}} to memref<4xf32
// CHECK: memref.copy %[[alloc]], %[[subview]]
func.func @tensor_cast_not_in_place(
%A : tensor<?xf32> {bufferization.writable = true},
%B : tensor<?xf32> {bufferization.writable = false}, %idx: index)
-> (tensor<?xf32>)
{
%r0 = tensor.cast %A : tensor<?xf32> to tensor<4xf32>
%r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
return %r1 : tensor<?xf32>
}
// -----
// CHECK-LABEL: func @insert_op
// CHECK-SAME: %[[t1:.*]]: memref<?xf32, {{.*}}>, %[[s:.*]]: f32, %[[i:.*]]: index
func.func @insert_op(%t1 : tensor<?xf32> {bufferization.writable = true},
%s : f32, %i : index) -> tensor<?xf32> {
// CHECK: memref.store %[[s]], %[[t1]][%[[i]]]
%0 = tensor.insert %s into %t1[%i] : tensor<?xf32>
// CHECK: return
return %0 : tensor<?xf32>
}
// -----
// A regression test to make sure that we handle rank-reducing extract_slice
// correctly.
// CHECK-LABEL: func @rank_reducing
func.func @rank_reducing(
%i: index, %j: index,
%arg0: tensor<8x18x32xf32>)
-> tensor<?x1x6x8xf32> {
%c1 = arith.constant 1 : index
%c6 = arith.constant 6 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = bufferization.alloc_tensor() : tensor<4x1x6x8xf32>
%1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor<?x1x6x8xf32>
%2 = bufferization.alloc_tensor() : tensor<1x6x8xf32>
%5 = scf.for %arg7 = %c0 to %c32 step %c8 iter_args(%arg8 = %1) -> (tensor<?x1x6x8xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg7)
%8 = tensor.extract_slice %arg0[%i, %j, %arg7] [1, 6, 8] [1, 1, 1] : tensor<8x18x32xf32> to tensor<1x6x8xf32>
%9 = scf.for %arg9 = %c0 to %c6 step %c1 iter_args(%arg10 = %2) -> (tensor<1x6x8xf32>) {
%11 = tensor.extract_slice %8[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x6x8xf32> to tensor<1x1x8xf32>
%12 = tensor.insert_slice %11 into %arg10[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x1x8xf32> into tensor<1x6x8xf32>
scf.yield %12 : tensor<1x6x8xf32>
}
%10 = tensor.insert_slice %9 into %arg8[%7, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] : tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
scf.yield %10 : tensor<?x1x6x8xf32>
}
return %5: tensor<?x1x6x8xf32>
}
// -----
// CHECK-LABEL: func.func @rank_reducing_parallel_insert_slice
func.func @rank_reducing_parallel_insert_slice(%in: tensor<100xf32>, %out: tensor<200x100xf32>) {
%c1 = arith.constant 1 : index
%num_threads = arith.constant 100 : index
// CHECK: scf.forall {{.*}} {
%result = scf.forall (%thread_idx) in (%num_threads) shared_outs (%o = %out) -> tensor<200x100xf32> {
%1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
scf.forall.in_parallel {
// CHECK: memref.subview %{{.*}}[%{{.*}}] [1] [1] : memref<100xf32, strided<[?], offset: ?>> to memref<1xf32, strided<[?], offset: ?>>
// CHECK: memref.subview %{{.*}}[1, %{{.*}}] [1, 1] [1, 1] : memref<200x100xf32, strided<[?, ?], offset: ?>> to memref<1xf32, strided<[?], offset: ?>>
tensor.parallel_insert_slice %1 into %o[1, %thread_idx][1, 1][1, 1] :
tensor<1xf32> into tensor<200x100xf32>
}
}
// CHECK: }
return
}
// -----
// CHECK-LABEL: func.func @parallel_insert_full_slice_in_place
// CHECK-NOT: memref.alloc()
func.func @parallel_insert_full_slice_in_place(%2: tensor<2xf32>) -> tensor<2xf32> {
%cst = arith.constant 0.000000e+00 : f32
%3 = scf.forall (%arg0) in (1) shared_outs(%arg2 = %2) -> (tensor<2xf32>) {
%fill = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<2xf32>) -> tensor<2xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %fill into %arg2[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
return %3 : tensor<2xf32>
}
// -----
// This test case could bufferize in-place with a better analysis. However, it
// is simpler to let the canonicalizer fold away the tensor.insert_slice.
// CHECK-LABEL: func @insert_equivalent_tensor
func.func @insert_equivalent_tensor(%t: tensor<10xf32>) -> tensor<10xf32> {
// CHECK: memref.alloc
%cst = arith.constant 4.200000e+01 : f32
// CHECK: linalg.fill
%0 = linalg.fill ins(%cst : f32) outs(%t : tensor<10xf32>) -> tensor<10xf32>
// CHECK: memref.copy
%1 = tensor.insert_slice %0 into %t[0][10][1] : tensor<10xf32> into tensor<10xf32>
return %1 : tensor<10xf32>
}
// -----
// CHECK-LABEL: func @pad_memory_space(
// CHECK-SAME: %[[t:.*]]: memref<?xf32, strided<[?], offset: ?>>
func.func @pad_memory_space(%t: tensor<?xf32>, %h1: index, %f: f32, %pos: index) -> f32
{
// CHECK: %[[alloc_tensor:.*]] = memref.alloc{{.*}} : memref<?xf32, 3>
// CHECK: memref.copy %[[t]], %[[alloc_tensor]]
%0 = bufferization.alloc_tensor() copy(%t)
{memory_space = 3 : i64} : tensor<?xf32>
// CHECK: %[[padded_alloc:.*]] = memref.alloc() {{.*}} : memref<15xf32, 3>
// CHECK: linalg.map
// CHECK: outs(%[[padded_alloc]] : memref<15xf32, 3>)
// CHECK: linalg.yield %{{.*}}
// CHECK: }
// CHECK: %[[subview:.*]] = memref.subview {{.*}} : memref<15xf32, 3> to memref<?xf32, strided<[1], offset: 2>, 3>
// CHECK: memref.copy %[[alloc_tensor]], %[[subview]]
%1 = tensor.pad %0 low[2] high[%h1] {
^bb0(%arg0: index):
tensor.yield %f : f32
} : tensor<?xf32> to tensor<15xf32>
// CHECK: memref.load {{.*}} : memref<15xf32, 3>
%2 = tensor.extract %1[%pos] : tensor<15xf32>
return %2 : f32
}
// -----
// CHECK-LABEL: func @insert_slice_regression(
// CHECK-SAME: %[[t:.*]]: memref<10xf32,{{.*}}>, %[[b:.*]]: memref<5xf32
func.func @insert_slice_regression(%t: tensor<10xf32>, %b: tensor<5xf32>) -> tensor<10xf32> {
%cst = arith.constant 0.0 : f32
%c0 = arith.constant 0 : index
// CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<10xf32>
// CHECK: linalg.fill {{.*}} outs(%[[alloc]] : memref<10xf32>)
%1 = linalg.fill ins(%cst : f32) outs(%t : tensor<10xf32>) -> tensor<10xf32>
// Read %1 so that it does not DCE away.
%vec = vector.transfer_read %1[%c0], %cst : tensor<10xf32>, vector<10xf32>
vector.print %vec : vector<10xf32>
// Write back a different value (not %1).
// CHECK: %[[subview:.*]] = memref.subview %[[t]][0] [5] [1]
// CHECK: memref.copy %[[b]], %[[subview]]
%2 = tensor.insert_slice %b into %t[0][5][1] : tensor<5xf32> into tensor<10xf32>
return %2 : tensor<10xf32>
}
// -----
// CHECK-LABEL: func @insert_slice_full_overwrite(
// CHECK-SAME: %[[t:.*]]: memref<10xf32,{{.*}}>, %[[b:.*]]: memref<10xf32,{{.*}}>
func.func @insert_slice_full_overwrite(%t: tensor<10xf32>, %b: tensor<10xf32>) -> tensor<10xf32> {
%cst = arith.constant 0.0 : f32
%c0 = arith.constant 0 : index
// CHECK: linalg.fill {{.*}} outs(%[[t]] : memref<10xf32,{{.*}}>)
%1 = linalg.fill ins(%cst : f32) outs(%t : tensor<10xf32>) -> tensor<10xf32>
// Read %1 so that it does not DCE away.
%vec = vector.transfer_read %1[%c0], %cst : tensor<10xf32>, vector<10xf32>
vector.print %vec : vector<10xf32>
// Write back a different value (not %1).
// CHECK: %[[subview:.*]] = memref.subview %[[t]][0] [10] [1]
// CHECK: memref.copy %[[b]], %[[subview]]
%2 = tensor.insert_slice %b into %t[0][10][1] : tensor<10xf32> into tensor<10xf32>
return %2 : tensor<10xf32>
}
// -----
// CHECK-LABEL: func @dim_not_reading(
// CHECK-SAME: %[[t:.*]]: memref<?xf32
func.func @dim_not_reading(%t: tensor<?xf32>, %f: f32, %pos: index)
-> (tensor<?xf32>, index)
{
%c0 = arith.constant 0 : index
// CHECK-NOT: memref.alloc
// CHECK-NOT: memref.copy
// CHECK: memref.store %{{.*}}, %[[t]]
%0 = tensor.insert %f into %t[%pos] : tensor<?xf32>
// CHECK: memref.dim %[[t]]
%1 = tensor.dim %t, %c0 : tensor<?xf32>
return %0, %1 : tensor<?xf32>, index
}
// -----
// CHECK: #[[$map:.*]] = affine_map<(d0) -> (d0 + 5)>
// CHECK-LABEL: func.func @cast_retains_buffer_layout(
// CHECK-SAME: %[[t:.*]]: memref<?xf32, #[[$map]]>, %[[sz:.*]]: index) -> memref<?xf32, strided<[1], offset: 7>> {
// CHECK: %[[casted:.*]] = memref.cast %[[t]] : memref<?xf32, #[[$map]]> to memref<10xf32, #[[$map]]>
// CHECK: %[[slice:.*]] = memref.subview %[[casted]][2] [%[[sz]]] [1] : memref<10xf32, #[[$map]]> to memref<?xf32, strided<[1], offset: 7>>
// CHECK: return %[[slice]]
func.func @cast_retains_buffer_layout(
%t: tensor<?xf32>
{bufferization.buffer_layout = affine_map<(d0) -> (d0 + 5)>},
%sz: index)
-> (tensor<10xf32>, tensor<?xf32>)
{
%casted = tensor.cast %t : tensor<?xf32> to tensor<10xf32>
%slice = tensor.extract_slice %casted[2][%sz][1] : tensor<10xf32> to tensor<?xf32>
// Note: The %casted return type is folded away because both buffers are
// equivalent. Therefore, we currently loose some static type information
// in the caller.
return %casted, %slice : tensor<10xf32>, tensor<?xf32>
}
// -----
// CHECK-LABEL: func.func @parallel_insert_slice_source_out_of_place
func.func @parallel_insert_slice_source_out_of_place(%in: tensor<1xf32>, %out: tensor<100xf32>, %f: f32) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%num_threads = arith.constant 50 : index
// CHECK: scf.forall {{.*}} {
%result = scf.forall (%thread_idx) in (%num_threads) shared_outs (%o = %out) -> tensor<100xf32> {
// The tensor.insert must bufferize out-of-place.
// CHECK: memref.alloc
// CHECK: memref.store
%insert = tensor.insert %f into %in[%c0] : tensor<1xf32>
%r = tensor.extract %in[%c0] : tensor<1xf32>
vector.print %r : f32
// CHECK: memref.copy
scf.forall.in_parallel {
tensor.parallel_insert_slice %insert into %o[%thread_idx][1][1] :
tensor<1xf32> into tensor<100xf32>
}
}
// CHECK: }
return
}
// -----
// CHECK-LABEL: func @tensor.reshape(
func.func @tensor.reshape() -> tensor<2x2x5xf32> {
// CHECK-DAG: %[[M1:.*]] = memref.cast %{{.*}} : memref<2x10xf32> to memref<?x10xf32>
%t1_static = arith.constant dense<0.> : tensor<2x10xf32>
%t1 = tensor.cast %t1_static : tensor<2x10xf32> to tensor<?x10xf32>
// CHECK: %[[SHAPE:.*]] = memref.get_global @{{.*}} : memref<3xi64>
%shape = arith.constant dense<[2, 2, 5]> : tensor<3xi64>
// CHECK: %[[RESHAPED:.*]] = memref.reshape %[[M1]](%[[SHAPE]]) : (memref<?x10xf32>, memref<3xi64>) -> memref<2x2x5xf32>
%reshaped = tensor.reshape %t1(%shape) : (tensor<?x10xf32>, tensor<3xi64>) -> tensor<2x2x5xf32>
// CHECK: return %[[RESHAPED]]
return %reshaped : tensor<2x2x5xf32>
}
// -----
// CHECK-LABEL: @reshape_with_non_identity_layout(
// CHECK-SAME: %[[INPUT:[a-zA-Z0-9]*]]: memref<2x2xf32, strided<[?, ?], offset: ?>, 3>,
// CHECK-SAME: %[[LAYOUT:[a-zA-Z0-9]*]]: memref<2xi32, strided<[?], offset: ?>>,
func.func @reshape_with_non_identity_layout(%arg0: memref<2x2xf32, strided<[?, ?], offset: ?>, 3>, %arg1: tensor<2xi32>, %idx: index) -> f32 {
%t = bufferization.to_tensor %arg0 restrict : memref<2x2xf32, strided<[?, ?], offset: ?>, 3> to tensor<2x2xf32>
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[INPUT]][1, 0] [1, 2] [1, 1] : memref<2x2xf32, strided<[?, ?], offset: ?>, 3> to memref<2xf32, strided<[?], offset: ?>, 3>
%extracted_slice = tensor.extract_slice %t[1, 0] [1, 2] [1, 1] : tensor<2x2xf32> to tensor<2xf32>
// To satisify the constraints of memref.reshape, the subview must be
// reallocated a buffer with an identity layout.
// CHECK: %[[ALLOC:.+]] = memref.alloc() {{.*}} : memref<2xf32, 3>
// CHECK: memref.copy %[[SUBVIEW]], %[[ALLOC]]
// CHECK: %[[RESHAPED:.+]] = memref.reshape %[[ALLOC]](%[[LAYOUT]]) : (memref<2xf32, 3>, memref<2xi32, strided<[?], offset: ?>>) -> memref<1x2xf32, 3>
%reshape = tensor.reshape %extracted_slice(%arg1) : (tensor<2xf32>, tensor<2xi32>) -> tensor<1x2xf32>
%r = tensor.extract %reshape[%idx, %idx] : tensor<1x2xf32>
return %r : f32
}
// -----
// CHECK-LABEL: func @collapse_shape_regression(
// CHECK-SAME: %[[t:.*]]: memref<10x20xf32,
func.func @collapse_shape_regression(
%t: tensor<10x20xf32>, %f: f32, %idx: index) {
// CHECK: %[[subview:.*]] = memref.subview %[[t]]
%0 = tensor.extract_slice %t [2, 3] [5, 6] [1, 1]
: tensor<10x20xf32> to tensor<5x6xf32>
// Insert a copy because the original %0 is read later.
// CHECK: %[[alloc1:.*]] = memref.alloc() {{.*}} : memref<5x6xf32>
// CHECK: memref.copy %[[subview]], %[[alloc1]]
// CHECK: memref.store {{.*}}, %[[alloc1]]
tensor.insert %f into %0[%idx, %idx] : tensor<5x6xf32>
// Insert a copy because the shape cannot be collapsed.
// CHECK: %[[alloc2:.*]] = memref.alloc() {{.*}} : memref<5x6xf32>
// CHECK: memref.copy %[[subview]], %[[alloc2]]
// CHECK: memref.collapse_shape %[[alloc2]]
tensor.collapse_shape %0[[0, 1]] : tensor<5x6xf32> into tensor<30xf32>
return
}