// RUN: mlir-opt %s -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -one-shot-bufferize="bufferize-function-boundaries" -cse -canonicalize -split-input-file | FileCheck %s
// RUN: mlir-opt %s -eliminate-empty-tensors | FileCheck %s --check-prefix=CHECK-ELIM

// CHECK: func @buffer_forwarding_conflict(
// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
func.func @buffer_forwarding_conflict(
    %t: tensor<?xf32> {bufferization.buffer_layout = affine_map<(d0) -> (d0)>, bufferization.writable = true},
    %sz: index) -> (tensor<?xf32>, tensor<?xf32>) {
  %f0 = arith.constant 0.0 : f32
  // CHECK: %[[EXTRACT_SLICE_ALLOC:.*]] = memref.alloc(%[[sz]])
  // CHECK: linalg.fill ins({{.*}} : f32) outs(%[[EXTRACT_SLICE_ALLOC]] : memref<?xf32>)
  // An alloc is needed for the **first** insert_slice (due to the backward
  // traversal during analysis).
  // CHECK: %[[DIM:.*]] = memref.dim %[[FUNC_ARG]]
  // This allocates the whole dim to allow for a full clone of %t.
  // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM]])
  // tensor.empty itself does not allocate but forwards to the **second**
  // insert_slice. The pass replaces the tensor.empty with an out-of-place
  // extract_slice.
  %a = tensor.empty(%sz) : tensor<?xf32>
  %f = linalg.fill ins(%f0 : f32) outs(%a : tensor<?xf32>) -> tensor<?xf32>
  // CHECK: memref.copy %[[FUNC_ARG]], %[[ALLOC]] : memref<?xf32> to memref<?xf32>
  // CHECK: %[[SV0_ALLOC:.*]] = memref.subview %[[ALLOC]][0] [%[[sz]]] [1] : memref<?xf32> to memref<?xf32, strided<[1]>>
  // CHECK: memref.copy %[[EXTRACT_SLICE_ALLOC]], %[[SV0_ALLOC]] : memref<?xf32> to memref<?xf32, strided<[1]>>
  %r0 = tensor.insert_slice %f into %t[0][%sz][1] : tensor<?xf32> into tensor<?xf32>
  // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
  // CHECK: memref.copy %[[EXTRACT_SLICE_ALLOC]], %[[T_SUBVIEW]]
  %r1 = tensor.insert_slice %f into %t[42][%sz][1] : tensor<?xf32> into tensor<?xf32>

  return %r0, %r1 : tensor<?xf32>, tensor<?xf32>
}

// -----

// CHECK: func @buffer_forwarding_no_conflict(
// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
func.func @buffer_forwarding_no_conflict(
    %t: tensor<?xf32> {bufferization.buffer_layout = affine_map<(d0) -> (d0)>, bufferization.writable = true},
    %sz: index) -> (tensor<?xf32>) {
  %f0 = arith.constant 0.0 : f32

  // tensor.empty itself does not allocate but forwards to the insert_slice.
  // EmptyTensorOpElimination replaces the tensor.empty with an in-place
  // extract_slice.
  // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
  %a = tensor.empty(%sz) : tensor<?xf32>
  // CHECK: linalg.fill ins({{.*}} : f32) outs(%[[T_SUBVIEW]] : memref<?xf32, strided<[1], offset: 42>>)
  %f = linalg.fill ins(%f0 : f32) outs(%a : tensor<?xf32>) -> tensor<?xf32>

  // The self-copy canonicalizes away later.
  %r1 = tensor.insert_slice %f into %t[42][%sz][1] : tensor<?xf32> into tensor<?xf32>

  return %r1 : tensor<?xf32>
}
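
// A rough tensor-level sketch (illustrative only; not part of the checked
// output) of what the elimination step alone is expected to produce for the
// no-conflict case above, assuming the pass rewrites the tensor.empty to an
// extract_slice of the insert_slice destination as described in the comments:
//
//   %a = tensor.extract_slice %t[42] [%sz] [1] : tensor<?xf32> to tensor<?xf32>
//   %f = linalg.fill ins(%f0 : f32) outs(%a : tensor<?xf32>) -> tensor<?xf32>
//   %r1 = tensor.insert_slice %f into %t[42] [%sz] [1] : tensor<?xf32> into tensor<?xf32>
//
// The fill then already writes into %t's buffer, so one-shot-bufferize needs
// neither an allocation nor a copy (the remaining self-copy cancels out).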
// -----

// CHECK: func @insertion_point_inside_loop(
// CHECK-SAME: %[[t:.*]]: memref<?xf32, strided<[?], offset: ?>>, %[[sz:.*]]: index)
func.func @insertion_point_inside_loop(%t : tensor<?xf32>, %sz : index) -> (tensor<?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c5 = arith.constant 5 : index

  // CHECK-NOT: memref.alloc
  %blank = tensor.empty() : tensor<5xf32>

  // CHECK: scf.for %[[iv:.*]] = %{{.*}} to %[[sz]] step %{{.*}} {
  %r = scf.for %iv = %c0 to %sz step %c5 iter_args(%bb = %t) -> (tensor<?xf32>) {
    // CHECK: %[[subview:.*]] = memref.subview %[[t]][%[[iv]]] [5] [1]
    %iv_i32 = arith.index_cast %iv : index to i32
    %f = arith.sitofp %iv_i32 : i32 to f32

    // CHECK: linalg.fill ins(%{{.*}}{{.*}}outs(%[[subview]]
    %filled = linalg.fill ins(%f : f32) outs(%blank : tensor<5xf32>) -> tensor<5xf32>

    // CHECK-NOT: memref.copy
    %inserted = tensor.insert_slice %filled into %bb[%iv][5][1] : tensor<5xf32> into tensor<?xf32>
    scf.yield %inserted : tensor<?xf32>
  }

  return %r : tensor<?xf32>
}

// -----

// CHECK: func @insertion_point_outside_loop(
// CHECK-SAME: %[[t:.*]]: memref<?xf32, strided<[?], offset: ?>>, %[[sz:.*]]: index, %[[idx:.*]]: index)
func.func @insertion_point_outside_loop(%t : tensor<?xf32>, %sz : index, %idx : index) -> (tensor<?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c5 = arith.constant 5 : index

  // CHECK-NOT: memref.alloc
  %blank = tensor.empty() : tensor<5xf32>

  // CHECK: scf.for %[[iv:.*]] = %{{.*}} to %[[sz]] step %{{.*}} {
  %r = scf.for %iv = %c0 to %sz step %c5 iter_args(%bb = %t) -> (tensor<?xf32>) {
    %iv_i32 = arith.index_cast %iv : index to i32
    %f = arith.sitofp %iv_i32 : i32 to f32

    // CHECK: %[[subview:.*]] = memref.subview %[[t]][%[[idx]]] [5] [1]
    // CHECK: linalg.fill ins(%{{.*}}{{.*}}outs(%[[subview]]
    %filled = linalg.fill ins(%f : f32) outs(%blank : tensor<5xf32>) -> tensor<5xf32>

    // CHECK-NOT: memref.copy
    %inserted = tensor.insert_slice %filled into %bb[%idx][5][1] : tensor<5xf32> into tensor<?xf32>
    scf.yield %inserted : tensor<?xf32>
  }

  return %r : tensor<?xf32>
}
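
// For the two loop tests above, a sketch (illustrative only; not checked) of
// the expected IR after empty tensor elimination: the replacement
// extract_slice can only be inserted at a point where all of its operands are
// defined, which here is inside the loop body, once %bb and %iv (resp. %idx)
// are available:
//
//   %r = scf.for %iv = %c0 to %sz step %c5 iter_args(%bb = %t) -> (tensor<?xf32>) {
//     %slice = tensor.extract_slice %bb[%iv] [5] [1] : tensor<?xf32> to tensor<5xf32>
//     %filled = linalg.fill ins(%f : f32) outs(%slice : tensor<5xf32>) -> tensor<5xf32>
//     %inserted = tensor.insert_slice %filled into %bb[%iv] [5] [1] : tensor<5xf32> into tensor<?xf32>
//     scf.yield %inserted : tensor<?xf32>
//   }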
// -----

// EmptyTensorElimination currently does not apply to chains where the type
// changes. (Casts are supported.) This test just ensures that we do not
// crash and do not generate IR that fails to verify.
// CHECK-LABEL: func @shape_mismatch
func.func @shape_mismatch(%t: tensor<5x6x128xf32>) -> tensor<5x6x128xf32> {
  %cst = arith.constant 8.0 : f32
  %0 = tensor.empty() : tensor<128xf32>
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<128xf32>) -> tensor<128xf32>
  %2 = tensor.expand_shape %1 [[0, 1, 2]] output_shape [1, 1, 128]
      : tensor<128xf32> into tensor<1x1x128xf32>
  %3 = tensor.insert_slice %2 into %t[2, 3, 0][1, 1, 128][1, 1, 1]
      : tensor<1x1x128xf32> into tensor<5x6x128xf32>
  return %3 : tensor<5x6x128xf32>
}

// -----

// CHECK-LABEL: func @cast(
// CHECK-SAME: %[[t:.*]]: memref<256xf32,
// CHECK: %[[sv:.*]] = memref.subview %[[t]]
// CHECK: linalg.fill {{.*}} outs(%[[sv]]
// CHECK: return %[[t]]
func.func @cast(%t: tensor<256xf32>) -> tensor<256xf32> {
  %cst = arith.constant 8.0 : f32
  %c128 = arith.constant 128 : index
  %0 = tensor.empty(%c128) : tensor<?xf32>
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?xf32>) -> tensor<?xf32>
  %2 = tensor.cast %1 : tensor<?xf32> to tensor<128xf32>
  %3 = tensor.insert_slice %2 into %t[2][128][1] : tensor<128xf32> into tensor<256xf32>
  return %3 : tensor<256xf32>
}

// -----

// CHECK: func @parallel_insert_slice(
// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
func.func @parallel_insert_slice(
    %t: tensor<?xf32> {bufferization.buffer_layout = affine_map<(d0) -> (d0)>, bufferization.writable = true},
    %sz: index) -> (tensor<?xf32>) {
  %f0 = arith.constant 0.0 : f32
  %c512 = arith.constant 512 : index

  %r1 = scf.forall (%iv) in (%c512) shared_outs(%o = %t) -> (tensor<?xf32>) {
    // tensor.empty itself does not allocate but forwards to the insert_slice.
    // EmptyTensorOpElimination replaces the tensor.empty with an in-place
    // extract_slice.
    // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
    %a = tensor.empty(%sz) : tensor<?xf32>
    // CHECK: linalg.fill ins({{.*}} : f32) outs(%[[T_SUBVIEW]] : memref<?xf32, strided<[1], offset: 42>>)
    %f = linalg.fill ins(%f0 : f32) outs(%a : tensor<?xf32>) -> tensor<?xf32>

    // The self-copy canonicalizes away later.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %f into %o[42][%sz][1] : tensor<?xf32> into tensor<?xf32>
    }
  }
  return %r1 : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @eliminate_multiple_ops(
// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
func.func @eliminate_multiple_ops(
    %t: tensor<?xf32> {bufferization.buffer_layout = affine_map<(d0) -> (d0)>},
    %sz: index, %c: i1) -> (tensor<?xf32>) {
  %cst1 = arith.constant 0.0 : f32
  %cst2 = arith.constant 1.0 : f32

  // CHECK: %[[r:.*]] = scf.if %{{.*}} -> (memref
  %if = scf.if %c -> tensor<?xf32> {
    // CHECK: %[[T_SUBVIEW_1:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
    %a1 = tensor.empty(%sz) : tensor<?xf32>
    // CHECK: linalg.fill ins({{.*}} : f32) outs(%[[T_SUBVIEW_1]] : memref<?xf32, strided<[1], offset: 42>>)
    %f1 = linalg.fill ins(%cst1 : f32) outs(%a1 : tensor<?xf32>) -> tensor<?xf32>
    // CHECK: scf.yield %[[T_SUBVIEW_1]]
    scf.yield %f1 : tensor<?xf32>
  } else {
    // CHECK: %[[T_SUBVIEW_2:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
    %a2 = tensor.empty(%sz) : tensor<?xf32>
    // CHECK: linalg.fill ins({{.*}} : f32) outs(%[[T_SUBVIEW_2]] : memref<?xf32, strided<[1], offset: 42>>)
    %f2 = linalg.fill ins(%cst2 : f32) outs(%a2 : tensor<?xf32>) -> tensor<?xf32>
    // CHECK: scf.yield %[[T_SUBVIEW_2]]
    scf.yield %f2 : tensor<?xf32>
  }

  // The self-copy could canonicalize away later.
  // CHECK: %[[T_SUBVIEW_3:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
  // CHECK: memref.copy %[[r]], %[[T_SUBVIEW_3]]
  %r1 = tensor.insert_slice %if into %t[42][%sz][1] : tensor<?xf32> into tensor<?xf32>
  return %r1 : tensor<?xf32>
}
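
// Sketch (illustrative only; not checked) of the tensor-level IR after
// elimination for the scf.if case above: each branch receives its own
// extract_slice of the destination, so both fills become in-place and only
// the final insert_slice copy remains, matching the subviews and the single
// memref.copy in the expected output:
//
//   %if = scf.if %c -> (tensor<?xf32>) {
//     %a1 = tensor.extract_slice %t[42] [%sz] [1] : tensor<?xf32> to tensor<?xf32>
//     %f1 = linalg.fill ins(%cst1 : f32) outs(%a1 : tensor<?xf32>) -> tensor<?xf32>
//     scf.yield %f1 : tensor<?xf32>
//   } else {
//     %a2 = tensor.extract_slice %t[42] [%sz] [1] : tensor<?xf32> to tensor<?xf32>
//     %f2 = linalg.fill ins(%cst2 : f32) outs(%a2 : tensor<?xf32>) -> tensor<?xf32>
//     scf.yield %f2 : tensor<?xf32>
//   }
//   %r1 = tensor.insert_slice %if into %t[42] [%sz] [1] : tensor<?xf32> into tensor<?xf32>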
// -----

// This is a regression test. Make sure that the tensor.extract_slice is not
// eliminated.
// CHECK-LABEL: func.func @regression_do_not_eliminate_non_empty(
// CHECK: memref.subview
// CHECK: memref.subview
// CHECK: memref.copy
func.func @regression_do_not_eliminate_non_empty(
    %t: tensor<10xf32>, %t2: tensor<10xf32>) -> tensor<10xf32> {
  %1 = tensor.extract_slice %t[0] [5] [1] : tensor<10xf32> to tensor<5xf32>
  %2 = tensor.insert_slice %1 into %t2[1] [5] [1] : tensor<5xf32> into tensor<10xf32>
  return %2 : tensor<10xf32>
}

// -----

// This is a regression test. Make sure that there is no crash.
// CHECK-LABEL: func.func @regression_insert_of_bbarg(
func.func @regression_insert_of_bbarg(%t0: tensor<5xf32>, %t1: tensor<10xf32>) -> tensor<10xf32> {
  %0 = tensor.insert_slice %t0 into %t1[2] [5] [1] : tensor<5xf32> into tensor<10xf32>
  return %0 : tensor<10xf32>
}

// -----

// This is a regression test. Make sure that there is no crash.
// CHECK-LABEL: func.func @regression_eliminate_equivalent_only(
func.func @regression_eliminate_equivalent_only(%sz: index, %p: index, %t0: tensor<?x?xi8>) -> tensor<?x?xi8> {
  %c0 = arith.constant 0 : index
  %c8 = arith.constant 8 : index
  %c16 = arith.constant 16 : index
  %27 = tensor.empty(%sz) : tensor<?x8xi32>
  %extracted_slice = tensor.extract_slice %27[0, 0] [%p, 8] [1, 1] : tensor<?x8xi32> to tensor<?x8xi32>
  %28 = scf.for %arg4 = %c0 to %c16 step %c8 iter_args(%arg5 = %t0) -> (tensor<?x?xi8>) {
    %inserted_slice = tensor.insert_slice %extracted_slice into %27[0, 0] [%sz, 8] [1, 1] : tensor<?x8xi32> into tensor<?x8xi32>
    %extracted_slice_2 = tensor.extract_slice %arg5[%p, %p] [%sz, 8] [1, 1] : tensor<?x?xi8> to tensor<?x8xi8>
    %32 = linalg.generic
        {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
         iterator_types = ["parallel", "parallel"]}
        ins(%inserted_slice : tensor<?x8xi32>) outs(%extracted_slice_2 : tensor<?x8xi8>) {
    ^bb0(%in: i32, %out: i8):
      %tr = arith.trunci %in : i32 to i8
      linalg.yield %tr : i8
    } -> tensor<?x8xi8>
    %inserted_slice_3 = tensor.insert_slice %32 into %arg5[%p, %arg4] [%sz, 8] [1, 1] : tensor<?x8xi8> into tensor<?x?xi8>
    scf.yield %inserted_slice_3 : tensor<?x?xi8>
  }
  func.return %28 : tensor<?x?xi8>
}

// -----

// CHECK-LABEL: func.func @regression_multiple_insertion_points(
// CHECK-NOT: memref.alloc
func.func @regression_multiple_insertion_points(%t1: tensor<?x?xf32>) -> tensor<?x?xf32> {
  %empty = tensor.empty() : tensor<2x5xf32>
  %f0 = arith.constant 5.5 : f32
  %0 = "test.foo"() : () -> (index)
  %1 = "test.bar"() : () -> (index)
  %filled = linalg.fill ins(%f0 : f32) outs(%empty : tensor<2x5xf32>) -> tensor<2x5xf32>
  %2 = tensor.insert_slice %filled into %t1[%0, %1] [2, 5] [1, 1] : tensor<2x5xf32> into tensor<?x?xf32>
  return %2 : tensor<?x?xf32>
}

// -----

// CHECK-LABEL: func @materialize_in_destination(
// CHECK-SAME: %[[m:.*]]: memref<5xf32, strided<[?], offset: ?>>,
// CHECK: linalg.fill {{.*}} outs(%[[m]]
// CHECK: return %[[m]]
func.func @materialize_in_destination(%t: tensor<5xf32>, %f: f32) -> tensor<5xf32> {
  %0 = tensor.empty() : tensor<5xf32>
  %filled = linalg.fill ins(%f : f32) outs(%0 : tensor<5xf32>) -> tensor<5xf32>
  %1 = bufferization.materialize_in_destination %filled in %t
      : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xf32>
  return %1 : tensor<5xf32>
}

// -----

// CHECK-LABEL: func @materialize_in_destination_buffer(
// CHECK-SAME: %[[m:.*]]: memref<5xf32>,
// CHECK-NEXT: linalg.fill {{.*}} outs(%[[m]]
// CHECK-NEXT: return
func.func @materialize_in_destination_buffer(%m: memref<5xf32>, %f: f32) {
  %0 = tensor.empty() : tensor<5xf32>
  %filled = linalg.fill ins(%f : f32) outs(%0 : tensor<5xf32>) -> tensor<5xf32>
  bufferization.materialize_in_destination %filled in restrict writable %m
      : (tensor<5xf32>, memref<5xf32>) -> ()
  return
}
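
// Illustrative sketch (not checked here) of the elimination step for the
// memref-destination case above: the tensor.empty is replaced with a
// bufferization.to_tensor of the destination buffer, the same pattern that is
// verified for @multiple_materialize_in_destination_buffer at the end of this
// file:
//
//   %0 = bufferization.to_tensor %m restrict writable : memref<5xf32>
//   %filled = linalg.fill ins(%f : f32) outs(%0 : tensor<5xf32>) -> tensor<5xf32>
//   bufferization.materialize_in_destination %filled in writable %m
//       : (tensor<5xf32>, memref<5xf32>) -> ()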
// -----

// CHECK-LABEL: func @linalg_copy(
// CHECK-SAME: %[[m:.*]]: memref<5xf32, strided<[?], offset: ?>>,
// CHECK: linalg.fill {{.*}} outs(%[[m]]
// CHECK: return %[[m]]
func.func @linalg_copy(%t: tensor<5xf32>, %f: f32) -> tensor<5xf32> {
  %0 = tensor.empty() : tensor<5xf32>
  %filled = linalg.fill ins(%f : f32) outs(%0 : tensor<5xf32>) -> tensor<5xf32>
  %1 = linalg.copy ins(%filled : tensor<5xf32>) outs(%t : tensor<5xf32>) -> tensor<5xf32>
  return %1 : tensor<5xf32>
}

// -----

// CHECK-LABEL: func @linalg_copy_empty(
// CHECK: %[[ret:.*]] = memref.alloc()
// CHECK-NEXT: return %[[ret]]
func.func @linalg_copy_empty() -> tensor<26xi32> {
  %0 = tensor.empty() : tensor<26xi32>
  %1 = linalg.copy ins(%0 : tensor<26xi32>) outs(%0 : tensor<26xi32>) -> tensor<26xi32>
  return %1 : tensor<26xi32>
}

// -----

// CHECK-ELIM-LABEL: func @multiple_materialize_in_destination_buffer(
// CHECK-ELIM-SAME: %[[m:.*]]: memref<5xf32>
// CHECK-ELIM: tensor.empty
// CHECK-ELIM: bufferization.to_tensor %[[m]] restrict writable
// CHECK-ELIM: bufferization.materialize_in_destination {{.*}} in writable %[[m]]
func.func @multiple_materialize_in_destination_buffer(%m: memref<5xf32>, %f: f32, %f2: f32, %c: i1) {
  %0 = tensor.empty() : tensor<5xf32>
  %filled = linalg.fill ins(%f : f32) outs(%0 : tensor<5xf32>) -> tensor<5xf32>

  %1 = tensor.empty() : tensor<5xf32>
  %filled2 = linalg.fill ins(%f2 : f32) outs(%1 : tensor<5xf32>) -> tensor<5xf32>

  %selected = scf.if %c -> tensor<5xf32> {
    scf.yield %filled : tensor<5xf32>
  } else {
    scf.yield %filled2 : tensor<5xf32>
  }

  bufferization.materialize_in_destination %selected in restrict writable %m
      : (tensor<5xf32>, memref<5xf32>) -> ()
  return
}
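
// Note: the elimination-only expectations in the test above correspond to the
// second RUN invocation at the top of this file; they can be reproduced in
// isolation with:
//
//   mlir-opt %s -eliminate-empty-tensors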