This commit makes reductions part of the terminator. Instead of
`scf.yield`, `scf.reduce` now terminates the body of `scf.parallel` ops.
`scf.reduce` may contain an arbitrary number of reductions, with one
region per reduction.
Example:
```mlir
%init = arith.constant 0.0 : f32
%r:2 = scf.parallel (%iv) = (%lb) to (%ub) step (%step) init (%init, %init)
    -> f32, f32 {
  %elem_to_reduce1 = memref.load %buffer1[%iv] : memref<100xf32>
  %elem_to_reduce2 = memref.load %buffer2[%iv] : memref<100xf32>
  // `scf.reduce` is the terminator of the loop body. It holds one region per
  // reduction: here an additive one followed by a multiplicative one.
  scf.reduce(%elem_to_reduce1, %elem_to_reduce2 : f32, f32) {
    ^bb0(%lhs : f32, %rhs: f32):
      %res = arith.addf %lhs, %rhs : f32
      scf.reduce.return %res : f32
  }, {
    ^bb0(%lhs : f32, %rhs: f32):
      %res = arith.mulf %lhs, %rhs : f32
      scf.reduce.return %res : f32
  }
}
```
`scf.reduce` operations can no longer be interleaved with other ops in
the body of `scf.parallel`. This simplifies the op and makes it possible
to assign the `RecursiveMemoryEffects` trait to `scf.reduce`. (This was
not possible before because the op was not a terminator, causing the op
to be DCE'd.)
53 lines
1.8 KiB
MLIR
53 lines
1.8 KiB
MLIR
// RUN: mlir-opt -verify-diagnostics -ownership-based-buffer-deallocation \
// RUN:   -buffer-deallocation-simplification -split-input-file %s | FileCheck %s

// Checks where the buffer deallocation pipeline above places
// `bufferization.dealloc` ops for allocations made inside nested regions.

// One buffer is allocated before the `scf.forall` loop and one inside its
// body. The checks below expect the inner buffer to be deallocated inside the
// loop body and the outer buffer to be deallocated after the loop, in both
// cases without any retained values.
func.func @parallel_insert_slice(%arg0: index) {
  %c0 = arith.constant 0 : index
  %alloc = memref.alloc() : memref<2xf32>
  scf.forall (%arg1) in (%arg0) {
    %alloc0 = memref.alloc() : memref<2xf32>
    %0 = memref.load %alloc[%c0] : memref<2xf32>
    linalg.fill ins(%0 : f32) outs(%alloc0 : memref<2xf32>)
  }
  return
}

// CHECK-LABEL: func @parallel_insert_slice
//  CHECK-SAME: (%arg0: index)
//       CHECK: [[ALLOC0:%.+]] = memref.alloc(
//       CHECK: scf.forall
//       CHECK:   [[ALLOC1:%.+]] = memref.alloc(
//       CHECK:   bufferization.dealloc ([[ALLOC1]] : memref<2xf32>) if (%true
//   CHECK-NOT: retain
//       CHECK: }
//       CHECK: bufferization.dealloc ([[ALLOC0]] : memref<2xf32>) if (%true
//   CHECK-NOT: retain

// -----

// A temporary buffer is allocated inside the reduction region of `scf.reduce`,
// which terminates the `scf.parallel` body. The checks below expect that
// buffer to be deallocated inside the same region, ahead of
// `scf.reduce.return`.
func.func @reduce(%buffer: memref<100xf32>) {
  %init = arith.constant 0.0 : f32
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  scf.parallel (%iv) = (%c0) to (%c1) step (%c1) init (%init) -> f32 {
    %elem_to_reduce = memref.load %buffer[%iv] : memref<100xf32>
    scf.reduce(%elem_to_reduce : f32) {
      ^bb0(%lhs : f32, %rhs: f32):
        %alloc = memref.alloc() : memref<2xf32>
        memref.store %lhs, %alloc [%c0] : memref<2xf32>
        memref.store %rhs, %alloc [%c1] : memref<2xf32>
        %0 = memref.load %alloc[%c0] : memref<2xf32>
        %1 = memref.load %alloc[%c1] : memref<2xf32>
        %res = arith.addf %0, %1 : f32
        scf.reduce.return %res : f32
    }
  }
  func.return
}

// CHECK-LABEL: func @reduce
//       CHECK:   scf.reduce
//       CHECK:     [[ALLOC:%.+]] = memref.alloc(
//       CHECK:     bufferization.dealloc ([[ALLOC]] :{{.*}}) if (%true
//       CHECK:     scf.reduce.return