clang-p2996/mlir/test/Transforms/pipeline-data-transfer.mlir

// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-pipeline-data-transfer | FileCheck %s
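//
// The -affine-pipeline-data-transfer pass overlaps DMA transfers with
// computation by double buffering: each DMA buffer and tag memref gets a
// leading dimension of size two indexed by the loop IV mod 2, the first
// dma_start is peeled into a prologue, and the dma_wait (plus the compute
// that consumes the data) is shifted by one iteration into the steady state
// and a trailing epilogue. A rough hand-written sketch of the rewrite (not
// FileCheck-verified output; see the tests below for the exact form):
//
//   affine.for %i = 0 to %N {        =>  start DMA for iteration 0
//     start DMA for iteration %i         affine.for %i = 1 to %N {
//     wait on DMA for iteration %i         start DMA for iteration %i
//     compute on iteration %i              wait on DMA for iteration %i - 1
//   }                                      compute on iteration %i - 1
//                                        }
//                                        wait on DMA for iteration %N - 1
//                                        compute on iteration %N - 1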
// -----
// CHECK-DAG: [[$MOD_2:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 2)>
// CHECK-DAG: [[$MAP_MINUS_1:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 - 1)>
// CHECK-LABEL: func @loop_nest_dma() {
func.func @loop_nest_dma() {
%A = memref.alloc() : memref<256 x f32, affine_map<(d0) -> (d0)>, 0>
%Ah = memref.alloc() : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
%tag = memref.alloc() : memref<1 x f32>
%zero = arith.constant 0 : index
%num_elts = arith.constant 32 : index
affine.for %i = 0 to 8 {
affine.dma_start %A[%i], %Ah[%i], %tag[%zero], %num_elts : memref<256 x f32>, memref<32 x f32, 1>, memref<1 x f32>
affine.dma_wait %tag[%zero], %num_elts : memref<1 x f32>
%v = affine.load %Ah[%i] : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
%r = "compute"(%v) : (f32) -> (f32)
affine.store %r, %Ah[%i] : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
affine.for %j = 0 to 32 {
"do_more_compute"(%i, %j) : (index, index) -> ()
}
}
memref.dealloc %tag : memref<1 x f32>
memref.dealloc %Ah : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
return
}
// CHECK: %{{.*}} = memref.alloc() : memref<256xf32>
// CHECK: %{{.*}} = memref.alloc() : memref<2x32xf32, 1>
// CHECK-NEXT: %{{.*}} = memref.alloc() : memref<2x1xf32>
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}} mod 2, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
// CHECK-NEXT: affine.for %{{.*}} = 1 to 8 {
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}} mod 2, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
// CHECK-NEXT: affine.apply [[$MAP_MINUS_1]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MOD_2]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MOD_2]](%{{.*}})
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xf32>
// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
// CHECK-NEXT: "compute"(%{{.*}}) : (f32) -> f32
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT: "do_more_compute"(%{{.*}}, %{{.*}}) : (index, index) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: affine.apply [[$MAP_MINUS_1]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MOD_2]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MOD_2]](%{{.*}})
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xf32>
// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
// CHECK-NEXT: "compute"(%{{.*}}) : (f32) -> f32
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT: "do_more_compute"(%{{.*}}, %{{.*}}) : (index, index) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: memref.dealloc %{{.*}} : memref<2x1xf32>
// CHECK-NEXT: memref.dealloc %{{.*}} : memref<2x32xf32, 1>
// CHECK-NEXT: return
// CHECK-NEXT:}
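// A minimal sketch of the steady-state index arithmetic above, assuming the
// maps captured at the top of this test (illustrative only):
//
//   %prev = affine.apply affine_map<(d0) -> (d0 - 1)>(%i)      // previous iter
//   %buf  = affine.apply affine_map<(d0) -> (d0 mod 2)>(%prev) // buffer half
//   %tag  = affine.apply affine_map<(d0) -> (d0 mod 2)>(%prev) // tag slot
//
// so iteration %i waits on, loads from, and stores to the half filled during
// iteration %i - 1, while the DMA issued in iteration %i fills the other half.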
// -----
// CHECK-DAG: [[$FLOOR_MOD_2:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> ((d0 floordiv 4) mod 2)>
// CHECK-DAG: [[$REMAP_SHIFT_MINUS_4:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 - 4)>
// CHECK-LABEL: @loop_step
func.func @loop_step(%arg0: memref<512xf32>,
%arg1: memref<512xf32>) {
%c0 = arith.constant 0 : index
%c4 = arith.constant 4 : index
affine.for %i0 = 0 to 512 step 4 {
%1 = memref.alloc() : memref<4xf32, 1>
%2 = memref.alloc() : memref<1xi32>
affine.dma_start %arg0[%i0], %1[%c0], %2[%c0], %c4
: memref<512xf32>, memref<4xf32, 1>, memref<1xi32>
affine.dma_wait %2[%c0], %c4 : memref<1xi32>
"compute"(%i0) : (index) -> ()
memref.dealloc %2 : memref<1xi32>
memref.dealloc %1 : memref<4xf32, 1>
}
return
}
// CHECK: [[BUF:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x4xf32, 1>
// CHECK: [[TAG:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x1xi32>
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[(%{{.*}} floordiv 4) mod 2, 0], [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 4 to 512 step 4 {
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[(%{{.*}} floordiv 4) mod 2, 0], [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
// CHECK-NEXT: affine.apply [[$REMAP_SHIFT_MINUS_4]](%{{.*}})
// CHECK-NEXT: affine.apply [[$FLOOR_MOD_2]](%{{.*}})
// CHECK: affine.dma_wait [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<2x1xi32>
// CHECK-NEXT: "compute"(%{{.*}}) : (index) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: [[SHIFTED:%[0-9a-zA-Z_]+]] = affine.apply [[$REMAP_SHIFT_MINUS_4]](%{{.*}})
// CHECK-NEXT: %{{.*}} = affine.apply [[$FLOOR_MOD_2]]([[SHIFTED]])
// CHECK: affine.dma_wait [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<2x1xi32>
// CHECK-NEXT: "compute"(%{{.*}}) : (index) -> ()
// CHECK-NEXT: memref.dealloc [[TAG]] : memref<2x1xi32>
// CHECK-NEXT: memref.dealloc [[BUF]] : memref<2x4xf32, 1>
// CHECK-NEXT: return
// CHECK-NEXT: }
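// For a non-unit step, the buffer half alternates with the normalized
// iteration number rather than the raw IV. With step 4 (hand-computed from
// the maps captured above):
//
//   %i0 = 0 -> (0 floordiv 4) mod 2 = 0
//   %i0 = 4 -> (4 floordiv 4) mod 2 = 1
//   %i0 = 8 -> (8 floordiv 4) mod 2 = 0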
// -----
#map1 = affine_map<(d0, d1) -> ((d0 * 2048 + d1 * 256) floordiv 32)>
#map2 = affine_map<(d0) -> ((d0 * 2048) floordiv 32)>
// CHECK-LABEL: func @loop_dma_nested(%{{.*}}: memref<512x32xvector<8xf32>
func.func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>>, %arg1: memref<512x32xvector<8xf32>>, %arg2: memref<512x32xvector<8xf32>>) {
%num_elts = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = memref.alloc() : memref<64x4xvector<8xf32>, 2>
%1 = memref.alloc() : memref<64x4xvector<8xf32>, 2>
%2 = memref.alloc() : memref<64x4xvector<8xf32>, 2>
%3 = memref.alloc() : memref<2xi32>
%4 = memref.alloc() : memref<2xi32>
%5 = memref.alloc() : memref<2xi32>
// Prologue for DMA overlap on arg2.
// CHECK-DAG: [[BUF_ARG2:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x64x4xvector<8xf32>, 2>
// CHECK-DAG: [[TAG_ARG2:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x2xi32>
// CHECK: affine.dma_start %{{.*}}[
// CHECK: affine.for %{{.*}} = 1 to 8 {
affine.for %i0 = 0 to 8 {
%6 = affine.apply #map2(%i0)
affine.dma_start %arg2[%6, %c0], %2[%c0, %c0], %5[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
affine.dma_wait %5[%c0], %num_elts : memref<2xi32>
// Steady state for DMA overlap on arg2
// CHECK: affine.dma_start %{{.*}}[
// CHECK: affine.dma_wait [[TAG_ARG2]]
// Prologue for DMA overlap on arg0, arg1 nested within i0
// CHECK: [[BUF_ARG0:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x64x4xvector<8xf32>, 2>
// CHECK: [[BUF_ARG1:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x64x4xvector<8xf32>, 2>
// CHECK: [[TAG_ARG0:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x2xi32>
// CHECK: [[TAG_ARG1:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x2xi32>
// CHECK: affine.dma_start %{{.*}}[
// CHECK: affine.dma_start %{{.*}}[
// CHECK-NEXT: affine.for %{{.*}} = 1 to 8 {
affine.for %i1 = 0 to 8 {
%7 = affine.apply #map1(%i0, %i1)
%8 = affine.apply #map2(%i1)
affine.dma_start %arg0[%7, %c0], %0[%c0, %c0], %3[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
affine.dma_start %arg1[%8, %c0], %1[%c0, %c0], %4[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
affine.dma_wait %3[%c0], %num_elts : memref<2xi32>
affine.dma_wait %4[%c0], %num_elts : memref<2xi32>
// Steady state for DMA overlap on arg0, arg1
// CHECK: affine.dma_start %{{.*}}[
// CHECK: affine.dma_start %{{.*}}[
// CHECK: affine.dma_wait [[TAG_ARG0]]
// CHECK: affine.dma_wait [[TAG_ARG1]]
// CHECK-NEXT: affine.for %{{.*}} = 0 to 4 {
affine.for %i2 = 0 to 4 {
"foo"() : () -> ()
}
}
// Epilogue for DMA overlap on arg0, arg1.
// CHECK: affine.dma_wait [[TAG_ARG0]]
// CHECK: affine.dma_wait [[TAG_ARG1]]
// CHECK-DAG: memref.dealloc [[TAG_ARG1]] : memref<2x2xi32>
// CHECK-DAG: memref.dealloc [[TAG_ARG0]] : memref<2x2xi32>
// CHECK-DAG: memref.dealloc [[BUF_ARG1]] : memref<2x64x4xvector<8xf32>, 2>
// CHECK-DAG: memref.dealloc [[BUF_ARG0]] : memref<2x64x4xvector<8xf32>, 2>
// Epilogue for DMA overlap on %arg2.
// CHECK: affine.dma_wait [[TAG_ARG2]]
// Within the epilogue for arg2's DMA, the DMAs on %arg0 and %arg1 are nested.
// CHECK: [[BUF_ARG0_NESTED:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x64x4xvector<8xf32>, 2>
// CHECK: [[BUF_ARG1_NESTED:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x64x4xvector<8xf32>, 2>
// CHECK: [[TAG_ARG0_NESTED:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x2xi32>
// CHECK: [[TAG_ARG1_NESTED:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x2xi32>
// CHECK: affine.dma_start %{{.*}}[
// CHECK: affine.dma_start %{{.*}}[
// CHECK: affine.for %{{.*}} = 1 to 8 {
// CHECK: affine.dma_start %{{.*}}[
// CHECK: affine.dma_start %{{.*}}[
// CHECK: affine.dma_wait [[TAG_ARG0_NESTED]]
// CHECK: affine.dma_wait [[TAG_ARG1_NESTED]]
// CHECK: affine.for %{{.*}} = 0 to 4 {
// CHECK: "foo"() : () -> ()
// CHECK: affine.dma_wait [[TAG_ARG0_NESTED]]
// CHECK: affine.dma_wait [[TAG_ARG1_NESTED]]
// CHECK: affine.for %{{.*}} = 0 to 4 {
}
memref.dealloc %5 : memref<2xi32>
memref.dealloc %4 : memref<2xi32>
memref.dealloc %3 : memref<2xi32>
memref.dealloc %2 : memref<64x4xvector<8xf32>, 2>
memref.dealloc %1 : memref<64x4xvector<8xf32>, 2>
memref.dealloc %0 : memref<64x4xvector<8xf32>, 2>
return
// CHECK: }
// CHECK-DAG: memref.dealloc [[TAG_ARG1_NESTED]] : memref<2x2xi32>
// CHECK-DAG: memref.dealloc [[TAG_ARG0_NESTED]] : memref<2x2xi32>
// CHECK-DAG: memref.dealloc [[BUF_ARG1_NESTED]] : memref<2x64x4xvector<8xf32>, 2>
// CHECK-DAG: memref.dealloc [[BUF_ARG0_NESTED]] : memref<2x64x4xvector<8xf32>, 2>
// CHECK-DAG: memref.dealloc [[TAG_ARG2]] : memref<2x2xi32>
// CHECK-DAG: memref.dealloc [[BUF_ARG2]] : memref<2x64x4xvector<8xf32>, 2>
// CHECK-NEXT: return
}
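// The pass pipelines each affine.for independently, so the nest above ends up
// double buffered at both depths. Schematically (hand sketch, not verified
// output):
//
//   start DMA on %arg2                      // outer prologue
//   affine.for %i0 = 1 to 8 {
//     start DMA on %arg2; wait on %arg2     // outer steady state
//     start DMAs on %arg0, %arg1            // inner prologue
//     affine.for %i1 = 1 to 8 { ... }       // inner steady state
//     wait on %arg0, %arg1; compute         // inner epilogue
//   }
//   wait on %arg2; <pipelined inner loop>   // outer epilogue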
// -----
#map2 = affine_map<(d0) -> ((d0 * 2048) floordiv 32)>
// CHECK: func @loop_dma_dependent
func.func @loop_dma_dependent(%arg2: memref<512x32xvector<8xf32>>) {
%num_elts = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = memref.alloc() : memref<64x4xvector<8xf32>, 2>
%1 = memref.alloc() : memref<64x4xvector<8xf32>, 2>
%2 = memref.alloc() : memref<64x4xvector<8xf32>, 2>
%3 = memref.alloc() : memref<2xi32>
%4 = memref.alloc() : memref<2xi32>
%5 = memref.alloc() : memref<2xi32>
// The two DMAs below are dependent (incoming and outgoing on the same
// memref) in the same iteration, so no pipelining is done here.
// CHECK-NOT: affine.dma_start
// CHECK: affine.for %{{.*}} = 0 to 8 {
affine.for %i0 = 0 to 8 {
%6 = affine.apply #map2(%i0)
affine.dma_start %arg2[%6, %c0], %2[%c0, %c0], %5[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
affine.dma_wait %5[%c0], %num_elts : memref<2xi32>
affine.dma_start %2[%c0, %c0], %arg2[%6, %c0], %5[%c0], %num_elts : memref<64x4xvector<8xf32>, 2>, memref<512x32xvector<8xf32>>, memref<2xi32>
affine.dma_wait %5[%c0], %num_elts : memref<2xi32>
}
memref.dealloc %5 : memref<2xi32>
memref.dealloc %4 : memref<2xi32>
memref.dealloc %3 : memref<2xi32>
memref.dealloc %2 : memref<64x4xvector<8xf32>, 2>
memref.dealloc %1 : memref<64x4xvector<8xf32>, 2>
memref.dealloc %0 : memref<64x4xvector<8xf32>, 2>
return
}
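// Schematically, hoisting the incoming DMA of iteration %i0 + 1 would let it
// race with the outgoing DMA of iteration %i0 on the same memref (sketch):
//
//   iter %i0:     %arg2 -> %2 ; wait ; %2 -> %arg2 ; wait
//   iter %i0 + 1: %arg2 -> %2   // would overwrite %2 while the outgoing
//                               // transfer of iter %i0 still reads it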
// -----
// CHECK-LABEL: func @escaping_use
func.func @escaping_use(%arg0: memref<512 x 32 x f32>) {
%c32 = arith.constant 32 : index
%num_elt = arith.constant 512 : index
%zero = arith.constant 0 : index
%Av = memref.alloc() : memref<32 x 32 x f32, 2>
%tag = memref.alloc() : memref<1 x i32>
// CHECK-NOT: affine.dma_start
// CHECK: affine.for %{{.*}} = 0 to 16 {
affine.for %kTT = 0 to 16 {
affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
memref<512 x 32 x f32>,
memref<32 x 32 x f32, 2>, memref<1 x i32>
affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
// escaping use; no DMA pipelining / double buffering will be done.
"foo"(%Av) : (memref<32 x 32 x f32, 2>) -> ()
}
memref.dealloc %tag : memref<1 x i32>
memref.dealloc %Av : memref<32 x 32 x f32, 2>
return
// CHECK: "foo"(%{{[0-9a-zA-Z_]+}}) : (memref<32x32xf32, 2>) -> ()
// CHECK: }
// CHECK: return
}
// -----
// CHECK-LABEL: func @escaping_tag
func.func @escaping_tag(%arg0: memref<512 x 32 x f32>) {
%c32 = arith.constant 32 : index
%num_elt = arith.constant 512 : index
%zero = arith.constant 0 : index
%Av = memref.alloc() : memref<32 x 32 x f32, 2>
%tag = memref.alloc() : memref<1 x i32>
// CHECK-NOT: affine.dma_start
// CHECK: affine.for %{{.*}} = 0 to 16 {
affine.for %kTT = 0 to 16 {
affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
memref<512 x 32 x f32>,
memref<32 x 32 x f32, 2>, memref<1 x i32>
affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
// escaping use; no DMA pipelining / double buffering will be done.
"foo"(%tag) : (memref<1 x i32>) -> ()
}
memref.dealloc %tag : memref<1 x i32>
memref.dealloc %Av : memref<32 x 32 x f32, 2>
return
// CHECK: "foo"(%{{[0-9a-zA-Z_]+}}) : (memref<1xi32>) -> ()
// CHECK: }
// CHECK: return
}
// -----
// CHECK-LABEL: func @live_out_use
func.func @live_out_use(%arg0: memref<512 x 32 x f32>) -> f32 {
%c32 = arith.constant 32 : index
%num_elt = arith.constant 512 : index
%zero = arith.constant 0 : index
%Av = memref.alloc() : memref<32 x 32 x f32, 2>
%tag = memref.alloc() : memref<1 x i32>
// CHECK-NOT: affine.dma_start
// CHECK: affine.for %{{.*}} = 0 to 16 {
affine.for %kTT = 0 to 16 {
affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
memref<512 x 32 x f32>,
memref<32 x 32 x f32, 2>, memref<1 x i32>
affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
}
// Use live out of 'affine.for' op; no DMA pipelining will be done.
%v = affine.load %Av[%zero, %zero] : memref<32 x 32 x f32, 2>
memref.dealloc %tag : memref<1 x i32>
memref.dealloc %Av : memref<32 x 32 x f32, 2>
return %v : f32
// CHECK: affine.load %{{[0-9a-zA-Z_]+}}[%{{.*}}, %{{.*}}] : memref<32x32xf32, 2>
// CHECK: return
}
// -----
// CHECK-LABEL: func @dynamic_shape_dma_buffer
func.func @dynamic_shape_dma_buffer(%arg0: memref<512 x 32 x f32>, %Av: memref<? x ? x f32, 2>) {
%num_elt = arith.constant 512 : index
%zero = arith.constant 0 : index
%tag = memref.alloc() : memref<1 x i32>
// Double buffering for dynamic shaped buffer.
// Note: Cannot capture C0 because there are multiple C0 constants in the IR.
// CHECK: memref.dim %{{.*}}, %{{.*}} : memref<?x?xf32, 2>
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index
// CHECK-NEXT: memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32, 2>
// CHECK-NEXT: memref.alloc(%{{.*}}, %{{.*}}) : memref<2x?x?xf32, 2>
// CHECK: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0, 0], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}}
affine.for %kTT = 0 to 16 {
affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
memref<512 x 32 x f32>,
memref<? x ? x f32, 2>, memref<1 x i32>
affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
}
return
// CHECK-NEXT: affine.for %{{.*}} = 1 to 16 {
// CHECK: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0, 0], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}}
// CHECK: affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xi32>
// CHECK: }
// CHECK: affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xi32>
// CHECK: return
}
// -----
// Memref replacement will fail here due to a non-dereferencing use. However,
// no incorrect transformation is performed: although one of the uses is a
// dereferencing one, replaceAllMemRefUsesWith checks for escaping uses
// before performing any replacement.
// CHECK-LABEL: func @escaping_and_indexed_use_mix
func.func @escaping_and_indexed_use_mix() {
%A = memref.alloc() : memref<256 x f32, affine_map<(d0) -> (d0)>, 0>
%Ah = memref.alloc() : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
%tag = memref.alloc() : memref<1 x f32>
%zero = arith.constant 0 : index
%num_elts = arith.constant 32 : index
// An alloc for the double buffer is created, but no replacement should happen.
affine.for %i = 0 to 8 {
affine.dma_start %A[%i], %Ah[%i], %tag[%zero], %num_elts : memref<256 x f32>, memref<32 x f32, 1>, memref<1 x f32>
affine.dma_wait %tag[%zero], %num_elts : memref<1 x f32>
"compute"(%Ah) : (memref<32 x f32, 1>) -> ()
%v = affine.load %Ah[%i] : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
"foo"(%v) : (f32) -> ()
}
memref.dealloc %A : memref<256 x f32, affine_map<(d0) -> (d0)>, 0>
memref.dealloc %Ah : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
return
}
// No replacement.
// CHECK: affine.for %{{.*}} = 0 to 8 {
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xf32>
// CHECK-NEXT: "compute"(%{{.*}}) : (memref<32xf32, 1>) -> ()
// CHECK-NEXT: [[VAL:%[0-9a-zA-Z_]+]] = affine.load %{{.*}}[%{{.*}}] : memref<32xf32, 1>
// CHECK-NEXT: "foo"([[VAL]]) : (f32) -> ()