As part of the work on transitioning bufferization dialect, ops, and
associated logic to operate on newly added type interfaces (see
00eaff3e9c), rename
bufferization.to_memref to bufferization.to_buffer to highlight the generic nature of the op.
Bufferization process produces buffers while memref is a builtin type
rather than a generic term.
Preserve the current API (to_buffer still produces a memref), however,
as the new type interfaces are not used yet.
669 lines
26 KiB
MLIR
669 lines
26 KiB
MLIR
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{compute-tolerance=0.0}))' -split-input-file | FileCheck %s --check-prefix=ZERO-TOLERANCE
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer maximal}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER-MAXIMAL
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{maximal mode=sibling}))' -split-input-file | FileCheck %s --check-prefix=SIBLING-MAXIMAL
// All fusion: producer-consumer and sibling.
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion))' -split-input-file | FileCheck %s --check-prefix=ALL
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(spirv.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=SPIRV

// Part I of fusion tests in mlir/test/Transforms/loop-fusion.mlir.
// Part II of fusion tests in mlir/test/Transforms/loop-fusion-2.mlir
// Part III of fusion tests in mlir/test/Transforms/loop-fusion-3.mlir

// Expects fusion of producer into consumer at depth 4 and subsequent removal of
// source loop.
// PRODUCER-CONSUMER-LABEL: func @unflatten4d
func.func @unflatten4d(%arg1: memref<7x8x9x10xf32>) {
  %m = memref.alloc() : memref<5040xf32>
  %cf7 = arith.constant 7.0 : f32

  // Producer nest, stores a constant into the flat buffer %m through a
  // linearized 4-d index.
  affine.for %i0 = 0 to 7 {
    affine.for %i1 = 0 to 8 {
      affine.for %i2 = 0 to 9 {
        affine.for %i3 = 0 to 10 {
          affine.store %cf7, %m[720 * %i0 + 90 * %i1 + 10 * %i2 + %i3] : memref<5040xf32>
        }
      }
    }
  }
  // Consumer nest, reads %m back with the same linearized index and writes
  // the 4-d argument %arg1.
  affine.for %i0 = 0 to 7 {
    affine.for %i1 = 0 to 8 {
      affine.for %i2 = 0 to 9 {
        affine.for %i3 = 0 to 10 {
          %v0 = affine.load %m[720 * %i0 + 90 * %i1 + 10 * %i2 + %i3] : memref<5040xf32>
          affine.store %v0, %arg1[%i0, %i1, %i2, %i3] : memref<7x8x9x10xf32>
        }
      }
    }
  }
  return
}

// Exactly one 4-deep loop nest is expected to remain.
// PRODUCER-CONSUMER: affine.for
// PRODUCER-CONSUMER-NEXT: affine.for
// PRODUCER-CONSUMER-NEXT: affine.for
// PRODUCER-CONSUMER-NEXT: affine.for
// PRODUCER-CONSUMER-NOT: affine.for
// PRODUCER-CONSUMER: return

// -----

// Expects fusion of producer into consumer at depth 2 and subsequent removal of
// source loop.
// PRODUCER-CONSUMER-LABEL: func @unflatten2d_with_transpose
func.func @unflatten2d_with_transpose(%arg1: memref<8x7xf32>) {
  %m = memref.alloc() : memref<56xf32>
  %cf7 = arith.constant 7.0 : f32

  // Producer nest, writes %m at 8 * %i0 + %i1 over a 7x8 iteration space.
  affine.for %i0 = 0 to 7 {
    affine.for %i1 = 0 to 8 {
      affine.store %cf7, %m[8 * %i0 + %i1] : memref<56xf32>
    }
  }
  // Consumer nest, iterates 8x7 (loop order swapped relative to the producer)
  // and reads %m at %i0 + 8 * %i1.
  affine.for %i0 = 0 to 8 {
    affine.for %i1 = 0 to 7 {
      %v0 = affine.load %m[%i0 + 8 * %i1] : memref<56xf32>
      affine.store %v0, %arg1[%i0, %i1] : memref<8x7xf32>
    }
  }
  return
}

// Exactly one 2-deep loop nest is expected to remain.
// PRODUCER-CONSUMER: affine.for
// PRODUCER-CONSUMER-NEXT: affine.for
// PRODUCER-CONSUMER-NOT: affine.for
// PRODUCER-CONSUMER: return

// -----

// Expects fusion of producer into consumer at depth 1 and source loop to not
// be removed due to difference in loop steps.
// PRODUCER-CONSUMER-LABEL: func @check_src_dst_step
func.func @check_src_dst_step(%m : memref<100xf32>,
                              %src: memref<100xf32>,
                              %out: memref<100xf32>) {
  // Source loop, unit step, copies %src into %m.
  affine.for %i0 = 0 to 100 {
    %r1 = affine.load %src[%i0]: memref<100xf32>
    affine.store %r1, %m[%i0] : memref<100xf32>
  }
  // Destination loop, step 2, copies every other element of %m into %out.
  affine.for %i2 = 0 to 100 step 2 {
    %r2 = affine.load %m[%i2] : memref<100xf32>
    affine.store %r2, %out[%i2] : memref<100xf32>
  }
  return
}

// Check if the fusion did take place as well as that the source loop was
// not removed. To check if fusion took place, the read instruction from the
// original source loop is checked to be in the fused loop.
//
// PRODUCER-CONSUMER: affine.for %[[idx_0:.*]] = 0 to 100 {
// PRODUCER-CONSUMER-NEXT: %[[result_0:.*]] = affine.load %[[arr1:.*]][%[[idx_0]]] : memref<100xf32>
// PRODUCER-CONSUMER-NEXT: affine.store %[[result_0]], %{{.*}}[%[[idx_0]]] : memref<100xf32>
// PRODUCER-CONSUMER-NEXT: }
// PRODUCER-CONSUMER: affine.for %[[idx_1:.*]] = 0 to 100 step 2 {
// PRODUCER-CONSUMER: affine.load %[[arr1]][%[[idx_1]]] : memref<100xf32>
// PRODUCER-CONSUMER: }
// PRODUCER-CONSUMER: return

// -----

// Two sibling nests that both read %arg0; their innermost iter_args reduction
// loops have different trip counts (64 vs. 32).
// SIBLING-MAXIMAL-LABEL: func @reduce_add_non_maximal_f32_f32(
func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) {
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant 1.000000e+00 : f32
  // This nest writes to %arg1 but can be eliminated post sibling fusion.
  affine.for %arg3 = 0 to 1 {
    affine.for %arg4 = 0 to 64 {
      %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
        %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
        %5 = arith.addf %prevAccum, %4 : f32
        affine.yield %5 : f32
      }
      %accum_dbl = arith.addf %accum, %accum : f32
      affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1>
    }
  }
  affine.for %arg3 = 0 to 1 {
    affine.for %arg4 = 0 to 64 {
      // Following loop trip count does not match the corresponding source trip count.
      %accum = affine.for %arg5 = 0 to 32 iter_args (%prevAccum = %cst_1) -> f32 {
        %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
        %5 = arith.mulf %prevAccum, %4 : f32
        affine.yield %5 : f32
      }
      %accum_sqr = arith.mulf %accum, %accum : f32
      affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1>
    }
  }
  return
}
// Test checks the loop structure is preserved after sibling fusion
// since the destination loop and source loop trip counts do not
// match.
// SIBLING-MAXIMAL: %[[cst_0:.*]] = arith.constant 0.000000e+00 : f32
// SIBLING-MAXIMAL-NEXT: %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32
// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 1 {
// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 64 {
// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 32 iter_args(%{{.*}} = %[[cst_1]]) -> (f32) {
// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 64 iter_args(%{{.*}} = %[[cst_0]]) -> (f32) {

// -----

// Two sibling loops that only load from the same memref. The checks below
// expect a single loop containing both loads.
// SIBLING-MAXIMAL-LABEL: func @sibling_load_only
func.func @sibling_load_only(%arg0: memref<10xf32>) {
  affine.for %arg1 = 0 to 10 {
    %0 = affine.load %arg0[%arg1] : memref<10xf32>
  }
  affine.for %arg1 = 0 to 10 {
    %0 = affine.load %arg0[%arg1] : memref<10xf32>
  }
  // SIBLING-MAXIMAL-NEXT: affine.for
  // SIBLING-MAXIMAL-NEXT: affine.load
  // SIBLING-MAXIMAL-NEXT: affine.load
  return
}

// -----

// The same store/load loop pair appears in two different blocks (^bb0 and
// ^bb1) of one function. The checks expect a fused loop using a memref<1xf32>
// buffer in each block.
// PRODUCER-CONSUMER-LABEL: func @fusion_for_multiple_blocks() {
func.func @fusion_for_multiple_blocks() {
^bb0:
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 10 {
  // PRODUCER-CONSUMER-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // PRODUCER-CONSUMER-NEXT: }
  cf.br ^bb1
^bb1:
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 10 {
  // PRODUCER-CONSUMER-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // PRODUCER-CONSUMER-NEXT: }
  return
}

// -----

// A 4-deep producer nest feeds a 3-deep consumer nest whose load of %B
// splits its innermost index with floordiv/mod.
// PRODUCER-CONSUMER-LABEL: @fuse_higher_dim_nest_into_lower_dim_nest
func.func @fuse_higher_dim_nest_into_lower_dim_nest() {
  %A = memref.alloc() : memref<8x12x128x64xf32>
  %B = memref.alloc() : memref<8x128x12x64xf32>
  // Producer: copies %A into %B with the two middle dimensions swapped.
  affine.for %arg205 = 0 to 8 {
    affine.for %arg206 = 0 to 128 {
      affine.for %arg207 = 0 to 12 {
        affine.for %arg208 = 0 to 64 {
          %a = affine.load %A[%arg205, %arg207, %arg206, %arg208] : memref<8x12x128x64xf32>
          affine.store %a, %B[%arg205, %arg206, %arg207, %arg208] : memref<8x128x12x64xf32>
        }
      }
    }
  }
  %C = memref.alloc() : memref<8x128x768xf16>
  // Consumer: reads %B through a collapsed 768-wide index (12 * 64) and
  // truncates each element to f16 into %C.
  affine.for %arg205 = 0 to 8 {
    affine.for %arg206 = 0 to 128 {
      affine.for %arg207 = 0 to 768 {
        %b = affine.load %B[%arg205, %arg206, %arg207 floordiv 64, %arg207 mod 64] : memref<8x128x12x64xf32>
        %c = arith.truncf %b : f32 to f16
        affine.store %c, %C[%arg205, %arg206, %arg207] : memref<8x128x768xf16>
      }
    }
  }

  // Check that fusion happens into the innermost loop of the consumer.
  // PRODUCER-CONSUMER: affine.for
  // PRODUCER-CONSUMER-NEXT: affine.for %{{.*}} = 0 to 128
  // PRODUCER-CONSUMER-NEXT: affine.for %{{.*}} = 0 to 768
  // PRODUCER-CONSUMER-NOT: affine.for
  // PRODUCER-CONSUMER: return
  return
}

// -----

// Basic test to ensure fusion works inside other func ops like spirv.func.

#map = affine_map<(d0, d1) -> (d0 + d1)>
module {
  // SPIRV-LABEL: func @test_avgpool2d_pad_right
  spirv.func @test_avgpool2d_pad_right(%arg0: !spirv.array<8192 x f32>) -> !spirv.array<8192 x f32> "None" {
    %cst_f32 = spirv.Constant 0.000000e+00 : f32
    %0 = builtin.unrealized_conversion_cast %arg0 : !spirv.array<8192 x f32> to tensor<1x32x32x8xf32>
    %padded = tensor.pad %0 low[0, 4, 4, 0] high[0, 4, 8193, 0] {
    ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index):
      tensor.yield %cst_f32 : f32
    } : tensor<1x32x32x8xf32> to tensor<1x40x8229x8xf32>
    %1 = bufferization.to_buffer %padded : tensor<1x40x8229x8xf32> to memref<1x40x8229x8xf32>
    %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<1x32x32x8xf32>
    // Producer nest: accumulates values loaded from the padded buffer %1
    // into %alloc_0.
    affine.for %arg1 = 0 to 1 {
      affine.for %arg2 = 0 to 32 {
        affine.for %arg3 = 0 to 32 {
          affine.for %arg4 = 0 to 8 {
            affine.for %arg5 = 0 to 1 {
              affine.for %arg6 = 0 to 1 {
                %4 = affine.apply #map(%arg2, %arg5)
                %5 = affine.apply #map(%arg3, %arg6)
                %6 = affine.load %1[%arg1, %4, %5, %arg4] : memref<1x40x8229x8xf32>
                %7 = affine.load %alloc_0[%arg1, %arg2, %arg3, %arg4] : memref<1x32x32x8xf32>
                %8 = arith.addf %7, %6 : f32
                affine.store %8, %alloc_0[%arg1, %arg2, %arg3, %arg4] : memref<1x32x32x8xf32>
              }
            }
          }
        }
      }
    }
    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x32x32x8xf32>
    // Consumer nest: only loads from %alloc_0.
    affine.for %arg1 = 0 to 1 {
      affine.for %arg2 = 0 to 32 {
        affine.for %arg3 = 0 to 32 {
          affine.for %arg4 = 0 to 8 {
            %4 = affine.load %alloc_0[%arg1, %arg2, %arg3, %arg4] : memref<1x32x32x8xf32>
          }
        }
      }
    }
    // Test fusion.
    // SPIRV: affine.for %{{.*}} = 0 to 1 {
    // SPIRV-NEXT: affine.for %{{.*}} = 0 to 32 {
    // SPIRV-NEXT: affine.for %{{.*}} = 0 to 32 {
    // SPIRV-NEXT: affine.for %{{.*}} = 0 to 8 {
    // SPIRV-NOT: affine.for %{{.*}}

    // SPIRV: ReturnValue
    %2 = bufferization.to_tensor %alloc_1 : memref<1x32x32x8xf32> to tensor<1x32x32x8xf32>
    %3 = builtin.unrealized_conversion_cast %2 : tensor<1x32x32x8xf32> to !spirv.array<8192 x f32>
    spirv.ReturnValue %3 : !spirv.array<8192 x f32>
  }
}

// -----

// The producer loop both loads from and stores to %producer over 0..32; the
// consumer loop reads only the first 16 elements of it.
// PRODUCER-CONSUMER-LABEL: func @same_memref_load_store
func.func @same_memref_load_store(%producer : memref<32xf32>, %consumer: memref<16xf32>){
  %cst = arith.constant 2.000000e+00 : f32
  // Source isn't removed.
  // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 32
  affine.for %arg3 = 0 to 32 {
    %0 = affine.load %producer[%arg3] : memref<32xf32>
    %2 = arith.mulf %0, %cst : f32
    affine.store %2, %producer[%arg3] : memref<32xf32>
  }
  affine.for %arg3 = 0 to 16 {
    %0 = affine.load %producer[%arg3] : memref<32xf32>
    %2 = arith.addf %0, %cst : f32
    affine.store %2, %consumer[%arg3] : memref<16xf32>
  }
  // Fused nest.
  // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 16
  // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<32xf32>
  // PRODUCER-CONSUMER-NEXT: arith.mulf
  // PRODUCER-CONSUMER-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // PRODUCER-CONSUMER-NEXT: arith.addf
  // PRODUCER-CONSUMER-NEXT: affine.store
  // PRODUCER-CONSUMER-NEXT: }
  return
}

// -----

// Variant of the single-store case in which the producer loop stores to two
// memrefs (%producer and %producer_2) and the consumer loads from both.
// PRODUCER-CONSUMER-LABEL: func @same_memref_load_multiple_stores
// ALL-LABEL: func @same_memref_load_multiple_stores
func.func @same_memref_load_multiple_stores(%producer : memref<32xf32>, %producer_2 : memref<32xf32>, %consumer: memref<16xf32>){
  %cst = arith.constant 2.000000e+00 : f32
  // Ensure that source isn't removed during both producer-consumer fusion and
  // sibling fusion.
  // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 32
  // ALL: affine.for %{{.*}} = 0 to 32
  affine.for %arg3 = 0 to 32 {
    %0 = affine.load %producer[%arg3] : memref<32xf32>
    %2 = arith.mulf %0, %cst : f32
    affine.store %2, %producer[%arg3] : memref<32xf32>
    affine.store %2, %producer_2[%arg3] : memref<32xf32>
  }
  affine.for %arg3 = 0 to 16 {
    %0 = affine.load %producer[%arg3] : memref<32xf32>
    %1 = affine.load %producer_2[%arg3] : memref<32xf32>
    %2 = arith.addf %0, %1 : f32
    affine.store %2, %consumer[%arg3] : memref<16xf32>
  }
  // Fused nest.
  // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 16
  // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<32xf32>
  // PRODUCER-CONSUMER-NEXT: arith.mulf
  // PRODUCER-CONSUMER-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // PRODUCER-CONSUMER-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // PRODUCER-CONSUMER-NEXT: arith.addf
  // PRODUCER-CONSUMER-NEXT: affine.store
  // PRODUCER-CONSUMER-NEXT: }
  // ALL: affine.for %{{.*}} = 0 to 16
  // ALL: mulf
  // ALL: addf
  return
}

// -----

#map = affine_map<()[s0] -> (s0 + 5)>
#map1 = affine_map<()[s0] -> (s0 + 17)>

// Test with non-int/float memref types.

// PRODUCER-CONSUMER-MAXIMAL-LABEL: func @memref_index_type
func.func @memref_index_type() {
  %0 = llvm.mlir.constant(2 : index) : i64
  %2 = llvm.mlir.constant(0 : index) : i64
  %3 = builtin.unrealized_conversion_cast %2 : i64 to index
  %alloc = memref.alloc() {alignment = 64 : i64} : memref<8x18xf32>
  %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<3xf32>
  %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<3xindex>
  // Producer: updates the index-typed memref %alloc_2 in place.
  affine.for %arg3 = 0 to 3 {
    %4 = affine.load %alloc_2[%arg3] : memref<3xindex>
    %5 = builtin.unrealized_conversion_cast %4 : index to i64
    %6 = llvm.sub %0, %5 : i64
    %7 = builtin.unrealized_conversion_cast %6 : i64 to index
    affine.store %7, %alloc_2[%arg3] : memref<3xindex>
  }
  // Consumer: uses the values loaded from %alloc_2 to index into %alloc.
  affine.for %arg3 = 0 to 3 {
    %4 = affine.load %alloc_2[%arg3] : memref<3xindex>
    %5 = affine.apply #map()[%4]
    %6 = affine.apply #map1()[%3]
    %7 = memref.load %alloc[%5, %6] : memref<8x18xf32>
    affine.store %7, %alloc_1[%arg3] : memref<3xf32>
  }
  // Expect fusion.
  // PRODUCER-CONSUMER-MAXIMAL: affine.for
  // PRODUCER-CONSUMER-MAXIMAL-NOT: affine.for
  // PRODUCER-CONSUMER-MAXIMAL: return
  return
}

// -----

#map = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 + 1)>

// Test non-integer memory spaces.

// PRODUCER-CONSUMER-LABEL: func @non_int_memory_space
func.func @non_int_memory_space() {
  // The memref's memory space is the attribute
  // #spirv.storage_class<StorageBuffer> rather than an integer.
  %alloc = memref.alloc() : memref<256x8xf32, #spirv.storage_class<StorageBuffer>>
  affine.for %arg0 = 0 to 64 {
    affine.for %arg1 = 0 to 8 {
      %0 = affine.apply #map(%arg1)
      %1 = affine.load %alloc[%arg0, %0] : memref<256x8xf32, #spirv.storage_class<StorageBuffer>>
      affine.store %1, %alloc[%arg0, %arg1] : memref<256x8xf32, #spirv.storage_class<StorageBuffer>>
    }
  }
  affine.for %arg0 = 16 to 32 {
    affine.for %arg1 = 0 to 8 {
      %0 = affine.apply #map(%arg1)
      %1 = affine.load %alloc[%arg0, %0] : memref<256x8xf32, #spirv.storage_class<StorageBuffer>>
      affine.store %1, %alloc[%arg0, %arg1] : memref<256x8xf32, #spirv.storage_class<StorageBuffer>>
    }
  }
  // Fused nest.
  // PRODUCER-CONSUMER-NEXT: memref.alloc()
  // PRODUCER-CONSUMER-NEXT: memref.alloc()
  // PRODUCER-CONSUMER-NEXT: affine.for %{{.*}} = 16 to 32
  // PRODUCER-CONSUMER-NEXT: affine.for %{{.*}} = 0 to 8
  return
}

// -----

#map = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 + 1)>

// Exercises fix for crash reported at https://github.com/llvm/llvm-project/issues/119525

// No fusion of producer into consumer happens here as the slice is determined
// to be invalid. This is a limitation and it is possible to compute a slice
// (reduction along %arg4) and fuse.

// PRODUCER-CONSUMER-LABEL: func @slice_compute_check
func.func @slice_compute_check(%arg0: memref<1x8x26xi32, strided<[?, ?, ?], offset: ?>>, %arg1: memref<1x8x26xi32, strided<[?, ?, ?], offset: ?>>, %arg2: memref<1x8x26xi32, strided<[?, ?, ?], offset: ?>>) {
  %alloc_14 = memref.alloc() : memref<1x8x26xi32>
  %alloc_15 = memref.alloc() : memref<1x26xi32>
  // Producer nest: the three inner loops each run from #map(iv) to
  // #map1(iv), i.e. a single iteration at the enclosing induction variable.
  // %alloc_15 is both read and written here.
  affine.for %arg3 = 0 to 1 {
    affine.for %arg4 = 0 to 8 {
      affine.for %arg5 = 0 to 26 {
        affine.for %arg6 = #map(%arg3) to #map1(%arg3) {
          affine.for %arg7 = #map(%arg4) to #map1(%arg4) {
            affine.for %arg8 = #map(%arg5) to #map1(%arg5) {
              %61 = affine.load %alloc_14[%arg6, %arg7, %arg8] : memref<1x8x26xi32>
              %62 = affine.load %alloc_15[%arg6, %arg8] : memref<1x26xi32>
              %63 = llvm.intr.smin(%61, %62) : (i32, i32) -> i32
              affine.store %63, %alloc_15[%arg6, %arg8] : memref<1x26xi32>
            }
          }
        }
      }
    }
  }
  // Consumer: reads row 0 of %alloc_15.
  affine.for %arg3 = 0 to 26 {
    %61 = affine.load %alloc_15[0, %arg3] : memref<1x26xi32>
  }
  memref.dealloc %alloc_15 : memref<1x26xi32>
  memref.dealloc %alloc_14 : memref<1x8x26xi32>
  return
}

// -----

// Exercises fix for crash reported at https://github.com/llvm/llvm-project/issues/108374

// No fusion of producer into consumer happens here. The slice will not be
// valid as the producer doesn't supply to all of the consumer.

#map = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 + 1)>
// PRODUCER-CONSUMER-LABEL: func @test_add_slice_bounds
func.func @test_add_slice_bounds() {
  %alloc = memref.alloc() : memref<10xf32>
  %cst = arith.constant 0.619152 : f32
  // Producer: stores to %alloc[%arg1], where the inner loop runs from
  // #map(%arg0) to #map1(%arg0).
  affine.for %arg0 = 0 to 10 {
    affine.for %arg1 = #map(%arg0) to #map1(%arg0) {
      affine.store %cst, %alloc[%arg1] : memref<10xf32>
    }
  }
  // Consumer: loads %alloc at #map1(%arg3), i.e. one past the inner
  // induction variable.
  affine.for %arg0 = 0 to 3 {
    affine.for %arg1 = 0 to 10 {
      affine.for %arg2 = #map(%arg0) to #map1(%arg0) {
        affine.for %arg3 = #map(%arg1) to #map1(%arg1) {
          %0 = affine.apply #map1(%arg3)
          %1 = affine.load %alloc[%0] : memref<10xf32>
        }
      }
    }
  }
  return
}

// The first loop accumulates into the single element %reduc[0]; the second
// loop reads %reduc[0] on every iteration. Both loops are expected to remain.
// PRODUCER-CONSUMER-MAXIMAL-LABEL: func @producer_reduction_no_fusion
func.func @producer_reduction_no_fusion(%input : memref<10xf32>, %output : memref<10xf32>, %reduc : memref<1xf32>) {
  %zero = arith.constant 0. : f32
  %one = arith.constant 1. : f32
  // This producer can't be fused inside %i without a violation of
  // semantics.
  // PRODUCER-CONSUMER-MAXIMAL: affine.for %{{.*}} = 0 to 10
  affine.for %i = 0 to 10 {
    %0 = affine.load %input[%i] : memref<10xf32>
    %1 = affine.load %reduc[0] : memref<1xf32>
    %2 = arith.addf %0, %1 : f32
    affine.store %2, %reduc[0] : memref<1xf32>
  }
  // PRODUCER-CONSUMER-MAXIMAL: affine.for %{{.*}} = 0 to 10
  affine.for %i = 0 to 10 {
    %0 = affine.load %reduc[0] : memref<1xf32>
    %2 = arith.addf %0, %one : f32
    affine.store %2, %output[%i] : memref<10xf32>
  }
  return
}

// Two sibling loops that both load %input[%i]; the second one also
// accumulates into %reduc[0]. The checks expect one loop holding both bodies.
// SIBLING-MAXIMAL-LABEL: func @sibling_reduction
func.func @sibling_reduction(%input : memref<10xf32>, %output : memref<10xf32>, %reduc : memref<10xf32>) {
  %zero = arith.constant 0. : f32
  %one = arith.constant 1. : f32
  affine.for %i = 0 to 10 {
    %0 = affine.load %input[%i] : memref<10xf32>
    %2 = arith.addf %0, %one : f32
    affine.store %2, %output[%i] : memref<10xf32>
  }
  // Ensure that the fusion happens at the right depth.
  affine.for %i = 0 to 10 {
    %0 = affine.load %input[%i] : memref<10xf32>
    %1 = affine.load %reduc[0] : memref<10xf32>
    %2 = arith.addf %0, %1 : f32
    affine.store %2, %reduc[0] : memref<10xf32>
  }
  // SIBLING-MAXIMAL: affine.for %{{.*}} = 0 to 10
  // SIBLING-MAXIMAL-NEXT: affine.load
  // SIBLING-MAXIMAL-NEXT: addf
  // SIBLING-MAXIMAL-NEXT: affine.store
  // SIBLING-MAXIMAL-NEXT: affine.load
  // SIBLING-MAXIMAL-NEXT: affine.load
  // SIBLING-MAXIMAL-NEXT: addf
  // SIBLING-MAXIMAL-NEXT: affine.store
  return
}

// -----

// From https://github.com/llvm/llvm-project/issues/54541

#map = affine_map<(d0) -> (d0 mod 65536)>
// ZERO-TOLERANCE-LABEL: func @zero_tolerance
func.func @zero_tolerance(%arg0: memref<65536xcomplex<f64>>, %arg1: memref<30x131072xi64>,
                          %3 : memref<30xi64>,
                          %4 : memref<30xi64>,
                          %5 : memref<30xi64>,
                          %6 : memref<30xi64>
                          ) {
  %c65536 = arith.constant 65536 : index
  %cst = arith.constant 0.000000e+00 : f64
  %cst_0 = arith.constant 0x4320000000380004 : f64
  %cst_1 = arith.constant 5.000000e-01 : f64
  %0 = memref.alloc() {alignment = 128 : i64} : memref<30x131072xi64>
  %1 = memref.alloc() {alignment = 128 : i64} : memref<131072xi1>
  %2 = memref.alloc() {alignment = 128 : i64} : memref<131072xi128>
  // This nest shouldn't be fused in when a zero tolerance is specified.
  // ZERO-TOLERANCE: affine.for %{{.*}} = 0 to 131072
  affine.for %arg2 = 0 to 131072 {
    %7 = affine.apply #map(%arg2)
    %8 = affine.load %arg0[%7] : memref<65536xcomplex<f64>>
    %9 = arith.cmpi ult, %arg2, %c65536 : index
    %10 = complex.im %8 : complex<f64>
    %11 = complex.re %8 : complex<f64>
    %12 = arith.select %9, %11, %10 : f64
    %13 = arith.cmpf olt, %12, %cst : f64
    %14 = arith.negf %12 : f64
    %15 = arith.select %13, %14, %12 : f64
    %16 = arith.mulf %15, %cst_0 : f64
    %17 = arith.addf %16, %cst_1 : f64
    %18 = arith.fptosi %17 : f64 to i128
    affine.store %18, %2[%arg2] : memref<131072xi128>
    affine.store %13, %1[%arg2] : memref<131072xi1>
  }
  // The next two nests are fused.
  // ZERO-TOLERANCE: affine.for %{{.*}} = 0 to 30
  // ZERO-TOLERANCE-NEXT: affine.for %{{.*}} = 0 to 131072
  // ZERO-TOLERANCE: func.call @__external_reduce_barrett
  // ZERO-TOLERANCE: affine.store
  // ZERO-TOLERANCE: affine.load
  // ZERO-TOLERANCE-NEXT: affine.store
  affine.for %arg2 = 0 to 30 {
    affine.for %arg3 = 0 to 131072 {
      %7 = affine.load %6[%arg2] : memref<30xi64>
      %8 = affine.load %3[%arg2] : memref<30xi64>
      %9 = affine.load %5[%arg2] : memref<30xi64>
      %10 = affine.load %4[%arg2] : memref<30xi64>
      %11 = affine.load %2[%arg3] : memref<131072xi128>
      %12 = affine.load %1[%arg3] : memref<131072xi1>
      %13 = func.call @__external_reduce_barrett(%7, %8, %9, %10, %11) {outputModFac = 1 : i64} : (i64, i64, i64, i64, i128) -> i64
      %14 = arith.subi %7, %13 : i64
      %15 = arith.select %12, %14, %13 : i64
      affine.store %15, %0[%arg2, %arg3] : memref<30x131072xi64>
    }
  }
  func.call @__external_levelwise_forward_ntt(%0) : (memref<30x131072xi64>) -> ()
  // Final nest: copies %0 into the output argument %arg1.
  affine.for %arg2 = 0 to 30 {
    affine.for %arg3 = 0 to 131072 {
      %7 = affine.load %0[%arg2, %arg3] : memref<30x131072xi64>
      affine.store %7, %arg1[%arg2, %arg3] : memref<30x131072xi64>
    }
  }
  // Under maximal fusion, just one nest.
  // PRODUCER-CONSUMER-MAXIMAL: affine.for %{{.*}} = 0 to 30
  // PRODUCER-CONSUMER-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 131072
  // PRODUCER-CONSUMER-MAXIMAL-NOT: affine.for %{{.*}}
  memref.dealloc %2 : memref<131072xi128>
  memref.dealloc %1 : memref<131072xi1>
  memref.dealloc %0 : memref<30x131072xi64>
  return
}
// External functions called from the test above; declarations only.
func.func private @__external_levelwise_forward_ntt(memref<30x131072xi64>)
func.func private @__external_reduce_barrett(i64, i64, i64, i64, i128) -> i64

// An unrolled loop nest. Fusion here should correctly fuse while preserving
// dependences between store-load pairs of the same memref. A private memref
// of size 1x1x1 can't be created.

// PRODUCER-CONSUMER-MAXIMAL-LABEL: func @unrolled
func.func @unrolled(%arg0: memref<2x4xf32>, %arg1: memref<1x2x4xf32>) {
  %alloc = memref.alloc() : memref<1x2x4xf32>
  // Producer: a single-iteration loop whose body loads all eight elements of
  // %arg0 and stores them to %alloc at constant indices.
  affine.for %i = 0 to 1 {
    %0 = affine.load %arg0[0, 0] : memref<2x4xf32>
    %1 = affine.load %arg0[0, 1] : memref<2x4xf32>
    %2 = affine.load %arg0[0, 2] : memref<2x4xf32>
    %3 = affine.load %arg0[0, 3] : memref<2x4xf32>
    %4 = affine.load %arg0[1, 0] : memref<2x4xf32>
    %5 = affine.load %arg0[1, 1] : memref<2x4xf32>
    %6 = affine.load %arg0[1, 2] : memref<2x4xf32>
    %7 = affine.load %arg0[1, 3] : memref<2x4xf32>

    affine.store %0, %alloc[0, 0, 0] : memref<1x2x4xf32>
    affine.store %1, %alloc[0, 0, 1] : memref<1x2x4xf32>
    affine.store %2, %alloc[0, 0, 2] : memref<1x2x4xf32>
    affine.store %3, %alloc[0, 0, 3] : memref<1x2x4xf32>
    affine.store %4, %alloc[0, 1, 0] : memref<1x2x4xf32>
    affine.store %5, %alloc[0, 1, 1] : memref<1x2x4xf32>
    affine.store %6, %alloc[0, 1, 2] : memref<1x2x4xf32>
    affine.store %7, %alloc[0, 1, 3] : memref<1x2x4xf32>
  }

  // Consumer: negates each element of %alloc into %arg1.
  affine.for %i = 0 to 2 {
    affine.for %j = 0 to 4 {
      %8 = affine.load %alloc[0, %i, %j] : memref<1x2x4xf32>
      %9 = arith.negf %8 : f32
      affine.store %9, %arg1[0, %i, %j] : memref<1x2x4xf32>
    }
  }
  // PRODUCER-CONSUMER-MAXIMAL: affine.for %{{.*}} = 0 to 2 {
  // PRODUCER-CONSUMER-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 4 {
  // PRODUCER-CONSUMER-MAXIMAL-NEXT: affine.load %{{.*}}[0, 0]
  // PRODUCER-CONSUMER-MAXIMAL: affine.load %{{.*}}[1, 3]
  // PRODUCER-CONSUMER-MAXIMAL: affine.store %{{.*}}[0, 0, 0]
  // PRODUCER-CONSUMER-MAXIMAL: affine.store %{{.*}}[0, 1, 3]
  // PRODUCER-CONSUMER-MAXIMAL: affine.load %{{.*}}[0, %{{.*}}, %{{.*}}]
  return
}