This specifically handles the case of a transpose from a vector type
like `vector<8x[4]xf32>` to `vector<[4]x8xf32>`. Such transposes occur
fairly frequently when scalably vectorizing `linalg.generic`s. There is
no direct lowering for these (as types like `vector<[4]x8xf32>` cannot
be represented in LLVM-IR). However, if the only use of the transpose is
a write, then it is possible to lower the `transfer_write(transpose)` as
a VLA loop.
Example:
```mlir
%transpose = vector.transpose %vec, [1, 0]
   : vector<4x[4]xf32> to vector<[4]x4xf32>
vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true]}
   : vector<[4]x4xf32>, memref<?x?xf32>
```
Becomes:
```mlir
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = vector.extract %arg0[0] : vector<[4]xf32> from vector<4x[4]xf32>
%1 = vector.extract %arg0[1] : vector<[4]xf32> from vector<4x[4]xf32>
%2 = vector.extract %arg0[2] : vector<[4]xf32> from vector<4x[4]xf32>
%3 = vector.extract %arg0[3] : vector<[4]xf32> from vector<4x[4]xf32>
%vscale = vector.vscale
%c4_vscale = arith.muli %vscale, %c4 : index
scf.for %idx = %c0 to %c4_vscale step %c1 {
  %4 = vector.extract %0[%idx] : f32 from vector<[4]xf32>
  %5 = vector.extract %1[%idx] : f32 from vector<[4]xf32>
  %6 = vector.extract %2[%idx] : f32 from vector<[4]xf32>
  %7 = vector.extract %3[%idx] : f32 from vector<[4]xf32>
  %slice_i = affine.apply #map(%idx)[%i]
  %slice = vector.from_elements %4, %5, %6, %7 : vector<4xf32>
  vector.transfer_write %slice, %arg1[%slice_i, %j] {in_bounds = [true]}
    : vector<4xf32>, memref<?x?xf32>
}
```
// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(convert-vector-to-scf{lower-tensors=true lower-scalable=true}))" -split-input-file -allow-unregistered-dialect | FileCheck %s
// Checks that a fixed-size 2-D transfer_read on a tensor is unrolled into a
// loop of 1-D reads staged through a memref.alloca of the full vector type.
// CHECK-LABEL: func @transfer_read_2d(
//       CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<vector<4x9xf32>>
//       CHECK: %[[CASTED:.*]] = vector.type_cast %[[ALLOC]] : memref<vector<4x9xf32>> to memref<4xvector<9xf32>>
//       CHECK: scf.for {{.*}} {
//       CHECK:   %[[READ:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %cst {in_bounds = [true]} : tensor<?x?xf32>, vector<9xf32>
//       CHECK:   memref.store %[[READ]], %[[CASTED]][%{{.*}}] : memref<4xvector<9xf32>>
//       CHECK: }
//       CHECK: %[[LOADED:.*]] = memref.load %[[ALLOC]][] : memref<vector<4x9xf32>>
//       CHECK: return %[[LOADED]] : vector<4x9xf32>
func.func @transfer_read_2d(%A : tensor<?x?xf32>, %base1 : index, %base2 : index)
    -> (vector<4x9xf32>){
  %p = arith.constant -42.0: f32
  %f = vector.transfer_read %A[%base1, %base2], %p {in_bounds = [true, true]}
      : tensor<?x?xf32>, vector<4x9xf32>
  return %f : vector<4x9xf32>
}

// -----

// Checks that a fixed-size 2-D transfer_write into a tensor is unrolled into
// an scf.for that threads the tensor through iter_args, writing one 1-D slice
// (loaded back from the staging alloca) per iteration.
// CHECK-LABEL: func @transfer_write_2d(
//       CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<vector<2x3xf32>>
//       CHECK: memref.store {{.*}}, %[[ALLOC]][] : memref<vector<2x3xf32>>
//       CHECK: %[[CASTED:.*]] = vector.type_cast %[[ALLOC]] : memref<vector<2x3xf32>> to memref<2xvector<3xf32>>
//       CHECK: %[[RESULT:.*]] = scf.for {{.*}} iter_args(%[[STATE:.*]] = %{{.*}}) -> (tensor<?x?xf32>) {
//       CHECK:   %[[LOADED:.*]] = memref.load %[[CASTED]][%{{.*}}] : memref<2xvector<3xf32>>
//       CHECK:   %[[WRITE:.*]] = vector.transfer_write %[[LOADED]], %[[STATE]][{{.*}}] {in_bounds = [true]} : vector<3xf32>, tensor<?x?xf32>
//       CHECK:   scf.yield %[[WRITE]] : tensor<?x?xf32>
//       CHECK: }
//       CHECK: return %[[RESULT]] : tensor<?x?xf32>
func.func @transfer_write_2d(%A : tensor<?x?xf32>, %vec : vector<2x3xf32>,
                             %base1 : index, %base2 : index) -> (tensor<?x?xf32>) {
  %t = vector.transfer_write %vec, %A[%base1, %base2] {in_bounds = [true, true]}
      : vector<2x3xf32>, tensor<?x?xf32>
  return %t : tensor<?x?xf32>
}

// -----

// Checks that transfer_write(transpose) with a leading scalable dim (a type
// like vector<[4]x4xf32>, which has no direct LLVM lowering) is rewritten as
// a VLA scf.for threading the tensor through iter_args.
// CHECK-LABEL: func @scalable_transpose_store
//  CHECK-SAME:   %[[TENSOR:[a-z0-9]+]]: tensor<?x?xf32>
//       CHECK: %[[RESULT:.*]] = scf.for {{.*}} iter_args(%[[ITER_ARG:.*]] = %[[TENSOR]]) -> (tensor<?x?xf32>)
//       CHECK:   %[[WRITE_SLICE:.*]] = vector.transfer_write %{{.*}} %[[ITER_ARG]]
//       CHECK:   scf.yield %[[WRITE_SLICE]]
//       CHECK: return %[[RESULT]]
func.func @scalable_transpose_store(%vec: vector<4x[4]xf32>, %A: tensor<?x?xf32>, %base1: index, %base2: index) -> tensor<?x?xf32> {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  %result = vector.transfer_write %transpose, %A[%base1, %base2] {in_bounds = [true, true]} : vector<[4]x4xf32>, tensor<?x?xf32>
  return %result : tensor<?x?xf32>
}