The logic enabling the Arm SVE (and now SME) integration tests for various dialects, that may run under emulation, is now duplicated in several places. This patch moves the configuration to the top-level MLIR integration tests Lit config and renames the '%lli' substitution in contexts where it will run exclusively (ArmSVE, ArmSME) on AArch64 (and possibly under emulation) to '%lli_aarch64_cmd', and '%lli_host_or_aarch64_cmd' for contexts where it may run AArch64 (also possibly under emulation). The latter is for integration tests that have target-specific and target-agnostic codepaths such as SparseTensor, which supports scalable vectors. The two substitutions have the same effect but the names are different to convey this information. The '%lli_aarch64_cmd' substitution could be used in the SparseTensor tests but that would be a misnomer if the host were x86 and the MLIR_RUN_SVE_TESTS=OFF. The reason for renaming the '%lli' substitution is to not prevent running other target-specific integration tests at the same time, since the same substitution '%lli' is used for lli in other integration tests: * mlir/test/Integration/Dialect/Vector/CPU/X86Vector - (AVX emulation via Intel SDE) * mlir/test/Integration/Dialect/Vector/CPU/AMX - (AMX emulation via Intel SDE) * mlir/test/Integration/Dialect/LLVMIR/CPU/test-vp-intrinsic.mlir - (RISCV emulation via QEMU if supported, native otherwise) and substituting '%lli' at the top-level with Arm specific logic would override this. Reviewed By: awarzynski Differential Revision: https://reviews.llvm.org/D148929
294 lines
12 KiB
MLIR
294 lines
12 KiB
MLIR
// DEFINE: %{option} = enable-runtime-library=true
|
|
// DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
|
|
// DEFINE: %{run} = mlir-cpu-runner \
|
|
// DEFINE: -e entry -entry-point-result=void \
|
|
// DEFINE: -shared-libs=%mlir_c_runner_utils | \
|
|
// DEFINE: FileCheck %s
|
|
//
|
|
// RUN: %{compile} | %{run}
|
|
//
|
|
// Do the same run, but now with direct IR generation.
|
|
// REDEFINE: %{option} = enable-runtime-library=false
|
|
// RUN: %{compile} | %{run}
|
|
//
|
|
// Do the same run, but now with direct IR generation and vectorization.
|
|
// REDEFINE: %{option} = "enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true"
|
|
// RUN: %{compile} | %{run}
|
|
|
|
// Do the same run, but now with direct IR generation and, if available, VLA
|
|
// vectorization.
|
|
// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA"
|
|
// REDEFINE: %{run} = %lli_host_or_aarch64_cmd \
|
|
// REDEFINE: --entry-function=entry_lli \
|
|
// REDEFINE: --extra-module=%S/Inputs/main_for_lli.ll \
|
|
// REDEFINE: %VLA_ARCH_ATTR_OPTIONS \
|
|
// REDEFINE: --dlopen=%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext | \
|
|
// REDEFINE: FileCheck %s
|
|
// RUN: %{compile} | mlir-translate -mlir-to-llvmir | %{run}
|
|
|
|
#SparseVector = #sparse_tensor.encoding<{
|
|
dimLevelType = ["compressed"]
|
|
}>
|
|
|
|
#SparseMatrix = #sparse_tensor.encoding<{
|
|
dimLevelType = ["compressed", "compressed"]
|
|
}>
|
|
|
|
#trait_1d = {
|
|
indexing_maps = [
|
|
affine_map<(i) -> (i)>, // a
|
|
affine_map<(i) -> (i)> // x (out)
|
|
],
|
|
iterator_types = ["parallel"],
|
|
doc = "X(i) = a(i) op i"
|
|
}
|
|
|
|
#trait_2d = {
|
|
indexing_maps = [
|
|
affine_map<(i,j) -> (i,j)>, // A
|
|
affine_map<(i,j) -> (i,j)> // X (out)
|
|
],
|
|
iterator_types = ["parallel", "parallel"],
|
|
doc = "X(i,j) = A(i,j) op i op j"
|
|
}
|
|
|
|
//
|
|
// Test with indices. Note that a lot of results are actually
|
|
// dense, but this is done to stress test all the operations.
|
|
//
|
|
module {
|
|
|
|
//
|
|
// Kernel that uses index in the index notation (conjunction).
|
|
//
|
|
func.func @sparse_index_1d_conj(%arga: tensor<8xi64, #SparseVector>)
|
|
-> tensor<8xi64, #SparseVector> {
|
|
%init = bufferization.alloc_tensor() : tensor<8xi64, #SparseVector>
|
|
%r = linalg.generic #trait_1d
|
|
ins(%arga: tensor<8xi64, #SparseVector>)
|
|
outs(%init: tensor<8xi64, #SparseVector>) {
|
|
^bb(%a: i64, %x: i64):
|
|
%i = linalg.index 0 : index
|
|
%ii = arith.index_cast %i : index to i64
|
|
%m1 = arith.muli %a, %ii : i64
|
|
linalg.yield %m1 : i64
|
|
} -> tensor<8xi64, #SparseVector>
|
|
return %r : tensor<8xi64, #SparseVector>
|
|
}
|
|
|
|
//
|
|
// Kernel that uses index in the index notation (disjunction).
|
|
//
|
|
func.func @sparse_index_1d_disj(%arga: tensor<8xi64, #SparseVector>)
|
|
-> tensor<8xi64, #SparseVector> {
|
|
%init = bufferization.alloc_tensor() : tensor<8xi64, #SparseVector>
|
|
%r = linalg.generic #trait_1d
|
|
ins(%arga: tensor<8xi64, #SparseVector>)
|
|
outs(%init: tensor<8xi64, #SparseVector>) {
|
|
^bb(%a: i64, %x: i64):
|
|
%i = linalg.index 0 : index
|
|
%ii = arith.index_cast %i : index to i64
|
|
%m1 = arith.addi %a, %ii : i64
|
|
linalg.yield %m1 : i64
|
|
} -> tensor<8xi64, #SparseVector>
|
|
return %r : tensor<8xi64, #SparseVector>
|
|
}
|
|
|
|
//
|
|
// Kernel that uses indices in the index notation (conjunction).
|
|
//
|
|
func.func @sparse_index_2d_conj(%arga: tensor<3x4xi64, #SparseMatrix>)
|
|
-> tensor<3x4xi64, #SparseMatrix> {
|
|
%init = bufferization.alloc_tensor() : tensor<3x4xi64, #SparseMatrix>
|
|
%r = linalg.generic #trait_2d
|
|
ins(%arga: tensor<3x4xi64, #SparseMatrix>)
|
|
outs(%init: tensor<3x4xi64, #SparseMatrix>) {
|
|
^bb(%a: i64, %x: i64):
|
|
%i = linalg.index 0 : index
|
|
%j = linalg.index 1 : index
|
|
%ii = arith.index_cast %i : index to i64
|
|
%jj = arith.index_cast %j : index to i64
|
|
%m1 = arith.muli %ii, %a : i64
|
|
%m2 = arith.muli %jj, %m1 : i64
|
|
linalg.yield %m2 : i64
|
|
} -> tensor<3x4xi64, #SparseMatrix>
|
|
return %r : tensor<3x4xi64, #SparseMatrix>
|
|
}
|
|
|
|
//
|
|
// Kernel that uses indices in the index notation (disjunction).
|
|
//
|
|
func.func @sparse_index_2d_disj(%arga: tensor<3x4xi64, #SparseMatrix>)
|
|
-> tensor<3x4xi64, #SparseMatrix> {
|
|
%init = bufferization.alloc_tensor() : tensor<3x4xi64, #SparseMatrix>
|
|
%r = linalg.generic #trait_2d
|
|
ins(%arga: tensor<3x4xi64, #SparseMatrix>)
|
|
outs(%init: tensor<3x4xi64, #SparseMatrix>) {
|
|
^bb(%a: i64, %x: i64):
|
|
%i = linalg.index 0 : index
|
|
%j = linalg.index 1 : index
|
|
%ii = arith.index_cast %i : index to i64
|
|
%jj = arith.index_cast %j : index to i64
|
|
%m1 = arith.addi %ii, %a : i64
|
|
%m2 = arith.addi %jj, %m1 : i64
|
|
linalg.yield %m2 : i64
|
|
} -> tensor<3x4xi64, #SparseMatrix>
|
|
return %r : tensor<3x4xi64, #SparseMatrix>
|
|
}
|
|
|
|
func.func @add_outer_2d(%arg0: tensor<2x3xf32, #SparseMatrix>)
|
|
-> tensor<2x3xf32, #SparseMatrix> {
|
|
%0 = bufferization.alloc_tensor() : tensor<2x3xf32, #SparseMatrix>
|
|
%1 = linalg.generic #trait_2d
|
|
ins(%arg0 : tensor<2x3xf32, #SparseMatrix>)
|
|
outs(%0 : tensor<2x3xf32, #SparseMatrix>) {
|
|
^bb0(%arg1: f32, %arg2: f32):
|
|
%2 = linalg.index 0 : index
|
|
%3 = arith.index_cast %2 : index to i64
|
|
%4 = arith.uitofp %3 : i64 to f32
|
|
%5 = arith.addf %arg1, %4 : f32
|
|
linalg.yield %5 : f32
|
|
} -> tensor<2x3xf32, #SparseMatrix>
|
|
return %1 : tensor<2x3xf32, #SparseMatrix>
|
|
}
|
|
|
|
//
|
|
// Main driver.
|
|
//
|
|
func.func @entry() {
|
|
%c0 = arith.constant 0 : index
|
|
%du = arith.constant -1 : i64
|
|
%df = arith.constant -1.0 : f32
|
|
|
|
// Setup input sparse vector.
|
|
%v1 = arith.constant sparse<[[2], [4]], [ 10, 20]> : tensor<8xi64>
|
|
%sv = sparse_tensor.convert %v1 : tensor<8xi64> to tensor<8xi64, #SparseVector>
|
|
|
|
// Setup input "sparse" vector.
|
|
%v2 = arith.constant dense<[ 1, 2, 4, 8, 16, 32, 64, 128 ]> : tensor<8xi64>
|
|
%dv = sparse_tensor.convert %v2 : tensor<8xi64> to tensor<8xi64, #SparseVector>
|
|
|
|
// Setup input sparse matrix.
|
|
%m1 = arith.constant sparse<[[1,1], [2,3]], [10, 20]> : tensor<3x4xi64>
|
|
%sm = sparse_tensor.convert %m1 : tensor<3x4xi64> to tensor<3x4xi64, #SparseMatrix>
|
|
|
|
// Setup input "sparse" matrix.
|
|
%m2 = arith.constant dense <[ [ 1, 1, 1, 1 ],
|
|
[ 1, 2, 1, 1 ],
|
|
[ 1, 1, 3, 4 ] ]> : tensor<3x4xi64>
|
|
%dm = sparse_tensor.convert %m2 : tensor<3x4xi64> to tensor<3x4xi64, #SparseMatrix>
|
|
|
|
// Setup input sparse f32 matrix.
|
|
%mf32 = arith.constant sparse<[[0,1], [1,2]], [10.0, 41.0]> : tensor<2x3xf32>
|
|
%sf32 = sparse_tensor.convert %mf32 : tensor<2x3xf32> to tensor<2x3xf32, #SparseMatrix>
|
|
|
|
// Call the kernels.
|
|
%0 = call @sparse_index_1d_conj(%sv) : (tensor<8xi64, #SparseVector>)
|
|
-> tensor<8xi64, #SparseVector>
|
|
%1 = call @sparse_index_1d_disj(%sv) : (tensor<8xi64, #SparseVector>)
|
|
-> tensor<8xi64, #SparseVector>
|
|
%2 = call @sparse_index_1d_conj(%dv) : (tensor<8xi64, #SparseVector>)
|
|
-> tensor<8xi64, #SparseVector>
|
|
%3 = call @sparse_index_1d_disj(%dv) : (tensor<8xi64, #SparseVector>)
|
|
-> tensor<8xi64, #SparseVector>
|
|
%4 = call @sparse_index_2d_conj(%sm) : (tensor<3x4xi64, #SparseMatrix>)
|
|
-> tensor<3x4xi64, #SparseMatrix>
|
|
%5 = call @sparse_index_2d_disj(%sm) : (tensor<3x4xi64, #SparseMatrix>)
|
|
-> tensor<3x4xi64, #SparseMatrix>
|
|
%6 = call @sparse_index_2d_conj(%dm) : (tensor<3x4xi64, #SparseMatrix>)
|
|
-> tensor<3x4xi64, #SparseMatrix>
|
|
%7 = call @sparse_index_2d_disj(%dm) : (tensor<3x4xi64, #SparseMatrix>)
|
|
-> tensor<3x4xi64, #SparseMatrix>
|
|
|
|
//
|
|
// Verify result.
|
|
//
|
|
// CHECK: 2
|
|
// CHECK-NEXT: 8
|
|
// CHECK-NEXT: 8
|
|
// CHECK-NEXT: 8
|
|
// CHECK-NEXT: 2
|
|
// CHECK-NEXT: 12
|
|
// CHECK-NEXT: 12
|
|
// CHECK-NEXT: 12
|
|
// CHECK-NEXT: ( 20, 80 )
|
|
// CHECK-NEXT: ( 0, 1, 12, 3, 24, 5, 6, 7 )
|
|
// CHECK-NEXT: ( 0, 2, 8, 24, 64, 160, 384, 896 )
|
|
// CHECK-NEXT: ( 1, 3, 6, 11, 20, 37, 70, 135 )
|
|
// CHECK-NEXT: ( 10, 120 )
|
|
// CHECK-NEXT: ( 0, 1, 2, 3, 1, 12, 3, 4, 2, 3, 4, 25 )
|
|
// CHECK-NEXT: ( 0, 0, 0, 0, 0, 2, 2, 3, 0, 2, 12, 24 )
|
|
// CHECK-NEXT: ( 1, 2, 3, 4, 2, 4, 4, 5, 3, 4, 7, 9 )
|
|
//
|
|
%n0 = sparse_tensor.number_of_entries %0 : tensor<8xi64, #SparseVector>
|
|
%n1 = sparse_tensor.number_of_entries %1 : tensor<8xi64, #SparseVector>
|
|
%n2 = sparse_tensor.number_of_entries %2 : tensor<8xi64, #SparseVector>
|
|
%n3 = sparse_tensor.number_of_entries %3 : tensor<8xi64, #SparseVector>
|
|
%n4 = sparse_tensor.number_of_entries %4 : tensor<3x4xi64, #SparseMatrix>
|
|
%n5 = sparse_tensor.number_of_entries %5 : tensor<3x4xi64, #SparseMatrix>
|
|
%n6 = sparse_tensor.number_of_entries %6 : tensor<3x4xi64, #SparseMatrix>
|
|
%n7 = sparse_tensor.number_of_entries %7 : tensor<3x4xi64, #SparseMatrix>
|
|
%8 = sparse_tensor.values %0 : tensor<8xi64, #SparseVector> to memref<?xi64>
|
|
%9 = sparse_tensor.values %1 : tensor<8xi64, #SparseVector> to memref<?xi64>
|
|
%10 = sparse_tensor.values %2 : tensor<8xi64, #SparseVector> to memref<?xi64>
|
|
%11 = sparse_tensor.values %3 : tensor<8xi64, #SparseVector> to memref<?xi64>
|
|
%12 = sparse_tensor.values %4 : tensor<3x4xi64, #SparseMatrix> to memref<?xi64>
|
|
%13 = sparse_tensor.values %5 : tensor<3x4xi64, #SparseMatrix> to memref<?xi64>
|
|
%14 = sparse_tensor.values %6 : tensor<3x4xi64, #SparseMatrix> to memref<?xi64>
|
|
%15 = sparse_tensor.values %7 : tensor<3x4xi64, #SparseMatrix> to memref<?xi64>
|
|
%16 = vector.transfer_read %8[%c0], %du: memref<?xi64>, vector<2xi64>
|
|
%17 = vector.transfer_read %9[%c0], %du: memref<?xi64>, vector<8xi64>
|
|
%18 = vector.transfer_read %10[%c0], %du: memref<?xi64>, vector<8xi64>
|
|
%19 = vector.transfer_read %11[%c0], %du: memref<?xi64>, vector<8xi64>
|
|
%20 = vector.transfer_read %12[%c0], %du: memref<?xi64>, vector<2xi64>
|
|
%21 = vector.transfer_read %13[%c0], %du: memref<?xi64>, vector<12xi64>
|
|
%22 = vector.transfer_read %14[%c0], %du: memref<?xi64>, vector<12xi64>
|
|
%23 = vector.transfer_read %15[%c0], %du: memref<?xi64>, vector<12xi64>
|
|
vector.print %n0 : index
|
|
vector.print %n1 : index
|
|
vector.print %n2 : index
|
|
vector.print %n3 : index
|
|
vector.print %n4 : index
|
|
vector.print %n5 : index
|
|
vector.print %n6 : index
|
|
vector.print %n7 : index
|
|
vector.print %16 : vector<2xi64>
|
|
vector.print %17 : vector<8xi64>
|
|
vector.print %18 : vector<8xi64>
|
|
vector.print %19 : vector<8xi64>
|
|
vector.print %20 : vector<2xi64>
|
|
vector.print %21 : vector<12xi64>
|
|
vector.print %22 : vector<12xi64>
|
|
vector.print %23 : vector<12xi64>
|
|
|
|
// Release resources.
|
|
bufferization.dealloc_tensor %sv : tensor<8xi64, #SparseVector>
|
|
bufferization.dealloc_tensor %dv : tensor<8xi64, #SparseVector>
|
|
bufferization.dealloc_tensor %0 : tensor<8xi64, #SparseVector>
|
|
bufferization.dealloc_tensor %1 : tensor<8xi64, #SparseVector>
|
|
bufferization.dealloc_tensor %2 : tensor<8xi64, #SparseVector>
|
|
bufferization.dealloc_tensor %3 : tensor<8xi64, #SparseVector>
|
|
bufferization.dealloc_tensor %sm : tensor<3x4xi64, #SparseMatrix>
|
|
bufferization.dealloc_tensor %dm : tensor<3x4xi64, #SparseMatrix>
|
|
bufferization.dealloc_tensor %4 : tensor<3x4xi64, #SparseMatrix>
|
|
bufferization.dealloc_tensor %5 : tensor<3x4xi64, #SparseMatrix>
|
|
bufferization.dealloc_tensor %6 : tensor<3x4xi64, #SparseMatrix>
|
|
bufferization.dealloc_tensor %7 : tensor<3x4xi64, #SparseMatrix>
|
|
|
|
//
|
|
// Call the f32 kernel, verify the result, release the resources.
|
|
//
|
|
// CHECK-NEXT: ( 0, 10, 0, 1, 1, 42 )
|
|
//
|
|
%100 = call @add_outer_2d(%sf32) : (tensor<2x3xf32, #SparseMatrix>)
|
|
-> tensor<2x3xf32, #SparseMatrix>
|
|
%101 = sparse_tensor.values %100 : tensor<2x3xf32, #SparseMatrix> to memref<?xf32>
|
|
%102 = vector.transfer_read %101[%c0], %df: memref<?xf32>, vector<6xf32>
|
|
vector.print %102 : vector<6xf32>
|
|
bufferization.dealloc_tensor %sf32 : tensor<2x3xf32, #SparseMatrix>
|
|
bufferization.dealloc_tensor %100 : tensor<2x3xf32, #SparseMatrix>
|
|
|
|
return
|
|
}
|
|
}
|