clang-p2996/mlir/test/python/dialects/sparse_tensor/test_SpMM.py
Alex Zinenko 8b58ab8ccd [mlir] Factor type reconciliation out of Standard-to-LLVM conversion
Conversion to the LLVM dialect is being refactored to be more progressive and
is now performed as a series of independent passes converting different
dialects. These passes may produce `unrealized_conversion_cast` operations that
represent pending conversions between built-in and LLVM dialect types.
Historically, the more monolithic Standard-to-LLVM conversion pass did not
need these casts, as all operations were converted in one shot. Previous
refactorings made it necessary to run the Standard-to-LLVM conversion pass to
clean up `unrealized_conversion_cast`s even when the IR contained no standard
operations. The pass also had to be run last among all to-LLVM passes, which
contradicts the partial conversion logic. Additionally, the way it was set up
could produce invalid operations by removing casts between LLVM and built-in
types even when the consumer did not accept the uncasted type, or could lead
to cryptic conversion errors (recursive application of the rewrite pattern on
`unrealized_conversion_cast` as a means to indicate failure to eliminate
casts).
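
As an illustration (a minimal sketch, not taken from this patch), consider a
partial conversion that has rewritten the producer of a value to the
LLVM-compatible `i64` type while unconverted consumers still expect `index`.
The infrastructure materializes a pending cast, and a later partial conversion
of a consumer casts the value back, producing an A->B->A pair:

    // Producer was converted to i64; unconverted consumers expect index.
    %0 = builtin.unrealized_conversion_cast %arg0 : i64 to index
    // A later partial conversion of a consumer casts the value back.
    %1 = builtin.unrealized_conversion_cast %0 : index to i64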

In fact, the need to eliminate A->B->A `unrealized_conversion_cast`s is not
specific to to-LLVM conversions and can be factored out into a separate type
reconciliation pass, which is achieved in this commit. While the cast
operation itself has a folder, the folder is insufficient in most conversion
passes because it only applies to the second cast. Without a complex legality
setup in the conversion target, the conversion infra will either consider the
cast operations valid and not fold them (a separate canonicalization would be
necessary to trigger the folding), or consider the first cast invalid upon
generation and stop with an error. The pattern provided by the reconciliation
pass applies to the first cast operation instead. Furthermore, having a
separate pass makes it clear when `unrealized_conversion_cast`s could not be
eliminated, since that is the only reason this pass can fail.
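
For instance, running the new pass (`mlir-opt -reconcile-unrealized-casts`)
on the sketch above replaces uses of the second cast with the original value
and erases both casts (hypothetical input and output):

    // Before:
    func @f(%arg0: i64) -> i64 {
      %0 = builtin.unrealized_conversion_cast %arg0 : i64 to index
      %1 = builtin.unrealized_conversion_cast %0 : index to i64
      return %1 : i64
    }

    // After -reconcile-unrealized-casts:
    func @f(%arg0: i64) -> i64 {
      return %arg0 : i64
    }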

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D109507
2021-09-09 16:51:24 +02:00


# RUN: SUPPORT_LIB=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext %PYTHON %s | FileCheck %s

import ctypes
import numpy as np
import os

import mlir.all_passes_registration

from mlir import ir
from mlir import runtime as rt
from mlir import execution_engine
from mlir import passmanager

from mlir.dialects import sparse_tensor as st
from mlir.dialects import builtin
from mlir.dialects.linalg.opdsl import lang as dsl


def run(f):
  print('\nTEST:', f.__name__)
  f()
  return f


@dsl.linalg_structured_op
def matmul_dsl(
    A=dsl.TensorDef(dsl.T, dsl.S.M, dsl.S.K),
    B=dsl.TensorDef(dsl.T, dsl.S.K, dsl.S.N),
    C=dsl.TensorDef(dsl.T, dsl.S.M, dsl.S.N, output=True)):
  C[dsl.D.m, dsl.D.n] += A[dsl.D.m, dsl.D.k] * B[dsl.D.k, dsl.D.n]


def build_SpMM(attr: st.EncodingAttr):
  """Build SpMM kernel.

  This method generates a linalg op for matrix multiplication using
  just the Python API. Effectively, a generic linalg op is constructed
  that computes C(i,j) += A(i,k) * B(k,j) for the annotated matrix A.
  """
  module = ir.Module.create()
  f64 = ir.F64Type.get()
  a = ir.RankedTensorType.get([3, 4], f64, attr)
  b = ir.RankedTensorType.get([4, 2], f64)
  c = ir.RankedTensorType.get([3, 2], f64)
  arguments = [a, b, c]
  with ir.InsertionPoint(module.body):

    @builtin.FuncOp.from_py_func(*arguments)
    def spMxM(*args):
      return matmul_dsl(args[0], args[1], outs=[args[2]])

  return module


def boilerplate(attr: st.EncodingAttr):
  """Returns boilerplate main method.

  This method sets up a boilerplate main method that takes three tensors
  (a, b, c), converts the first tensor a into a sparse tensor, and then
  calls the sparse kernel for matrix multiplication. For convenience,
  this part is purely done as string input.
  """
  return f"""
func @main(%ad: tensor<3x4xf64>, %b: tensor<4x2xf64>, %c: tensor<3x2xf64>) -> tensor<3x2xf64>
  attributes {{ llvm.emit_c_interface }} {{
  %a = sparse_tensor.convert %ad : tensor<3x4xf64> to tensor<3x4xf64, {attr}>
  %0 = call @spMxM(%a, %b, %c) : (tensor<3x4xf64, {attr}>,
                                  tensor<4x2xf64>,
                                  tensor<3x2xf64>) -> tensor<3x2xf64>
  return %0 : tensor<3x2xf64>
}}
"""


def build_compile_and_run_SpMM(attr: st.EncodingAttr, support_lib: str,
                               compiler):
  # Build.
  module = build_SpMM(attr)
  func = str(module.operation.regions[0].blocks[0].operations[0].operation)
  module = ir.Module.parse(func + boilerplate(attr))

  # Compile.
  compiler(module)
  engine = execution_engine.ExecutionEngine(
      module, opt_level=0, shared_libs=[support_lib])

  # Set up numpy input and buffer for output.
  a = np.array(
      [[1.1, 0.0, 0.0, 1.4], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 3.3, 0.0]],
      np.float64)
  b = np.array([[1.0, 2.0], [4.0, 3.0], [5.0, 6.0], [8.0, 7.0]], np.float64)
  c = np.zeros((3, 2), np.float64)
  out = np.zeros((3, 2), np.float64)

  mem_a = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(a)))
  mem_b = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(b)))
  mem_c = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(c)))
  mem_out = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(out)))

  # Invoke the kernel and get numpy output.
  # Built-in bufferization uses in-out buffers.
  # TODO: replace with inplace comprehensive bufferization.
  engine.invoke('main', mem_out, mem_a, mem_b, mem_c)

  # Sanity check on computed result.
  expected = np.matmul(a, b)
  c = rt.ranked_memref_to_numpy(mem_out[0])
  if not np.allclose(c, expected):
    quit('FAILURE')


class SparseCompiler:
  """Sparse compiler passes."""

  def __init__(self, options: str):
    pipeline = (
        f'sparsification{{{options}}},'
        f'sparse-tensor-conversion,'
        f'builtin.func(convert-linalg-to-loops,convert-vector-to-scf),'
        f'convert-scf-to-std,'
        f'func-bufferize,'
        f'tensor-constant-bufferize,'
        f'builtin.func(tensor-bufferize,std-bufferize,finalizing-bufferize),'
        f'convert-vector-to-llvm{{reassociate-fp-reductions=1 enable-index-optimizations=1}},'
        f'convert-memref-to-llvm,'
        f'convert-std-to-llvm,'
        # Clean up any A->B->A unrealized_conversion_cast pairs left behind
        # by the partial to-LLVM conversion passes above.
        f'reconcile-unrealized-casts')
    self.pipeline = pipeline

  def __call__(self, module: ir.Module):
    passmanager.PassManager.parse(self.pipeline).run(module)


# CHECK-LABEL: TEST: testSpMM
# CHECK: Passed 72 tests
@run
def testSpMM():
  # Obtain path to runtime support library.
  support_lib = os.getenv('SUPPORT_LIB')
  assert os.path.exists(support_lib), f'{support_lib} does not exist'

  with ir.Context() as ctx, ir.Location.unknown():
    count = 0
    # Fixed compiler optimization strategy.
    # TODO: explore this state space too
    par = 0
    vec = 0
    vl = 1
    e = False
    opt = (f'parallelization-strategy={par} '
           f'vectorization-strategy={vec} '
           f'vl={vl} enable-simd-index32={e}')
    # Exhaustive loop over various ways to annotate a kernel with
    # a *single* sparse tensor. Even this subset already gives
    # quite a large state space!
    levels = [[st.DimLevelType.dense, st.DimLevelType.dense],
              [st.DimLevelType.dense, st.DimLevelType.compressed],
              [st.DimLevelType.compressed, st.DimLevelType.dense],
              [st.DimLevelType.compressed, st.DimLevelType.compressed]]
    orderings = [
        ir.AffineMap.get_permutation([0, 1]),
        ir.AffineMap.get_permutation([1, 0])
    ]
    bitwidths = [0, 8, 32]
    for level in levels:
      for ordering in orderings:
        for pwidth in bitwidths:
          for iwidth in bitwidths:
            attr = st.EncodingAttr.get(level, ordering, pwidth, iwidth)
            compiler = SparseCompiler(options=opt)
            build_compile_and_run_SpMM(attr, support_lib, compiler)
            count = count + 1
    print('Passed', count, 'tests')