[mlir] add support for verification in integration tests

The patch extends the runner utils with verification methods that compare two memrefs. The methods compare the contents of the two memrefs element by element and return the number of mismatching elements, which is zero if the data is identical up to a small numerical error (or -1 if the memref shapes differ). The methods are meant to simplify the development of integration tests that, for example, compare optimized and unoptimized code paths (cf. the updates to the linalg matmul integration tests). Reviewed By: nicolasvasilache. Differential Revision: https://reviews.llvm.org/D96326
2021-02-09 17:43:11 +01:00
parent 56c446a878
commit 5fa893cc38
6 changed files with 235 additions and 25 deletions
--- a/mlir/include/mlir/ExecutionEngine/RunnerUtils.h
+++ b/mlir/include/mlir/ExecutionEngine/RunnerUtils.h
@@ -31,6 +31,7 @@
 #endif // _WIN32

 #include <assert.h>
+#include <cmath>
 #include <iostream>

 #include "mlir/ExecutionEngine/CRunnerUtils.h"
@@ -73,11 +74,13 @@ namespace impl {
 template <typename T, int M, int... Dims>
 std::ostream &operator<<(std::ostream &os, const Vector<T, M, Dims...> &v);

-template <int... Dims> struct StaticSizeMult {
+template <int... Dims>
+struct StaticSizeMult {
  static constexpr int value = 1;
 };

-template <int N, int... Dims> struct StaticSizeMult<N, Dims...> {
+template <int N, int... Dims>
+struct StaticSizeMult<N, Dims...> {
  static constexpr int value = N * StaticSizeMult<Dims...>::value;
 };

@@ -87,7 +90,8 @@ static inline void printSpace(std::ostream &os, int count) {
  }
 }

-template <typename T, int M, int... Dims> struct VectorDataPrinter {
+template <typename T, int M, int... Dims>
+struct VectorDataPrinter {
  static void print(std::ostream &os, const Vector<T, M, Dims...> &val);
 };

@@ -211,6 +215,112 @@ void printMemRef(UnrankedMemRefType<T> &M) {
  std::cout << "Unranked Memref ";
  printMemRef(DynamicMemRefType<T>(M));
 }
+
+/// Element-wise verifier for memref data: exact comparison by default,
+/// relative-error comparison for float and double.
+template <typename T>
+struct MemRefDataVerifier {
+  /// Mismatches printed in full before the output is cut off with "...".
+  static constexpr int errorLimit = 10;
+
+  /// True if both values are finite and their relative error is <= epsilon.
+  static bool verifyRelErrorSmallerThan(T actual, T expected, T epsilon);
+
+  /// Exact equality in general; relative-error check for float and double.
+  static bool verifyElem(T actual, T expected);
+
+  /// Recursively compare all elements, adding each mismatch to `errors`.
+  static void verify(std::ostream &os, T *actualBasePtr, T *expectedBasePtr,
+                     int64_t dim, int64_t offset, const int64_t *sizes,
+                     const int64_t *strides, int64_t &errors);
+};
+
+template <typename T>
+bool MemRefDataVerifier<T>::verifyRelErrorSmallerThan(T actual, T expected,
+                                                      T epsilon) {
+  // Infinite or NaN values never verify, not even against themselves.
+  if (!std::isfinite(actual) || !std::isfinite(expected))
+    return false;
+  // Accept if |actual - expected| <= epsilon * max(|actual|, |expected|).
+  // std::fmax comes from <cmath>, which this header includes directly;
+  // std::max would rely on <algorithm> being pulled in transitively.
+  T delta = std::abs(actual - expected);
+  T maximum = std::fmax(std::abs(actual), std::abs(expected));
+  return !(delta > epsilon * maximum);
+}
+
+template <typename T> // Generic case: the values must be exactly equal.
+bool MemRefDataVerifier<T>::verifyElem(T actual, T expected) {
+  return actual == expected;
+}
+
+template <> // double: accept a relative error of up to 1e-12.
+inline bool MemRefDataVerifier<double>::verifyElem(double actual,
+                                                   double expected) {
+  return verifyRelErrorSmallerThan(actual, expected, 1e-12);
+}
+
+template <> // float: accept a relative error of up to 1e-6.
+inline bool MemRefDataVerifier<float>::verifyElem(float actual,
+                                                  float expected) {
+  return verifyRelErrorSmallerThan(actual, expected, 1e-6);
+}
+
+template <typename T>
+void MemRefDataVerifier<T>::verify(std::ostream &os, T *actualBasePtr,
+                                   T *expectedBasePtr, int64_t dim,
+                                   int64_t offset, const int64_t *sizes,
+                                   const int64_t *strides, int64_t &errors) {
+  // Recursive case: walk the outermost remaining dimension.
+  if (dim != 0) {
+    for (int64_t idx = 0; idx < sizes[0]; ++idx)
+      verify(os, actualBasePtr, expectedBasePtr, dim - 1,
+             offset + idx * strides[0], sizes + 1, strides + 1, errors);
+    return;
+  }
+  // Base case: compare the single element addressed by `offset`.
+  const T &actual = actualBasePtr[offset];
+  const T &expected = expectedBasePtr[offset];
+  if (verifyElem(actual, expected))
+    return;
+  // Print the first `errorLimit` mismatches, then a single ellipsis.
+  if (errors < errorLimit)
+    os << actual << " != " << expected << " offset = " << offset << "\n";
+  else if (errors == errorLimit)
+    os << "...\n";
+  ++errors;
+}
+
+/// Verify the equivalence of two dynamic memrefs (-1 on metadata mismatch).
+template <typename T>
+int64_t verifyMemRef(const DynamicMemRefType<T> &actual,
+                     const DynamicMemRefType<T> &expected) {
+  // Check rank/offset outside the loop: it never runs for rank-0 memrefs.
+  bool ok = actual.rank == expected.rank && actual.offset == expected.offset;
+  for (int64_t i = 0; ok && i < actual.rank; ++i)
+    ok = actual.sizes[i] == expected.sizes[i] &&
+         actual.strides[i] == expected.strides[i];
+  if (!ok) {
+    printMemRefMetaData(std::cerr, actual);
+    printMemRefMetaData(std::cerr, expected);
+    return -1;
+  }
+  // Count the mismatching elements and report them on std::cerr.
+  int64_t errors = 0;
+  MemRefDataVerifier<T>::verify(std::cerr, actual.basePtr, expected.basePtr,
+                                actual.rank, actual.offset, actual.sizes,
+                                actual.strides, errors);
+  return errors;
+}
+
+/// Convert both unranked memrefs to dynamic memrefs and verify those.
+template <typename T>
+int64_t verifyMemRef(UnrankedMemRefType<T> &actual,
+                     UnrankedMemRefType<T> &expected) {
+  const DynamicMemRefType<T> dynActual(actual), dynExpected(expected);
+  return verifyMemRef(dynActual, dynExpected);
+}
+
 } // namespace impl

 ////////////////////////////////////////////////////////////////////////////////
@@ -247,4 +357,21 @@ extern "C" MLIR_RUNNERUTILS_EXPORT void
 _mlir_ciface_print_memref_vector_4x4xf32(
    StridedMemRefType<Vector2D<4, 4, float>, 2> *M);

+extern "C" MLIR_RUNNERUTILS_EXPORT int64_t _mlir_ciface_verifyMemRefI32(
+    UnrankedMemRefType<int32_t> *actual, UnrankedMemRefType<int32_t> *expected);
+extern "C" MLIR_RUNNERUTILS_EXPORT int64_t _mlir_ciface_verifyMemRefF32(
+    UnrankedMemRefType<float> *actual, UnrankedMemRefType<float> *expected);
+extern "C" MLIR_RUNNERUTILS_EXPORT int64_t _mlir_ciface_verifyMemRefF64(
+    UnrankedMemRefType<double> *actual, UnrankedMemRefType<double> *expected);
+
+extern "C" MLIR_RUNNERUTILS_EXPORT int64_t verifyMemRefI32(int64_t rank,
+                                                           void *actualPtr,
+                                                           void *expectedPtr);
+extern "C" MLIR_RUNNERUTILS_EXPORT int64_t verifyMemRefF32(int64_t rank,
+                                                           void *actualPtr,
+                                                           void *expectedPtr);
+extern "C" MLIR_RUNNERUTILS_EXPORT int64_t verifyMemRefF64(int64_t rank,
+                                                           void *actualPtr,
+                                                           void *expectedPtr);
+
 #endif // EXECUTIONENGINE_RUNNERUTILS_H_
--- a/mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul.mlir
+++ b/mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul.mlir
@@ -1,6 +1,6 @@
 // RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \
 // RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
-// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
+// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
 // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,32 vectorize" | \
 // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.copy register-tile-sizes=4,32 vectorize" | \

@@ -9,6 +9,7 @@
 // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
 // Activate to dump assembly
 // R_UN:   -dump-object-file -object-filename=/tmp/a.o \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // Use tee to both print to stderr and FileCheck
 // RUN: tee -a /dev/stderr | FileCheck %s
@@ -87,10 +88,17 @@ func @main() {
  %tmatmul = subf %t_end_matmul, %t_start_matmul: f64
  call @print_perf(%iters, %tmatmul) : (index, f64) -> ()

-  %res = load %C[%c0, %c0]: !row_major_C
-  // CHECK: 64
-  vector.print %res: f32
-
+  // CHECK: {{^0}}
+  %C_ref = alloc() : !row_major_C
+  linalg.fill(%C_ref, %v0) : !row_major_C, !elem_type_c
+  linalg.matmul ins(%A, %B : !row_major_A, !row_major_B)
+    outs(%C_ref: !row_major_C)
+  %act = memref_cast %C : !row_major_C to memref<*xf32>
+  %exp = memref_cast %C_ref : !row_major_C to memref<*xf32>
+  %errors = call @verifyMemRefF32(%act, %exp) : (memref<*xf32>, memref<*xf32>) -> i64
+  vector.print %errors : i64
+  dealloc %C_ref : !row_major_C
+  
  dealloc %A : !row_major_A
  dealloc %B : !row_major_B
  dealloc %C : !row_major_C
@@ -99,6 +107,7 @@ func @main() {
 }

 func private @rtclock() -> f64
+func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface }

 // TODO: init with random, run and check output.
 // func private @fill_random_f32(memref<*xf32>)
--- a/mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major.mlir
+++ b/mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major.mlir
@@ -1,6 +1,6 @@
 // RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \
 // RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
-// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \
+// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul_column_major anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \
 // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,16 vectorize" | \

 // TODO: linalg.copy vectorization in the presence of permutation map fails. Enable when addressed.
@@ -11,6 +11,7 @@
 // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
 // Activate to dump assembly
 // R_UN:   -dump-object-file -object-filename=/tmp/a.o \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // Use tee to both print to stderr and FileCheck
 // RUN: tee -a /dev/stderr | FileCheck %s
@@ -84,9 +85,16 @@ func @main() {
  %tmatmul_column_major = subf %t_end_matmul_column_major, %t_start_matmul_column_major: f64
  call @print_perf(%iters, %tmatmul_column_major) : (index, f64) -> ()

-  %res = load %cC[%c0, %c0]: !column_major_C
-  // CHECK: 64
-  vector.print %res: !elem_type_c
+  // CHECK: {{^0}}
+  %cC_ref = alloc() : !column_major_C
+  linalg.fill(%cC_ref, %f0) : !column_major_C, !elem_type_c
+  linalg.matmul_column_major ins(%cA, %cB : !column_major_A, !column_major_B)
+    outs(%cC_ref: !column_major_C)
+  %act = memref_cast %cC : !column_major_C to memref<*xf32>
+  %exp = memref_cast %cC_ref : !column_major_C to memref<*xf32>
+  %errors = call @verifyMemRefF32(%act, %exp) : (memref<*xf32>, memref<*xf32>) -> i64
+  vector.print %errors : i64
+  dealloc %cC_ref : !column_major_C

  dealloc %cA : !column_major_A
  dealloc %cB : !column_major_B
@@ -96,6 +104,7 @@ func @main() {
 }

 func private @rtclock() -> f64
+func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface }

 // TODO: init with random, run and check output.
 // func private @fill_random_f32(memref<*xf32>)
--- a/mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir
+++ b/mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir
@@ -1,7 +1,7 @@
 // RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \
 // RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
-// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \
-// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
+// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul_column_major_as_row_major anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \
+// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul_column_major_as_row_major anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
 // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,16 vectorize" | \

 // TODO: linalg.copy vectorization in the presence of permutation map fails. Enable when addressed.
@@ -12,6 +12,7 @@
 // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
 // Activate to dump assembly
 // R_UN:   -dump-object-file -object-filename=/tmp/a.o \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // Use tee to both print to stderr and FileCheck
 // RUN: tee -a /dev/stderr | FileCheck %s
@@ -63,11 +64,11 @@ func @print_perf(%iters: index, %total_time: f64) {
 func @main() {
  %f0 = constant 0.0 : !elem_type_c
  %f1 = constant 1.0 : !elem_type_a
-
+  
  %cA = alloc() : !column_major_A
  %cB = alloc() : !column_major_B
  %cC = alloc() : !column_major_C
-
+  
  linalg.fill(%cA, %f1) : !column_major_A, !elem_type_a
  linalg.fill(%cB, %f1) : !column_major_B, !elem_type_b
  linalg.fill(%cC, %f0) : !column_major_C, !elem_type_c
@@ -95,13 +96,28 @@ func @main() {
  %tmatmul_column_major_as_row_major = subf %t_end_matmul_column_major_as_row_major, %t_start_matmul_column_major_as_row_major: f64
  call @print_perf(%iters, %tmatmul_column_major_as_row_major) : (index, f64) -> ()

-  %res = load %cC[%c0, %c0]: !column_major_C
-  // CHECK: 64
-  vector.print %res: !elem_type_c
-  %res2 = load %C[%c0, %c0]: !row_major_C
-  // CHECK: 64
-  vector.print %res2: !elem_type_c
+  // CHECK: {{^0}}
+  %cC_ref = alloc() : !column_major_C
+  linalg.fill(%cC_ref, %f0) : !column_major_C, !elem_type_c
+  linalg.matmul_column_major ins(%cA, %cB : !column_major_A, !column_major_B)
+    outs(%cC_ref: !column_major_C)
+  %act1 = memref_cast %cC : !column_major_C to memref<*xf32>
+  %exp1 = memref_cast %cC_ref : !column_major_C to memref<*xf32>
+  %errors1 = call @verifyMemRefF32(%act1, %exp1) : (memref<*xf32>, memref<*xf32>) -> i64
+  vector.print %errors1 : i64
+  dealloc %cC_ref : !column_major_C

+  // CHECK: {{^0}}
+  %C_ref = alloc() : !row_major_C
+  linalg.fill(%C_ref, %f0) : !row_major_C, !elem_type_c
+  linalg.matmul ins(%A, %B : !row_major_A, !row_major_B)
+    outs(%C_ref: !row_major_C)
+  %act2 = memref_cast %C : !row_major_C to memref<*xf32>
+  %exp2 = memref_cast %C_ref : !row_major_C to memref<*xf32>
+  %errors2 = call @verifyMemRefF32(%act2, %exp2) : (memref<*xf32>, memref<*xf32>) -> i64
+  vector.print %errors2 : i64
+  dealloc %C_ref : !row_major_C
+  
  dealloc %A : !row_major_A
  dealloc %B : !row_major_B
  dealloc %C : !row_major_C
@@ -114,6 +130,7 @@ func @main() {
 }

 func private @rtclock() -> f64
+func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface }

 // TODO: init with random, run and check output.
 // func private @fill_random_f32(memref<*xf32>)
--- a/mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_i8_i8_i32.mlir
+++ b/mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_i8_i8_i32.mlir
@@ -9,6 +9,7 @@
 // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
 // Activate to dump assembly
 // R_UN:   -dump-object-file -object-filename=/tmp/a.o \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // Use tee to both print to stderr and FileCheck
 // RUN: tee -a /dev/stderr | FileCheck %s
@@ -85,9 +86,16 @@ func @main() {
  %tmatmul = subf %t_end_matmul, %t_start_matmul: f64
  call @print_perf(%iters, %tmatmul) : (index, f64) -> ()

-  %res = load %C[%c0, %c0]: !row_major_C
-  // CHECK: 64
-  vector.print %res: !elem_type_c
+  // CHECK: {{^0}}
+  %C_ref = alloc() : !row_major_C
+  linalg.fill(%C_ref, %v0) : !row_major_C, !elem_type_c
+  linalg.matmul_i8_i8_i32 ins(%A, %B : !row_major_A, !row_major_B)
+    outs(%C_ref: !row_major_C)
+  %res = memref_cast %C : !row_major_C to memref<*xi32>
+  %exp = memref_cast %C_ref : !row_major_C to memref<*xi32>
+  %errors = call @verifyMemRefI32(%res, %exp) : (memref<*xi32>, memref<*xi32>) -> i64
+  vector.print %errors : i64
+  dealloc %C_ref : !row_major_C

  dealloc %A : !row_major_A
  dealloc %B : !row_major_B
@@ -97,6 +105,7 @@ func @main() {
 }

 func private @rtclock() -> f64
+func private @verifyMemRefI32(memref<*xi32>, memref<*xi32>) -> i64 attributes { llvm.emit_c_interface }

 // TODO: init with random, run and check output.
 // func private @fill_random_f32(memref<*xf32>)
--- a/mlir/lib/ExecutionEngine/RunnerUtils.cpp
+++ b/mlir/lib/ExecutionEngine/RunnerUtils.cpp
@@ -80,3 +80,42 @@ extern "C" void
 _mlir_ciface_print_memref_4d_f32(StridedMemRefType<float, 4> *M) {
  impl::printMemRef(*M);
 }
+
+extern "C" int64_t
+_mlir_ciface_verifyMemRefI32(UnrankedMemRefType<int32_t> *actual,
+                             UnrankedMemRefType<int32_t> *expected) {
+  return impl::verifyMemRef(*actual, *expected); // Mismatch count or -1.
+}
+
+extern "C" int64_t
+_mlir_ciface_verifyMemRefF32(UnrankedMemRefType<float> *actual,
+                             UnrankedMemRefType<float> *expected) {
+  return impl::verifyMemRef(*actual, *expected); // Mismatch count or -1.
+}
+
+extern "C" int64_t
+_mlir_ciface_verifyMemRefF64(UnrankedMemRefType<double> *actual,
+                             UnrankedMemRefType<double> *expected) {
+  return impl::verifyMemRef(*actual, *expected); // Mismatch count or -1.
+}
+
+extern "C" int64_t verifyMemRefI32(int64_t rank, void *actualPtr,
+                                   void *expectedPtr) {
+  UnrankedMemRefType<int32_t> actual{rank, actualPtr};
+  UnrankedMemRefType<int32_t> expected{rank, expectedPtr};
+  return _mlir_ciface_verifyMemRefI32(&actual, &expected);
+}
+
+extern "C" int64_t verifyMemRefF32(int64_t rank, void *actualPtr,
+                                   void *expectedPtr) {
+  UnrankedMemRefType<float> actual{rank, actualPtr};
+  UnrankedMemRefType<float> expected{rank, expectedPtr};
+  return _mlir_ciface_verifyMemRefF32(&actual, &expected);
+}
+
+extern "C" int64_t verifyMemRefF64(int64_t rank, void *actualPtr,
+                                   void *expectedPtr) {
+  UnrankedMemRefType<double> actual{rank, actualPtr};
+  UnrankedMemRefType<double> expected{rank, expectedPtr};
+  return _mlir_ciface_verifyMemRefF64(&actual, &expected);
+}