Files
clang-p2996/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
Caroline Concatto d515ecca68 [IR] Add new intrinsics interleave and deinterleave vectors
This patch adds 2 new intrinsics:

  ; Interleave two vectors into a wider vector
  <vscale x 4 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 2 x i64> %even, <vscale x 2 x i64> %odd)

  ; Deinterleave the odd and even lanes from a wider vector
  {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv2i64(<vscale x 4 x i64> %vec)

The main motivator for adding these intrinsics is to support vectorization of
complex types using scalable vectors.

The intrinsics are kept simple by only supporting a stride of 2, which makes
them easy to lower and type-legalize. A stride of 2 is sufficient to handle
complex types which only have a real/imaginary component.

The format of the intrinsics matches how `shufflevector` is used in
LoopVectorize. For example:

  using cf = std::complex<float>;

  void foo(cf * dst, int N) {
      for (int i=0; i<N; ++i)
          dst[i] += cf(1.f, 2.f);
  }

For this loop, LoopVectorize:
  (1) Loads a wide vector (e.g. <8 x float>)
  (2) Extracts odd lanes using shufflevector (leading to <4 x float>)
  (3) Extracts even lanes using shufflevector (leading to <4 x float>)
  (4) Performs the addition
  (5) Interleaves the two <4 x float> vectors into a single <8 x float> using
      shufflevector
  (6) Stores the wide vector.

In this example, we can 1-1 replace shufflevector in (2) and (3) with the
deinterleave intrinsic, and replace the shufflevector in (5) with the
interleave intrinsic.

The SelectionDAG nodes might be extended to support higher strides (3, 4, etc)
as well in the future.

Similar to what was done for vector.splice and vector.reverse, the intrinsic
is lowered to a shufflevector when the type is fixed width, so to benefit from
existing code that was written to recognize/optimize shufflevector patterns.

Note that this approach does not prevent us from adding new intrinsics for other
strides, or adding a more generic shuffle intrinsic in the future. It just solves
the immediate problem of being able to vectorize loops with complex math.

Reviewed By: paulwalker-arm

Differential Revision: https://reviews.llvm.org/D141924
2023-02-20 12:21:59 +00:00

134 lines
5.2 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
; CHECK-LABEL: interleave2_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
%retval = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
ret <4 x half> %retval
}
define <8 x half> @interleave2_v8f16(<4 x half> %vec0, <4 x half> %vec1) {
; CHECK-LABEL: interleave2_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
; CHECK-NEXT: ret
%retval = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %vec0, <4 x half> %vec1)
ret <8 x half> %retval
}
define <16 x half> @interleave2_v16f16(<8 x half> %vec0, <8 x half> %vec1) {
; CHECK-LABEL: interleave2_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 v2.8h, v0.8h, v1.8h
; CHECK-NEXT: zip2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%retval = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %vec0, <8 x half> %vec1)
ret <16 x half> %retval
}
define <4 x float> @interleave2_v4f32(<2 x float> %vec0, <2 x float> %vec1) {
; CHECK-LABEL: interleave2_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: rev64 v1.4s, v0.4s
; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%retval = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %vec0, <2 x float> %vec1)
ret <4 x float> %retval
}
define <8 x float> @interleave2_v8f32(<4 x float> %vec0, <4 x float> %vec1) {
; CHECK-LABEL: interleave2_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%retval = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %vec0, <4 x float> %vec1)
ret <8 x float> %retval
}
define <4 x double> @interleave2_v4f64(<2 x double> %vec0, <2 x double> %vec1) {
; CHECK-LABEL: interleave2_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 v2.2d, v0.2d, v1.2d
; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%retval = call <4 x double>@llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec0, <2 x double> %vec1)
ret <4 x double> %retval
}
; Integers
define <32 x i8> @interleave2_v32i8(<16 x i8> %vec0, <16 x i8> %vec1) {
; CHECK-LABEL: interleave2_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 v2.16b, v0.16b, v1.16b
; CHECK-NEXT: zip2 v1.16b, v0.16b, v1.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%retval = call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %vec0, <16 x i8> %vec1)
ret <32 x i8> %retval
}
define <16 x i16> @interleave2_v16i16(<8 x i16> %vec0, <8 x i16> %vec1) {
; CHECK-LABEL: interleave2_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 v2.8h, v0.8h, v1.8h
; CHECK-NEXT: zip2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%retval = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %vec0, <8 x i16> %vec1)
ret <16 x i16> %retval
}
define <8 x i32> @interleave2_v8i32(<4 x i32> %vec0, <4 x i32> %vec1) {
; CHECK-LABEL: interleave2_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%retval = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %vec0, <4 x i32> %vec1)
ret <8 x i32> %retval
}
define <4 x i64> @interleave2_v4i64(<2 x i64> %vec0, <2 x i64> %vec1) {
; CHECK-LABEL: interleave2_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 v2.2d, v0.2d, v1.2d
; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%retval = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %vec0, <2 x i64> %vec1)
ret <4 x i64> %retval
}
; Float declarations
declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>)
declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
; Integer declarations
declare <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)