IR typically creates INSERT_SUBVECTOR patterns as a widening of the subvector with undefs to pad to the destination size, followed by a shuffle for the actual insertion - SelectionDAGBuilder has to do something similar for shuffles when source/destination vectors are different sizes. This combine attempts to recognize these patterns by looking for a shuffle of a subvector (from a CONCAT_VECTORS) that starts at an offset that is a multiple of its size into an otherwise identity shuffle of the base vector. This uncovered a couple of target-specific issues, as we haven't often created INSERT_SUBVECTOR nodes in generic code - AArch64 could only handle insertions into the bottom of undefs (i.e. a vector widening), and x86-avx512 vXi1 insertion wasn't keeping track of undef elements in the base vector. Fixes PR50053. Differential Revision: https://reviews.llvm.org/D107068
61 lines
3.5 KiB
LLVM
61 lines
3.5 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 -O0 | FileCheck %s
|
|
|
|
; Test function: builds two <16 x i64> vectors with constant-mask selects and
; returns a two-input shufflevector of them. %arg4 is declared but never
; referenced in the body. The assembly assertions interleaved below are
; autogenerated (see the NOTE at the top of the file); regenerate them with the
; update script rather than editing them by hand.
define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <16 x i64> %arg3, <16 x i64> %arg4) {
|
|
; CHECK-LABEL: pluto:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: pushq %rbp
|
|
; CHECK-NEXT: .cfi_def_cfa_offset 16
|
|
; CHECK-NEXT: .cfi_offset %rbp, -16
|
|
; CHECK-NEXT: movq %rsp, %rbp
|
|
; CHECK-NEXT: .cfi_def_cfa_register %rbp
|
|
; CHECK-NEXT: andq $-32, %rsp
|
|
; CHECK-NEXT: subq $32, %rsp
|
|
; CHECK-NEXT: vmovaps %ymm4, %ymm10
|
|
; CHECK-NEXT: vmovaps %ymm3, %ymm9
|
|
; CHECK-NEXT: vmovaps %ymm1, %ymm8
|
|
; CHECK-NEXT: vmovaps %ymm0, %ymm4
|
|
; CHECK-NEXT: vmovaps 240(%rbp), %ymm1
|
|
; CHECK-NEXT: vmovaps 208(%rbp), %ymm3
|
|
; CHECK-NEXT: vmovaps 176(%rbp), %ymm0
|
|
; CHECK-NEXT: vmovaps 144(%rbp), %ymm0
|
|
; CHECK-NEXT: vmovaps 112(%rbp), %ymm11
|
|
; CHECK-NEXT: vmovaps 80(%rbp), %ymm11
|
|
; CHECK-NEXT: vmovaps 48(%rbp), %ymm11
|
|
; CHECK-NEXT: vmovaps 16(%rbp), %ymm11
|
|
; CHECK-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm2[6,7]
|
|
; CHECK-NEXT: vmovaps %xmm3, %xmm6
|
|
; CHECK-NEXT: # implicit-def: $ymm2
|
|
; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
|
|
; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
|
|
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,0]
|
|
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
|
|
; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2
|
|
; CHECK-NEXT: vmovq {{.*#+}} xmm6 = xmm2[0],zero
|
|
; CHECK-NEXT: # implicit-def: $ymm2
|
|
; CHECK-NEXT: vmovaps %xmm6, %xmm2
|
|
; CHECK-NEXT: # kill: def $xmm4 killed $xmm4 killed $ymm4
|
|
; CHECK-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
|
|
; CHECK-NEXT: vmovaps %xmm7, %xmm4
|
|
; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
|
|
; CHECK-NEXT: # implicit-def: $ymm4
|
|
; CHECK-NEXT: vmovaps %xmm6, %xmm4
|
|
; CHECK-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
|
|
; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3]
|
|
; CHECK-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
|
|
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7]
|
|
; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
|
|
; CHECK-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,1,4,5,4,5]
|
|
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
|
|
; CHECK-NEXT: movq %rbp, %rsp
|
|
; CHECK-NEXT: popq %rbp
|
|
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
; %tmp: lanes 10 and 11 come from %arg, every other lane from %arg1.
%tmp = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <16 x i64> %arg, <16 x i64> %arg1
|
|
; %tmp5: lanes 0, 3, 4, 7 and 9 come from %arg2, every other lane is zero.
%tmp5 = select <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x i64> %arg2, <16 x i64> zeroinitializer
|
|
; %tmp6: lanes 1, 2, 3, 8, 9, 13, 14 and 15 come from %arg3, the rest from %tmp5.
%tmp6 = select <16 x i1> <i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true>, <16 x i64> %arg3, <16 x i64> %tmp5
|
|
; %tmp7: mask indices 0-15 pick lanes of %tmp, indices 16-31 pick lanes of %tmp6.
%tmp7 = shufflevector <16 x i64> %tmp, <16 x i64> %tmp6, <16 x i32> <i32 11, i32 18, i32 24, i32 9, i32 14, i32 29, i32 29, i32 6, i32 14, i32 28, i32 8, i32 9, i32 22, i32 12, i32 25, i32 6>
|
|
ret <16 x i64> %tmp7
|
|
}
|