We can process long shuffles (those working across several actual vector registers) in the best way if we take the actual register representation into account. We can build a more correct representation of register shuffles and improve the number of recognised buildvector sequences. Also, the same function can be used to improve the cost model for the shuffles in future patches. Part of D100486 Differential Revision: https://reviews.llvm.org/D115653
32 lines
1.0 KiB
LLVM
32 lines
1.0 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s
|
|
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s
|
|
|
|
; Codegen regression case: returns a <4 x i64> that is all zero except lane 2,
; which is taken from a never-defined lane of another vector. The value is
; computed inside a block that branches back to itself. The assertions below
; pin the AVX lowering for both llc invocations: vperm2f128, vxorpd and vshufpd
; are expected ahead of the loop label, followed by the loop itself.
define <4 x i64> @autogen_SD88863() {
; CHECK-LABEL: autogen_SD88863:
; CHECK: # %bb.0: # %BB
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %CF
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.2: # %CF240
; CHECK-NEXT: ret{{[l|q]}}
BB:
  ; Inserts undef into lane 2 of an undef vector, so every lane stays undef.
  %I26 = insertelement <4 x i64> undef, i64 undef, i32 2
  br label %CF

CF:
  ; Lane 1 of %I26 was never given a defined value.
  %E66 = extractelement <4 x i64> %I26, i32 1
  ; Zero vector with lane 2 replaced by %E66; this is the value returned.
  %I68 = insertelement <4 x i64> zeroinitializer, i64 %E66, i32 2
  ; Both operands are the constant 0, so the compare is always true and the
  ; branch below always targets %CF again.
  %Cmp72 = icmp eq i32 0, 0
  br i1 %Cmp72, label %CF, label %CF240

CF240:
  ret <4 x i64> %I68
}
|