This build vector lowering pattern came up in D79886. I've tried to limit the improvement to cases where it looks clearly better to load, but we could remove the 'TODO' predicates already if we are willing to overlook some corner cases. Differential Revision: https://reviews.llvm.org/D80013
42 lines
1.5 KiB
LLVM
42 lines
1.5 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
;
; Each function in this file feeds an SSE2 shift intrinsic a count vector whose
; only defined element is the constant 14 (the remaining lanes are undef). The
; assertions record how that count operand is materialized for the 32-bit
; (i686) and 64-bit (x86_64) runs above.

; t1: shifts %b1, viewed as <8 x i16>, with @llvm.x86.sse2.psrl.w. The count
; operand is the constant <4 x i32> <14, undef, undef, undef> bitcast to
; <8 x i16>. The assertions expect psrlw to take the count directly from a
; memory operand (an .LCPI label on i686, a %rip-relative address on x86_64),
; with no separate instruction building the count in a register.
; The second argument %c is not used by the body.
define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
; X32-LABEL: t1:
; X32: # %bb.0:
; X32-NEXT: psrlw {{\.LCPI.*}}, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t1:
; X64: # %bb.0:
; X64-NEXT: psrlw {{.*}}(%rip), %xmm0
; X64-NEXT: retq
  %tmp1 = bitcast <2 x i64> %b1 to <8 x i16>
  %tmp2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w( <8 x i16> %tmp1, <8 x i16> bitcast (<4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > to <8 x i16>) ) nounwind readnone
  %tmp3 = bitcast <8 x i16> %tmp2 to <2 x i64>
  ret <2 x i64> %tmp3
}

; t2: shifts %b1, viewed as <4 x i32>, with @llvm.x86.sse2.psll.d and returns
; the <4 x i32> result without casting it back. The count operand is the
; constant <4 x i32> <14, undef, undef, undef>. The assertions expect the same
; sequence on both targets: the count is built in a register (movl $14 into
; %eax, movd into %xmm1) and then used by pslld.
; The second argument %c is not used by the body.
define <4 x i32> @t2(<2 x i64> %b1, <2 x i64> %c) nounwind {
; X32-LABEL: t2:
; X32: # %bb.0:
; X32-NEXT: movl $14, %eax
; X32-NEXT: movd %eax, %xmm1
; X32-NEXT: pslld %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t2:
; X64: # %bb.0:
; X64-NEXT: movl $14, %eax
; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: pslld %xmm1, %xmm0
; X64-NEXT: retq
  %tmp1 = bitcast <2 x i64> %b1 to <4 x i32>
  %tmp2 = tail call <4 x i32> @llvm.x86.sse2.psll.d( <4 x i32> %tmp1, <4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > ) nounwind readnone
  ret <4 x i32> %tmp2
}

; SSE2 shift intrinsics called by the test functions in this file. Each takes
; the vector to shift and a count vector of the same type.
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone