clang-p2996/llvm/test/CodeGen/X86/vec_insert-mmx.ll
Craig Topper ec096a1dae [X86] Custom type legalize v2i32/v4i16/v8i8->i64 bitcasts in 64-bit mode similar to what's done when the destination is f64.
The generic legalizer will fall back to a stack spill that uses a truncating store. That store will get expanded into a shuffle and a non-truncating store on pre-AVX512 targets. Once that happens, the stack store/load pair will be combined away, leaving behind the shuffle and bitcasts. On AVX512 targets the truncating store is legal, so it doesn't get folded away.

By custom legalizing it, we can avoid this churn and maybe produce better code.

llvm-svn: 348085
2018-12-02 05:46:48 +00:00
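
To make the pattern concrete, here is a minimal sketch of the kind of cast the commit custom-legalizes (illustrative only, not part of the test file below; the function name is made up): a bitcast from an illegal 64-bit vector type to i64 in 64-bit mode. Before this change the generic legalizer sent such a cast through a stack spill, which on pre-AVX512 targets left behind shuffle/bitcast churn; with the custom legalization it can lower directly, e.g. as a single movq out of an XMM register on SSE2 targets.

; Illustrative sketch, not from vec_insert-mmx.ll: a v2i32->i64 bitcast
; of the kind this commit custom type-legalizes in 64-bit mode.
define i64 @cast_v2i32_to_i64(<2 x i32> %v) nounwind {
  %r = bitcast <2 x i32> %v to i64
  ret i64 %r
}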


; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-darwin -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s --check-prefix=X64

; This is not an MMX operation; promoted to xmm.
define x86_mmx @t0(i32 %A) nounwind {
; X32-LABEL: t0:
; X32: ## %bb.0:
; X32-NEXT: movd {{[0-9]+}}(%esp), %mm1
; X32-NEXT: pxor %mm0, %mm0
; X32-NEXT: punpckldq %mm1, %mm0 ## mm0 = mm0[0],mm1[0]
; X32-NEXT: retl
;
; X64-LABEL: t0:
; X64: ## %bb.0:
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; X64-NEXT: retq
  %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
  %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
  ret x86_mmx %tmp4
}

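; Insert a zero-extended i8 into lane 0 of an undef <8 x i8>; lowers to a scalar move.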
define <8 x i8> @t1(i8 zeroext %x) nounwind {
; X32-LABEL: t1:
; X32: ## %bb.0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: t1:
; X64: ## %bb.0:
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: retq
  %r = insertelement <8 x i8> undef, i8 %x, i32 0
  ret <8 x i8> %r
}

; PR2574
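; Inserting zero into both lanes folds to a single xorps.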
define <2 x float> @t2(<2 x float> %a0) {
; X32-LABEL: t2:
; X32: ## %bb.0:
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t2:
; X64: ## %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: retq
  %v1 = insertelement <2 x float> %a0, float 0.000000e+00, i32 0
  %v2 = insertelement <2 x float> %v1, float 0.000000e+00, i32 1
  ret <2 x float> %v2
}

@g0 = external global i16
@g1 = external global <4 x i16>
; PR2562
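; Insert a scalar i16 loaded from @g0 into lane 0 of the <4 x i16> at @g1 and store it back.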
define void @t3() {
; X32-LABEL: t3:
; X32: ## %bb.0:
; X32-NEXT: movl L_g0$non_lazy_ptr, %eax
; X32-NEXT: movl L_g1$non_lazy_ptr, %ecx
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-NEXT: movzwl (%eax), %eax
; X32-NEXT: movd %eax, %xmm1
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT: movq %xmm0, (%ecx)
; X32-NEXT: retl
;
; X64-LABEL: t3:
; X64: ## %bb.0:
; X64-NEXT: movq _g0@{{.*}}(%rip), %rax
; X64-NEXT: movq _g1@{{.*}}(%rip), %rcx
; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-NEXT: movzwl (%rax), %eax
; X64-NEXT: pinsrd $0, %eax, %xmm0
; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X64-NEXT: movq %xmm0, (%rcx)
; X64-NEXT: retq
  %scalar = load i16, i16* @g0
  %vec = load <4 x i16>, <4 x i16>* @g1
  %ins = insertelement <4 x i16> %vec, i16 %scalar, i32 0
  store <4 x i16> %ins, <4 x i16>* @g1
  ret void
}