clang-p2996/llvm/test/CodeGen/X86/bfloat.ll
Krzysztof Drewniak 70995a1a33 [ScalarizeMaskedMemIntr] Optimize splat non-constant masks (#104537)
In cases (like the ones added in the tests) where the condition of a
masked load or store is a splat but not a constant (that is, a masked
operation is being used to implement patterns like "load if the current
lane is in-bounds, otherwise return 0"), optimize the 'scalarized' code
to perform an aligned vector load/store if the splatted condition is true.

Additionally, take a few steps to preserve aliasing information and
names when nothing is scalarized while I'm here.

As motivation, some LLVM IR users will generate masked load/store in
cases that map to this kind of predicated operation (where either the
vector is loaded/stored or it isn't) in order to take advantage of
hardware primitives, but on AMDGPU, where we don't have a masked load or
store, this pass would scalarize a load or store that was intended to be
- and can be - vectorized while also introducing expensive branches.

Fixes #104520

Pre-commit tests at #104527
2024-08-16 16:24:25 -05:00
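
For readers unfamiliar with the pattern the commit describes, here is a minimal IR sketch (hypothetical function and value names, not taken from this test file) of the kind of masked load the pass now recognizes: the mask is a splat of a single non-constant i1, so the operation either loads the whole vector or yields the passthru value, and can be lowered to one branch guarding an ordinary aligned vector load rather than a branch per lane.

; Sketch only: %mask is a vector splat of the non-constant %in.bounds.
define <4 x float> @guarded_load(ptr %p, i1 %in.bounds) {
  %mask.head = insertelement <4 x i1> poison, i1 %in.bounds, i64 0
  %mask = shufflevector <4 x i1> %mask.head, <4 x i1> poison, <4 x i32> zeroinitializer
  ; Either the full <4 x float> at %p is loaded, or zeroinitializer is returned.
  %v = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 16, <4 x i1> %mask, <4 x float> zeroinitializer)
  ret <4 x float> %v
}
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)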


; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,BF16,AVXNC
define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-LABEL: add:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movzwl (%edx), %edx
; X86-NEXT: shll $16, %edx
; X86-NEXT: vmovd %edx, %xmm0
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: shll $16, %ecx
; X86-NEXT: vmovd %ecx, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vpextrw $0, %xmm0, (%eax)
; X86-NEXT: retl
;
; SSE2-LABEL: add:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %rdx, %rbx
; SSE2-NEXT: movzwl (%rsi), %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rbx)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
; F16-LABEL: add:
; F16: # %bb.0:
; F16-NEXT: movzwl (%rsi), %eax
; F16-NEXT: shll $16, %eax
; F16-NEXT: vmovd %eax, %xmm0
; F16-NEXT: movzwl (%rdi), %eax
; F16-NEXT: shll $16, %eax
; F16-NEXT: vmovd %eax, %xmm1
; F16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; F16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; F16-NEXT: vpextrw $0, %xmm0, (%rdx)
; F16-NEXT: retq
;
; AVXNC-LABEL: add:
; AVXNC: # %bb.0:
; AVXNC-NEXT: movzwl (%rsi), %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: movzwl (%rdi), %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vpextrw $0, %xmm0, (%rdx)
; AVXNC-NEXT: retq
%a = load bfloat, ptr %pa
%b = load bfloat, ptr %pb
%add = fadd bfloat %a, %b
store bfloat %add, ptr %pc
ret void
}
define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; X86-LABEL: add2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: vmovw %eax, %xmm0
; X86-NEXT: retl
;
; SSE2-LABEL: add2:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: pextrw $0, %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
; FP16-LABEL: add2:
; FP16: # %bb.0:
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: vmovw %xmm1, %ecx
; FP16-NEXT: shll $16, %ecx
; FP16-NEXT: vmovd %ecx, %xmm0
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: vmovw %eax, %xmm0
; FP16-NEXT: retq
;
; AVXNC-LABEL: add2:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: vpextrw $0, %xmm1, %ecx
; AVXNC-NEXT: shll $16, %ecx
; AVXNC-NEXT: vmovd %ecx, %xmm0
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vmovd %xmm0, %eax
; AVXNC-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVXNC-NEXT: retq
%add = fadd bfloat %a, %b
ret bfloat %add
}
define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-LABEL: add_double:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $16, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovw %xmm0, %edi
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: shll $16, %edi
; X86-NEXT: vmovd %edi, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esi)
; X86-NEXT: addl $16, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; SSE2-LABEL: add_double:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %rdx, %rbx
; SSE2-NEXT: movq %rsi, %r14
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movd %ebp, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: cvtss2sd %xmm0, %xmm0
; SSE2-NEXT: movsd %xmm0, (%rbx)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
; FP16-LABEL: add_double:
; FP16: # %bb.0:
; FP16-NEXT: pushq %rbp
; FP16-NEXT: pushq %r14
; FP16-NEXT: pushq %rbx
; FP16-NEXT: movq %rdx, %rbx
; FP16-NEXT: movq %rsi, %r14
; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovw %xmm0, %ebp
; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: shll $16, %ebp
; FP16-NEXT: vmovd %ebp, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; FP16-NEXT: vmovsd %xmm0, (%rbx)
; FP16-NEXT: popq %rbx
; FP16-NEXT: popq %r14
; FP16-NEXT: popq %rbp
; FP16-NEXT: retq
;
; AVXNC-LABEL: add_double:
; AVXNC: # %bb.0:
; AVXNC-NEXT: pushq %rbp
; AVXNC-NEXT: pushq %r14
; AVXNC-NEXT: pushq %rbx
; AVXNC-NEXT: movq %rdx, %rbx
; AVXNC-NEXT: movq %rsi, %r14
; AVXNC-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp
; AVXNC-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: shll $16, %ebp
; AVXNC-NEXT: vmovd %ebp, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vmovd %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVXNC-NEXT: vmovsd %xmm0, (%rbx)
; AVXNC-NEXT: popq %rbx
; AVXNC-NEXT: popq %r14
; AVXNC-NEXT: popq %rbp
; AVXNC-NEXT: retq
%la = load double, ptr %pa
%a = fptrunc double %la to bfloat
%lb = load double, ptr %pb
%b = fptrunc double %lb to bfloat
%add = fadd bfloat %a, %b
%dadd = fpext bfloat %add to double
store double %dadd, ptr %pc
ret void
}
define double @add_double2(double %da, double %db) nounwind {
; X86-LABEL: add_double2:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: subl $24, %esp
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovw %xmm0, %esi
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: shll $16, %esi
; X86-NEXT: vmovd %esi, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: fldl {{[0-9]+}}(%esp)
; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; SSE2-LABEL: add_double2:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $16, %rsp
; SSE2-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd %ebx, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: cvtss2sd %xmm0, %xmm0
; SSE2-NEXT: addq $16, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
; FP16-LABEL: add_double2:
; FP16: # %bb.0:
; FP16-NEXT: pushq %rbx
; FP16-NEXT: subq $16, %rsp
; FP16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovw %xmm0, %ebx
; FP16-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; FP16-NEXT: # xmm0 = mem[0],zero
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: shll $16, %ebx
; FP16-NEXT: vmovd %ebx, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; FP16-NEXT: addq $16, %rsp
; FP16-NEXT: popq %rbx
; FP16-NEXT: retq
;
; AVXNC-LABEL: add_double2:
; AVXNC: # %bb.0:
; AVXNC-NEXT: pushq %rbx
; AVXNC-NEXT: subq $16, %rsp
; AVXNC-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx
; AVXNC-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[0],zero
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: shll $16, %ebx
; AVXNC-NEXT: vmovd %ebx, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vmovd %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVXNC-NEXT: addq $16, %rsp
; AVXNC-NEXT: popq %rbx
; AVXNC-NEXT: retq
%a = fptrunc double %da to bfloat
%b = fptrunc double %db to bfloat
%add = fadd bfloat %a, %b
%dadd = fpext bfloat %add to double
ret double %dadd
}
define void @add_constant(ptr %pa, ptr %pc) nounwind {
; X86-LABEL: add_constant:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: shll $16, %ecx
; X86-NEXT: vmovd %ecx, %xmm0
; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vpextrw $0, %xmm0, (%eax)
; X86-NEXT: retl
;
; SSE2-LABEL: add_constant:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %rsi, %rbx
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rbx)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
; F16-LABEL: add_constant:
; F16: # %bb.0:
; F16-NEXT: movzwl (%rdi), %eax
; F16-NEXT: shll $16, %eax
; F16-NEXT: vmovd %eax, %xmm0
; F16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; F16-NEXT: vpextrw $0, %xmm0, (%rsi)
; F16-NEXT: retq
;
; AVXNC-LABEL: add_constant:
; AVXNC: # %bb.0:
; AVXNC-NEXT: movzwl (%rdi), %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVXNC-NEXT: retq
%a = load bfloat, ptr %pa
%add = fadd bfloat %a, 1.0
store bfloat %add, ptr %pc
ret void
}
define bfloat @add_constant2(bfloat %a) nounwind {
; X86-LABEL: add_constant2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: vmovw %eax, %xmm0
; X86-NEXT: retl
;
; SSE2-LABEL: add_constant2:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
; FP16-LABEL: add_constant2:
; FP16: # %bb.0:
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: vmovw %eax, %xmm0
; FP16-NEXT: retq
;
; AVXNC-LABEL: add_constant2:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vmovd %xmm0, %eax
; AVXNC-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVXNC-NEXT: retq
%add = fadd bfloat %a, 1.0
ret bfloat %add
}
define void @store_constant(ptr %pc) nounwind {
; X86-LABEL: store_constant:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movw $16256, (%eax) # imm = 0x3F80
; X86-NEXT: retl
;
; CHECK-LABEL: store_constant:
; CHECK: # %bb.0:
; CHECK-NEXT: movw $16256, (%rdi) # imm = 0x3F80
; CHECK-NEXT: retq
store bfloat 1.0, ptr %pc
ret void
}
define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind {
; X86-LABEL: fold_ext_trunc:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: movw %cx, (%eax)
; X86-NEXT: retl
;
; CHECK-LABEL: fold_ext_trunc:
; CHECK: # %bb.0:
; CHECK-NEXT: movzwl (%rdi), %eax
; CHECK-NEXT: movw %ax, (%rsi)
; CHECK-NEXT: retq
%a = load bfloat, ptr %pa
%ext = fpext bfloat %a to float
%trunc = fptrunc float %ext to bfloat
store bfloat %trunc, ptr %pc
ret void
}
define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
; X86-LABEL: fold_ext_trunc2:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
;
; CHECK-LABEL: fold_ext_trunc2:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
%ext = fpext bfloat %a to float
%trunc = fptrunc float %ext to bfloat
ret bfloat %trunc
}
define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; X86-LABEL: addv:
; X86: # %bb.0:
; X86-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X86-NEXT: vpslld $16, %ymm1, %ymm1
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpslld $16, %ymm0, %ymm0
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: addv:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r15
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %r13
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $56, %rsp
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: shrq $48, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %xmm1, %rdx
; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $48, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: shrq $32, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $32, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: movq %xmm0, %r15
; SSE2-NEXT: movq %r15, %rbx
; SSE2-NEXT: shrq $48, %rbx
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT: movq %xmm1, %r14
; SSE2-NEXT: movq %r14, %rbp
; SSE2-NEXT: shrq $48, %rbp
; SSE2-NEXT: movq %r15, %r12
; SSE2-NEXT: shrq $32, %r12
; SSE2-NEXT: movq %r14, %r13
; SSE2-NEXT: shrq $32, %r13
; SSE2-NEXT: movl %r14d, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movl %r15d, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT: andl $-65536, %r14d # imm = 0xFFFF0000
; SSE2-NEXT: movd %r14d, %xmm1
; SSE2-NEXT: andl $-65536, %r15d # imm = 0xFFFF0000
; SSE2-NEXT: movd %r15d, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %r15d
; SSE2-NEXT: shll $16, %r15d
; SSE2-NEXT: addl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
; SSE2-NEXT: shll $16, %r13d
; SSE2-NEXT: movd %r13d, %xmm1
; SSE2-NEXT: shll $16, %r12d
; SSE2-NEXT: movd %r12d, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movd %ebp, %xmm1
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd %ebx, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: orl %r14d, %ebx
; SSE2-NEXT: shlq $32, %rbx
; SSE2-NEXT: orq %r15, %rbx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; SSE2-NEXT: movl %r15d, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; SSE2-NEXT: movl %r14d, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebp
; SSE2-NEXT: movq %r15, %rax
; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movq %r14, %rax
; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %r14d
; SSE2-NEXT: shll $16, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebp
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movq %rbx, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: addq $56, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %r15
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
; F16-LABEL: addv:
; F16: # %bb.0:
; F16-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; F16-NEXT: vpslld $16, %ymm1, %ymm1
; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; F16-NEXT: vpslld $16, %ymm0, %ymm0
; F16-NEXT: vaddps %ymm1, %ymm0, %ymm0
; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT: vzeroupper
; F16-NEXT: retq
;
; AVXNC-LABEL: addv:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVXNC-NEXT: vpslld $16, %ymm1, %ymm1
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT: vpslld $16, %ymm0, %ymm0
; AVXNC-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: retq
%add = fadd <8 x bfloat> %a, %b
ret <8 x bfloat> %add
}
define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
; X86-LABEL: pr62997:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: retl
;
; SSE2-LABEL: pr62997:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
; BF16-LABEL: pr62997:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $0, %xmm0, %eax
; BF16-NEXT: vpextrw $0, %xmm1, %ecx
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
; BF16-NEXT: retq
;
; FP16-LABEL: pr62997:
; FP16: # %bb.0:
; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; FP16-NEXT: retq
%1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0
%2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1
ret <2 x bfloat> %2
}
define <32 x bfloat> @pr63017() {
; X86-LABEL: pr63017:
; X86: # %bb.0:
; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT: retl
;
; SSE2-LABEL: pr63017:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: xorps %xmm3, %xmm3
; SSE2-NEXT: retq
;
; F16-LABEL: pr63017:
; F16: # %bb.0:
; F16-NEXT: vxorps %xmm0, %xmm0, %xmm0
; F16-NEXT: retq
;
; AVXNC-LABEL: pr63017:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVXNC-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVXNC-NEXT: retq
ret <32 x bfloat> zeroinitializer
}
define <32 x bfloat> @pr63017_2() nounwind {
; X86-LABEL: pr63017_2:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT: retl
;
; SSE2-LABEL: pr63017_2:
; SSE2: # %bb.0:
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: testb %al, %al
; SSE2-NEXT: jne .LBB12_1
; SSE2-NEXT: # %bb.2: # %cond.load
; SSE2-NEXT: movzwl (%rax), %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: jmp .LBB12_3
; SSE2-NEXT: .LBB12_1:
; SSE2-NEXT: movd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE2-NEXT: .LBB12_3:
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $88, %rsp
; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE2-NEXT: addq $88, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; SSE2-NEXT: retq
;
; FP16-LABEL: pr63017_2:
; FP16: # %bb.0:
; FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
; FP16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1}
; FP16-NEXT: retq
;
; AVXNC-LABEL: pr63017_2:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vbroadcastss {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
; AVXNC-NEXT: xorl %eax, %eax
; AVXNC-NEXT: testb %al, %al
; AVXNC-NEXT: jne .LBB12_2
; AVXNC-NEXT: # %bb.1: # %cond.load
; AVXNC-NEXT: vmovups (%rax), %ymm0
; AVXNC-NEXT: .LBB12_2:
; AVXNC-NEXT: vmovaps %ymm0, %ymm1
; AVXNC-NEXT: retq
%1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
ret <32 x bfloat> %1
}
define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
; X86-LABEL: pr62997_3:
; X86: # %bb.0:
; X86-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
; X86-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; X86-NEXT: retl
;
; SSE2-LABEL: pr62997_3:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; SSE2-NEXT: andq %rax, %rcx
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: pextrw $0, %xmm4, %edx
; SSE2-NEXT: shll $16, %edx
; SSE2-NEXT: orl %eax, %edx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm4
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: retq
;
; FP16-LABEL: pr62997_3:
; FP16: # %bb.0:
; FP16-NEXT: vmovw %xmm1, %eax
; FP16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1
; FP16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; FP16-NEXT: retq
;
; AVXNC-LABEL: pr62997_3:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vpextrw $0, %xmm2, %eax
; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm2
; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT: retq
%3 = insertelement <32 x bfloat> %0, bfloat %1, i64 1
ret <32 x bfloat> %3
}
declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)
define <4 x float> @pr64460_1(<4 x bfloat> %a) {
; X86-LABEL: pr64460_1:
; X86: # %bb.0:
; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT: retl
;
; SSE2-LABEL: pr64460_1:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pr64460_1:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: retq
%b = fpext <4 x bfloat> %a to <4 x float>
ret <4 x float> %b
}
define <8 x float> @pr64460_2(<8 x bfloat> %a) {
; X86-LABEL: pr64460_2:
; X86: # %bb.0:
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpslld $16, %ymm0, %ymm0
; X86-NEXT: retl
;
; SSE2-LABEL: pr64460_2:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pr64460_2:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpslld $16, %ymm0, %ymm0
; AVX-NEXT: retq
%b = fpext <8 x bfloat> %a to <8 x float>
ret <8 x float> %b
}
define <16 x float> @pr64460_3(<16 x bfloat> %a) {
; X86-LABEL: pr64460_3:
; X86: # %bb.0:
; X86-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; X86-NEXT: vpslld $16, %zmm0, %zmm0
; X86-NEXT: retl
;
; SSE2-LABEL: pr64460_3:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: retq
;
; F16-LABEL: pr64460_3:
; F16: # %bb.0:
; F16-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; F16-NEXT: vpslld $16, %zmm0, %zmm0
; F16-NEXT: retq
;
; AVXNC-LABEL: pr64460_3:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT: vpslld $16, %ymm1, %ymm2
; AVXNC-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT: vpslld $16, %ymm0, %ymm1
; AVXNC-NEXT: vmovdqa %ymm2, %ymm0
; AVXNC-NEXT: retq
%b = fpext <16 x bfloat> %a to <16 x float>
ret <16 x float> %b
}
define <8 x double> @pr64460_4(<8 x bfloat> %a) {
; X86-LABEL: pr64460_4:
; X86: # %bb.0:
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpslld $16, %ymm0, %ymm0
; X86-NEXT: vcvtps2pd %ymm0, %zmm0
; X86-NEXT: retl
;
; SSE2-LABEL: pr64460_4:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: cvtps2pd %xmm1, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: cvtps2pd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtps2pd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: cvtps2pd %xmm0, %xmm3
; SSE2-NEXT: movaps %xmm4, %xmm0
; SSE2-NEXT: retq
;
; F16-LABEL: pr64460_4:
; F16: # %bb.0:
; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; F16-NEXT: vpslld $16, %ymm0, %ymm0
; F16-NEXT: vcvtps2pd %ymm0, %zmm0
; F16-NEXT: retq
;
; AVXNC-LABEL: pr64460_4:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT: vpslld $16, %ymm0, %ymm1
; AVXNC-NEXT: vcvtps2pd %xmm1, %ymm0
; AVXNC-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVXNC-NEXT: vcvtps2pd %xmm1, %ymm1
; AVXNC-NEXT: retq
%b = fpext <8 x bfloat> %a to <8 x double>
ret <8 x double> %b
}
define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind {
; X86-LABEL: fptrunc_v4f32:
; X86: # %bb.0:
; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: fptrunc_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: subq $72, %rsp
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: addq $72, %rsp
; SSE2-NEXT: retq
;
; F16-LABEL: fptrunc_v4f32:
; F16: # %bb.0:
; F16-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT: vzeroupper
; F16-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v4f32:
; AVXNC: # %bb.0:
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: retq
%b = fptrunc <4 x float> %a to <4 x bfloat>
ret <4 x bfloat> %b
}
define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind {
; X86-LABEL: fptrunc_v8f32:
; X86: # %bb.0:
; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: fptrunc_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $32, %rsp
; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebx
; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: shlq $32, %rbx
; SSE2-NEXT: orq %r14, %rbx
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movq %rbx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: addq $32, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
; F16-LABEL: fptrunc_v8f32:
; F16: # %bb.0:
; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT: vzeroupper
; F16-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v8f32:
; AVXNC: # %bb.0:
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: retq
%b = fptrunc <8 x float> %a to <8 x bfloat>
ret <8 x bfloat> %b
}
define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
; X86-LABEL: fptrunc_v16f32:
; X86: # %bb.0:
; X86-NEXT: vcvtneps2bf16 %zmm0, %ymm0
; X86-NEXT: retl
;
; SSE2-LABEL: fptrunc_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r15
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $64, %rsp
; SSE2-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebx
; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: shlq $32, %rbx
; SSE2-NEXT: orq %r14, %rbx
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r15d
; SSE2-NEXT: orl %ebp, %r15d
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: shlq $32, %r14
; SSE2-NEXT: orq %r15, %r14
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r12d
; SSE2-NEXT: orl %ebp, %r12d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r15d
; SSE2-NEXT: orl %ebp, %r15d
; SSE2-NEXT: shlq $32, %r15
; SSE2-NEXT: orq %r12, %r15
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r12d
; SSE2-NEXT: orl %ebp, %r12d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r12, %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movq %r15, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movq %r14, %xmm2
; SSE2-NEXT: movq %rbx, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: addq $64, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %r15
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
; F16-LABEL: fptrunc_v16f32:
; F16: # %bb.0:
; F16-NEXT: vcvtneps2bf16 %zmm0, %ymm0
; F16-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v16f32:
; AVXNC: # %bb.0:
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXNC-NEXT: retq
%b = fptrunc <16 x float> %a to <16 x bfloat>
ret <16 x bfloat> %b
}
define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; X86-LABEL: fptrunc_v8f64:
; X86: # %bb.0:
; X86-NEXT: subl $204, %esp
; X86-NEXT: vmovups %zmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 64-byte Spill
; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovhps %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT: vmovhps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovhps %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovhps %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; X86-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-NEXT: addl $204, %esp
; X86-NEXT: retl
;
; SSE2-LABEL: fptrunc_v8f64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $64, %rsp
; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebx
; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: shlq $32, %rbx
; SSE2-NEXT: orq %r14, %rbx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movq %rbx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: addq $64, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
; FP16-LABEL: fptrunc_v8f64:
; FP16: # %bb.0:
; FP16-NEXT: subq $184, %rsp
; FP16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; FP16-NEXT: vextractf128 $1, %ymm0, %xmm0
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; FP16-NEXT: # xmm0 = mem[1,0]
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; FP16-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; FP16-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; FP16-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; FP16-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; FP16-NEXT: addq $184, %rsp
; FP16-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v8f64:
; AVXNC: # %bb.0:
; AVXNC-NEXT: pushq %rbp
; AVXNC-NEXT: pushq %r15
; AVXNC-NEXT: pushq %r14
; AVXNC-NEXT: pushq %r13
; AVXNC-NEXT: pushq %r12
; AVXNC-NEXT: pushq %rbx
; AVXNC-NEXT: subq $168, %rsp
; AVXNC-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVXNC-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r14d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r15d
; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r12d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r13d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: vmovd %ebx, %xmm0
; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVXNC-NEXT: addq $168, %rsp
; AVXNC-NEXT: popq %rbx
; AVXNC-NEXT: popq %r12
; AVXNC-NEXT: popq %r13
; AVXNC-NEXT: popq %r14
; AVXNC-NEXT: popq %r15
; AVXNC-NEXT: popq %rbp
; AVXNC-NEXT: retq
%b = fptrunc <8 x double> %a to <8 x bfloat>
ret <8 x bfloat> %b
}
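; Broadcast a loaded <8 x bfloat> into every 128-bit lane of a <32 x bfloat>.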
define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
; X86-LABEL: test_v8bf16_v32bf16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; SSE2-LABEL: test_v8bf16_v32bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: movaps %xmm0, %xmm3
; SSE2-NEXT: retq
;
; F16-LABEL: test_v8bf16_v32bf16:
; F16: # %bb.0:
; F16-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; F16-NEXT: retq
;
; AVXNC-LABEL: test_v8bf16_v32bf16:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVXNC-NEXT: vmovaps %ymm0, %ymm1
; AVXNC-NEXT: retq
%2 = load <8 x bfloat>, ptr %0, align 16
%3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <32 x bfloat> %3
}
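; Concatenate two <8 x bfloat> values into a <16 x bfloat>.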
define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; X86-LABEL: concat_v8bf16:
; X86: # %bb.0:
; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; SSE2-LABEL: concat_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: retq
;
; AVX-LABEL: concat_v8bf16:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
%a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x bfloat> %a
}
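; Extract elements 8-15 (the second 128-bit lane) of a <32 x bfloat>.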
define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) {
; X86-LABEL: extract_v32bf16_v8bf16:
; X86: # %bb.0:
; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: extract_v32bf16_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $0, %xmm1, %eax
; SSE2-NEXT: pextrw $1, %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: pextrw $2, %xmm1, %eax
; SSE2-NEXT: pextrw $3, %xmm1, %edx
; SSE2-NEXT: shll $16, %edx
; SSE2-NEXT: orl %eax, %edx
; SSE2-NEXT: shlq $32, %rdx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: pextrw $4, %xmm1, %eax
; SSE2-NEXT: pextrw $5, %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: pextrw $6, %xmm1, %eax
; SSE2-NEXT: pextrw $7, %xmm1, %esi
; SSE2-NEXT: shll $16, %esi
; SSE2-NEXT: orl %eax, %esi
; SSE2-NEXT: shlq $32, %rsi
; SSE2-NEXT: orq %rcx, %rsi
; SSE2-NEXT: movq %rsi, %xmm1
; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; AVX-LABEL: extract_v32bf16_v8bf16:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%a = shufflevector <32 x bfloat> %x, <32 x bfloat> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x bfloat> %a
}
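; Concatenate an <8 x bfloat> with zeros to form a <16 x bfloat>; %y is unused.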
define <16 x bfloat> @concat_zero_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; X86-LABEL: concat_zero_v8bf16:
; X86: # %bb.0:
; X86-NEXT: vmovaps %xmm0, %xmm0
; X86-NEXT: retl
;
; SSE2-LABEL: concat_zero_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: retq
;
; AVX-LABEL: concat_zero_v8bf16:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <8 x bfloat> %x, <8 x bfloat> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x bfloat> %a
}
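; Duplicate the low four elements of %x, then concatenate the result with %y.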
define <16 x bfloat> @concat_dup_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; X86-LABEL: concat_dup_v8bf16:
; X86: # %bb.0:
; X86-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; SSE2-LABEL: concat_dup_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; AVX-LABEL: concat_dup_v8bf16:
; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
%a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x bfloat> %a
}
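; Round-trip a float through bfloat: fptrunc followed by fpext.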
define float @trunc_ext(float %a) nounwind {
; X86-LABEL: trunc_ext:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vmovd %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
;
; SSE2-LABEL: trunc_ext:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
; FP16-LABEL: trunc_ext:
; FP16: # %bb.0:
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: retq
;
; AVXNC-LABEL: trunc_ext:
; AVXNC: # %bb.0:
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vmovd %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: retq
%b = fptrunc float %a to bfloat
%c = fpext bfloat %b to float
ret float %c
}
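; Extend a <7 x bfloat> load to <7 x float> and store the result.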
define void @PR92471(ptr %0, ptr %1) nounwind {
; X86-LABEL: PR92471:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, 4(%ecx), %xmm0, %xmm0
; X86-NEXT: vpinsrd $2, 8(%ecx), %xmm0, %xmm0
; X86-NEXT: vpinsrw $6, 12(%ecx), %xmm0, %xmm0
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpslld $16, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpextrd $2, %xmm1, 24(%eax)
; X86-NEXT: vpextrd $1, %xmm1, 20(%eax)
; X86-NEXT: vmovd %xmm1, 16(%eax)
; X86-NEXT: vmovdqu %xmm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: PR92471:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pinsrw $2, 12(%rdi), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: movdqu %xmm2, (%rsi)
; SSE2-NEXT: movq %xmm3, 16(%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 24(%rsi)
; SSE2-NEXT: retq
;
; AVX-LABEL: PR92471:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm0
; AVX-NEXT: vpinsrw $6, 12(%rdi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpslld $16, %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vpextrd $2, %xmm1, 24(%rsi)
; AVX-NEXT: vmovq %xmm1, 16(%rsi)
; AVX-NEXT: vmovdqu %xmm0, (%rsi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%3 = load <7 x bfloat>, ptr %0, align 2
%4 = fpext <7 x bfloat> %3 to <7 x float>
store <7 x float> %4, ptr %1, align 4
ret void
}