clang-p2996/llvm/test/CodeGen/X86/haddsub-broadcast.ll
Simon Pilgrim e9f9467da0 [X86] X86FixupInstTunings - add VPERMILPDri -> VSHUFPDrri mapping
Similar to the original VPERMILPSri -> VSHUFPSrri mapping added in D143787, replacing VPERMILPDri with VSHUFPDrri should never be any slower and saves an encoding byte.
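
As a rough illustration, not taken from the commit itself: for a simple in-lane swap the two forms below produce identical results, but VSHUFPD is encoded in the 0F opcode map and can use a 2-byte VEX prefix, whereas the immediate form of VPERMILPD sits in the 0F3A map and always requires a 3-byte VEX prefix - hence the saved byte. Register choices here are arbitrary.

    vpermilpd $1, %xmm0, %xmm1          # xmm1 = xmm0[1,0]   VEX.66.0F3A 05 /r ib (3-byte VEX)
    vshufpd   $1, %xmm0, %xmm0, %xmm1   # xmm1 = xmm0[1,0]   VEX.66.0F   C6 /r ib (2-byte VEX)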

The sibling VPERMILPDmi -> VPSHUFDmi mapping is trickier, as we need the same shuffle mask in every lane (and the immediate needs to be adjusted); I haven't attempted that yet, but we can investigate it in the future if there's interest.
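
Purely for illustration (this is not implemented by the change): the adjustment would expand each qword selector bit of the VPERMILPD immediate into a pair of dword selectors for VPSHUFD, and a 256-bit VPERMILPD is only convertible when its low-lane and high-lane bits agree, because VPSHUFD applies one 8-bit mask to both 128-bit lanes. Assuming a hypothetical per-lane qword swap with a memory source:

    vpermilpd $5,    (%eax), %ymm0   # imm 0b0101: swap the two qwords within each 128-bit lane
    vpshufd   $0x4e, (%eax), %ymm0   # imm selects dwords [2,3,0,1] per lane: the same swap, bit for bit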

Fixes #61060

Differential Revision: https://reviews.llvm.org/D148999
2023-04-23 11:48:50 +01:00

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-- -mattr=avx2 | FileCheck %s

; The broadcast node takes a vector operand as input and changes its length.
define <4 x double> @PR43402(i64 %x) {
; CHECK-LABEL: PR43402:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    retl
  %conv = uitofp i64 %x to double
  %t2 = insertelement <4 x double> undef, double %conv, i32 0
  %t3 = shufflevector <4 x double> %t2, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %t3
}