VSX introduced permute instructions that are direct replacements for the Altivec ones, except that they can target all of the VSX registers. We have added code generation for most of these but missed the low/high word merges (XXMRG[LH]W). This caused additional spills in some large, computationally intensive code. This patch simply adds the missing patterns.
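For context, the selection opportunity the new patterns cover is an interleaving shufflevector of 32-bit words. Below is a minimal illustrative sketch, not taken from the patch (the function name and mask are made up for this note), of the kind of shuffle one would expect to select a single word-merge instruction once the XXMRG[LH]W patterns are in place:

; Hypothetical example, not part of the patch's test: interleaving the first
; two words of %a and %b ({a0, b0, a1, b1}) is the canonical merge-words
; shuffle that XXMRG[LH]W implements (which of the two forms is selected
; depends on endianness).
define <4 x i32> @merge_words(<4 x i32> %a, <4 x i32> %b) {
entry:
  %merge = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x i32> %merge
}

The test added with the patch checks the analogous little-endian case, where the interleave of a two-element load lowers to xxmrglw: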
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown \
; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s

define dso_local void @_Z1jjPiPj() local_unnamed_addr #0 {
; CHECK-LABEL: _Z1jjPiPj:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ld r3, 0(r3)
; CHECK-NEXT:    std r3, -16(r1)
; CHECK-NEXT:    addi r3, r1, -16
; CHECK-NEXT:    lxvd2x vs0, 0, r3
; CHECK-NEXT:    xxswapd vs0, vs0
; CHECK-NEXT:    xxmrglw vs0, vs0, vs0
; CHECK-NEXT:    xxswapd vs0, vs0
; CHECK-NEXT:    stxvd2x vs0, 0, r3
; CHECK-NEXT:    blr
entry:
  %wide.load42 = load <2 x i32>, <2 x i32>* undef, align 4
  %interleaved.vec49 = shufflevector <2 x i32> %wide.load42, <2 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i32> %interleaved.vec49, <4 x i32>* undef, align 4
  ret void
}