For now, we hardcode the result as 0.0 if the input is denormal or 0. That has an impact on the precision. As the added fsqrt belongs to the cold path of the cmp+branch, it won't impact the performance for normal inputs on PowerPC, but it improves the precision if the input is denormal. Reviewed By: Spatel Differential Revision: https://reviews.llvm.org/D80974
45 lines
1.4 KiB
LLVM
45 lines
1.4 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx -disable-ppc-vsx-fma-mutation=false | FileCheck %s
|
|
|
|
declare double @llvm.sqrt.f64(double)
|
|
|
|
; Test several VSX FMA mutation opportunities.
|
|
|
|
; This is a reasonable transformation since it eliminates an extra register copy.
|
|
; foo3_fmf: double-precision square root requested with the reassoc, afn and
; ninf fast-math flags.
;
; The expected code first tests the input with xstsqrtdp and conditionally
; branches to .LBB0_2:
;   * fall-through block (%bb.1): starts from the xsrsqrtedp estimate and
;     refines it with a sequence of multiplies and fused multiply-adds, using
;     two constants loaded from the TOC (.LCPI0_0 and .LCPI0_1);
;   * .LBB0_2: computes the result with a single xssqrtdp instead.
; NOTE(review): the branch to .LBB0_2 is presumably taken for the zero/denormal
; inputs that the estimate sequence cannot handle precisely - confirm against
; the PowerPC sqrt-estimate lowering.
;
; The assertions inside the function body are autogenerated; regenerate them
; with utils/update_llc_test_checks.py rather than editing them by hand.
define double @foo3_fmf(double %a) nounwind {
|
|
; CHECK-LABEL: foo3_fmf:
|
|
; CHECK: # %bb.0:
|
|
; CHECK-NEXT: xstsqrtdp 0, 1
|
|
; CHECK-NEXT: bc 12, 2, .LBB0_2
|
|
; CHECK-NEXT: # %bb.1:
|
|
; CHECK-NEXT: xsrsqrtedp 0, 1
|
|
; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha
|
|
; CHECK-NEXT: lfs 3, .LCPI0_0@toc@l(3)
|
|
; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha
|
|
; CHECK-NEXT: lfs 4, .LCPI0_1@toc@l(3)
|
|
; CHECK-NEXT: xsmuldp 2, 1, 0
|
|
; CHECK-NEXT: xsmaddmdp 2, 0, 3
|
|
; CHECK-NEXT: xsmuldp 0, 0, 4
|
|
; CHECK-NEXT: xsmuldp 0, 0, 2
|
|
; CHECK-NEXT: xsmuldp 1, 1, 0
|
|
; CHECK-NEXT: xsmaddadp 3, 1, 0
|
|
; CHECK-NEXT: xsmuldp 0, 1, 4
|
|
; CHECK-NEXT: xsmuldp 1, 0, 3
|
|
; CHECK-NEXT: blr
|
|
; CHECK-NEXT: .LBB0_2:
|
|
; CHECK-NEXT: xssqrtdp 1, 1
|
|
; CHECK-NEXT: blr
|
|
; The fast-math flags on this call are the only difference from foo3_safe.
%r = call reassoc afn ninf double @llvm.sqrt.f64(double %a)
|
|
ret double %r
|
|
}
|
|
|
|
; foo3_safe: the same square root as foo3_fmf but with no fast-math flags on
; the call. The expected code is a single xssqrtdp followed by the return,
; with no test-and-branch and no estimate sequence.
;
; The assertions inside the function body are autogenerated; regenerate them
; with utils/update_llc_test_checks.py rather than editing them by hand.
define double @foo3_safe(double %a) nounwind {
|
|
; CHECK-LABEL: foo3_safe:
|
|
; CHECK: # %bb.0:
|
|
; CHECK-NEXT: xssqrtdp 1, 1
|
|
; CHECK-NEXT: blr
|
|
%r = call double @llvm.sqrt.f64(double %a)
|
|
ret double %r
|
|
}
|
|
|