; Commit note: v8f16 is a legal type but promoting to v16f16 would result in an
; illegal type. Let's legalize these by a combination of splitting+promoting,
; resulting in a pair of v4f16. Also, we were being overly cautious with
; different v4f16 nodes. Mark more of them safe to promote to v4f32.
; (file viewer metadata: 622 lines, 24 KiB, LLVM)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Four llc configurations are exercised: the default instruction selector and
; -global-isel, each with and without -mattr=+fullfp16. Every run selects a
; shared prefix plus progressively more specific ones.
; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
|
|
|
|
; Fast-math multiplicative reduction of a <2 x float> vector, start value 1.0.
; Expected code is pinned separately for the default selector and -global-isel.
define float @mul_HalfS(<2 x float> %bin.rdx) {
; CHECK-SD-LABEL: mul_HalfS:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: mul_HalfS:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
  ret float %r
}
|
|
|
|
; Fast-math multiplicative reduction of a <4 x half> vector, start value 1.0.
; Expectations are split four ways: default selector vs. -global-isel, each
; with and without +fullfp16.
define half @mul_HalfH(<4 x half> %bin.rdx) {
; CHECK-SD-NOFP16-LABEL: mul_HalfH:
; CHECK-SD-NOFP16:       // %bb.0:
; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s2, s1
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s0, s1, s0
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: mul_HalfH:
; CHECK-SD-FP16:       // %bb.0:
; CHECK-SD-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-FP16-NEXT:    fmul h1, h0, v0.h[1]
; CHECK-SD-FP16-NEXT:    fmul h1, h1, v0.h[2]
; CHECK-SD-FP16-NEXT:    fmul h0, h1, v0.h[3]
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: mul_HalfH:
; CHECK-GI-NOFP16:       // %bb.0:
; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: mul_HalfH:
; CHECK-GI-FP16:       // %bb.0:
; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
  ret half %r
}
|
|
|
|
|
|
; Fast-math multiplicative reduction of a <8 x half> vector, start value 1.0.
; Expectations are split four ways: default selector vs. -global-isel, each
; with and without +fullfp16.
define half @mul_H(<8 x half> %bin.rdx) {
; CHECK-SD-NOFP16-LABEL: mul_H:
; CHECK-SD-NOFP16:       // %bb.0:
; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s2, s1
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[4]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[5]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s0, s1, s0
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: mul_H:
; CHECK-SD-FP16:       // %bb.0:
; CHECK-SD-FP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-SD-FP16-NEXT:    fmul h1, h0, v0.h[1]
; CHECK-SD-FP16-NEXT:    fmul h1, h1, v0.h[2]
; CHECK-SD-FP16-NEXT:    fmul h0, h1, v0.h[3]
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: mul_H:
; CHECK-GI-NOFP16:       // %bb.0:
; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v1.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: mul_H:
; CHECK-GI-FP16:       // %bb.0:
; CHECK-GI-FP16-NEXT:    mov d1, v0.d[1]
; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %bin.rdx)
  ret half %r
}
|
|
|
|
; Fast-math multiplicative reduction of a <4 x float> vector, start value 1.0.
; Expected code is pinned separately for the default selector and -global-isel.
define float @mul_S(<4 x float> %bin.rdx) {
; CHECK-SD-LABEL: mul_S:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: mul_S:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx)
  ret float %r
}
|
|
|
|
; Fast-math multiplicative reduction of a <2 x double> vector, start value 1.0.
; A single set of expectations is shared by all four llc configurations.
define double @mul_D(<2 x double> %bin.rdx) {
; CHECK-LABEL: mul_D:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul d0, d0, v0.d[1]
; CHECK-NEXT:    ret
  %r = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %bin.rdx)
  ret double %r
}
|
|
|
|
; Fast-math multiplicative reduction of a <16 x half> vector (passed in two
; vector registers per the expected code), start value 1.0.
; Expectations are split four ways: default selector vs. -global-isel, each
; with and without +fullfp16.
define half @mul_2H(<16 x half> %bin.rdx) {
; CHECK-SD-NOFP16-LABEL: mul_2H:
; CHECK-SD-NOFP16:       // %bb.0:
; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT:    fmul v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s2, s0
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: mul_2H:
; CHECK-SD-FP16:       // %bb.0:
; CHECK-SD-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
; CHECK-SD-FP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-SD-FP16-NEXT:    fmul h1, h0, v0.h[1]
; CHECK-SD-FP16-NEXT:    fmul h1, h1, v0.h[2]
; CHECK-SD-FP16-NEXT:    fmul h0, h1, v0.h[3]
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: mul_2H:
; CHECK-GI-NOFP16:       // %bb.0:
; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v2.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v3.4s, v1.4s
; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: mul_2H:
; CHECK-GI-FP16:       // %bb.0:
; CHECK-GI-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
; CHECK-GI-FP16-NEXT:    mov d1, v0.d[1]
; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half 1.0, <16 x half> %bin.rdx)
  ret half %r
}
|
|
|
|
; Fast-math multiplicative reduction of a <8 x float> vector, start value 1.0.
; Expected code is pinned separately for the default selector and -global-isel.
define float @mul_2S(<8 x float> %bin.rdx) {
; CHECK-SD-LABEL: mul_2S:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: mul_2S:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx)
  ret float %r
}
|
|
|
|
; Fast-math multiplicative reduction of a <4 x double> vector, start value 1.0.
; A single set of expectations is shared by all four llc configurations.
define double @mul_2D(<4 x double> %bin.rdx) {
; CHECK-LABEL: mul_2D:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    fmul d0, d0, v0.d[1]
; CHECK-NEXT:    ret
  %r = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %bin.rdx)
  ret double %r
}
|
|
|
|
; At least one test where the start value is not 1.0 (here it is 42.0).
|
|
; Fast-math multiplicative reduction of a <4 x float> vector with a start
; value of 42.0 instead of 1.0; the expected code materialises the constant
; and applies it with one extra scalar fmul.
define float @mul_S_init_42(<4 x float> %bin.rdx) {
; CHECK-SD-LABEL: mul_S_init_42:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    mov w8, #1109917696 // =0x42280000
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmov s1, w8
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    fmul s0, s0, s1
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: mul_S_init_42:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    mov w8, #1109917696 // =0x42280000
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    fmov s1, w8
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
  ret float %r
}
|
|
|
|
|
|
; Two fast <8 x half> multiplicative reductions (start value 1.0) whose scalar
; results are combined with a fast fmul.
; Expectations are split four ways: default selector vs. -global-isel, each
; with and without +fullfp16.
define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-SD-NOFP16-LABEL: fmul_reduct_reassoc_v8f16:
; CHECK-SD-NOFP16:       // %bb.0:
; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT:    fmul v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s2, s0
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: fmul_reduct_reassoc_v8f16:
; CHECK-SD-FP16:       // %bb.0:
; CHECK-SD-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
; CHECK-SD-FP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-SD-FP16-NEXT:    fmul h1, h0, v0.h[1]
; CHECK-SD-FP16-NEXT:    fmul h1, h1, v0.h[2]
; CHECK-SD-FP16-NEXT:    fmul h0, h1, v0.h[3]
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: fmul_reduct_reassoc_v8f16:
; CHECK-GI-NOFP16:       // %bb.0:
; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v2.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v3.4s, v1.4s
; CHECK-GI-NOFP16-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NOFP16-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NOFP16-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-GI-NOFP16-NEXT:    fmul s1, s1, s3
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    fcvt h1, s1
; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: fmul_reduct_reassoc_v8f16:
; CHECK-GI-FP16:       // %bb.0:
; CHECK-GI-FP16-NEXT:    mov d2, v0.d[1]
; CHECK-GI-FP16-NEXT:    mov d3, v1.d[1]
; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v2.4h
; CHECK-GI-FP16-NEXT:    fmul v1.4h, v1.4h, v3.4h
; CHECK-GI-FP16-NEXT:    mov h2, v0.h[1]
; CHECK-GI-FP16-NEXT:    mov h3, v0.h[2]
; CHECK-GI-FP16-NEXT:    mov h4, v0.h[3]
; CHECK-GI-FP16-NEXT:    mov h5, v1.h[1]
; CHECK-GI-FP16-NEXT:    mov h6, v1.h[2]
; CHECK-GI-FP16-NEXT:    mov h7, v1.h[3]
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h2
; CHECK-GI-FP16-NEXT:    fmul h2, h3, h4
; CHECK-GI-FP16-NEXT:    fmul h1, h1, h5
; CHECK-GI-FP16-NEXT:    fmul h3, h6, h7
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h2
; CHECK-GI-FP16-NEXT:    fmul h1, h1, h3
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    ret
  %r1 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %a)
  %r2 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %b)
  %r = fmul fast half %r1, %r2
  ret half %r
}
|
|
|
|
; Two fast <8 x float> multiplicative reductions (start value 1.0) whose
; scalar results are combined with a fast fmul.
; Expected code is pinned separately for the default selector and -global-isel.
define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
; CHECK-SD-LABEL: fmul_reduct_reassoc_v8f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v2.4s, v2.4s, v3.4s
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v8f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    fmul v1.4s, v2.4s, v3.4s
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}
|
|
|
|
; Two fast <4 x float> multiplicative reductions (start value 1.0) whose
; scalar results are combined with a fast fmul.
; Expected code is pinned separately for the default selector and -global-isel.
define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}
|
|
|
|
; Two fast <4 x float> multiplicative reductions combined with a fast fmul;
; the first reduction takes its start value from the scalar argument %i, the
; second uses 1.0.
; Expected code is pinned separately for the default selector and -global-isel.
define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32_init:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-SD-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
; CHECK-SD-NEXT:    fmul s1, s1, v1.s[1]
; CHECK-SD-NEXT:    fmul v2.2s, v2.2s, v3.2s
; CHECK-SD-NEXT:    fmul s0, s0, s1
; CHECK-SD-NEXT:    fmul s1, s2, v2.s[1]
; CHECK-SD-NEXT:    fmul s0, s0, s1
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32_init:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov d3, v2.d[1]
; CHECK-GI-NEXT:    mov s4, v1.s[1]
; CHECK-GI-NEXT:    fmul v2.2s, v2.2s, v3.2s
; CHECK-GI-NEXT:    fmul s1, s1, s4
; CHECK-GI-NEXT:    mov s3, v2.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    fmul s1, s2, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %i, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}
|
|
|
|
; A fast <4 x float> reduction and a fast <8 x float> reduction (both with
; start value 1.0) whose scalar results are combined with a fast fmul.
; Expected code is pinned separately for the default selector and -global-isel.
define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
; CHECK-SD-LABEL: fmul_reduct_reassoc_v4v8f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v1.4s, v1.4s, v2.4s
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v4v8f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmul v1.4s, v1.4s, v2.4s
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}
|
|
|
|
; Two fast <4 x double> multiplicative reductions (start value 1.0) whose
; scalar results are combined with a fast fmul.
; Expected code is pinned separately for the default selector and -global-isel.
define double @fmul_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f64:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v2.2d, v2.2d, v3.2d
; CHECK-SD-NEXT:    fmul v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    fmul v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    fmul d0, d0, v0.d[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f64:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmul v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT:    fmul v1.2d, v2.2d, v3.2d
; CHECK-GI-NEXT:    fmul d0, d0, v0.d[1]
; CHECK-GI-NEXT:    fmul d1, d1, v1.d[1]
; CHECK-GI-NEXT:    fmul d0, d0, d1
; CHECK-GI-NEXT:    ret
  %r1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a)
  %r2 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %b)
  %r = fmul fast double %r1, %r2
  ret double %r
}
|
|
|
|
; Two fast <4 x float> multiplicative reductions combined with a fast fmul,
; where the first reduction result %r1 has an additional use: it is multiplied
; into the combined value again by a second fmul that carries no fast-math
; flags.
; Expected code is pinned separately for the default selector and -global-isel.
define float @fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32_extrause:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-SD-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    fmul s1, s1, v1.s[1]
; CHECK-SD-NEXT:    fmul s1, s0, s1
; CHECK-SD-NEXT:    fmul s0, s1, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32_extrause:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s1, s0, s1
; CHECK-GI-NEXT:    fmul s0, s1, s0
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  %p = fmul float %r, %r1
  ret float %p
}
|
|
|
|
; Declarations of the fmul reduction intrinsics called by the tests in this
; file: one overload per (element type, vector width) pair. The first operand
; is the scalar start value and the second is the vector to reduce.
; Function Attrs: nounwind readnone
declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fmul.f16.v8f16(half, <8 x half>)
declare half @llvm.vector.reduce.fmul.f16.v16f16(half, <16 x half>)
declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
|