Refresh the generic scheduling model to use the A510 instead of the A55. The main benefits are to the little core, along with the introduction of SVE scheduling information. The changes were tested on various OoO cores; no performance degradation was seen.

Differential Revision: https://reviews.llvm.org/D156799
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -verify-machineinstrs < %s | FileCheck %s

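; These tests exercise both results of the llvm.smul.with.overflow.*
; intrinsics for scalable vectors: the product is returned directly, and the
; overflow flag drives a select between zero and the product.
;
; For element types narrower than their SVE container (i8 held in 64-bit
; lanes here, and similar cases below), the expected lowering promotes the
; operation: both operands are sign-extended in-lane (sxtb/sxth/sxtw), the
; multiply is performed at the container width, and overflow is detected by
; re-sign-extending the product and comparing it with the unextended product
; (cmpne); the two differ exactly when the result does not fit.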
declare { <vscale x 2 x i8>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>)

define <vscale x 2 x i8> @smulo_nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y) {
; CHECK-LABEL: smulo_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtb z1.d, p0/m, z1.d
; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: sxtb z1.d, p0/m, z0.d
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, z0.d
; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT: ret
  %a = call { <vscale x 2 x i8>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y)
  %b = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i1> } %a, 0
  %c = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i1> } %a, 1
  %d = select <vscale x 2 x i1> %c, <vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> %b
  ret <vscale x 2 x i8> %d
}

declare { <vscale x 4 x i8>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>)

define <vscale x 4 x i8> @smulo_nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %y) {
; CHECK-LABEL: smulo_nxv4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sxtb z1.s, p0/m, z1.s
; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: sxtb z1.s, p0/m, z0.s
; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, z0.s
; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0
; CHECK-NEXT: ret
  %a = call { <vscale x 4 x i8>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %y)
  %b = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i1> } %a, 0
  %c = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i1> } %a, 1
  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> %b
  ret <vscale x 4 x i8> %d
}

declare { <vscale x 8 x i8>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>)

define <vscale x 8 x i8> @smulo_nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y) {
; CHECK-LABEL: smulo_nxv8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: sxtb z1.h, p0/m, z1.h
; CHECK-NEXT: sxtb z0.h, p0/m, z0.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: sxtb z1.h, p0/m, z0.h
; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, z0.h
; CHECK-NEXT: mov z0.h, p0/m, #0 // =0x0
; CHECK-NEXT: ret
  %a = call { <vscale x 8 x i8>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y)
  %b = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i1> } %a, 0
  %c = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i1> } %a, 1
  %d = select <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> %b
  ret <vscale x 8 x i8> %d
}

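; At native element width the full product no longer fits in the lane, so the
; expected code computes the low half with mul and the high half with smulh.
; The multiply overflows exactly when the high half differs from the
; sign-replication of the low half (asr by element bits minus one, i.e. #7
; for bytes), which the cmpne below checks.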
declare { <vscale x 16 x i8>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)

define <vscale x 16 x i8> @smulo_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
; CHECK-LABEL: smulo_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.b, p0/m, z2.b, z1.b
; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: asr z1.b, z2.b, #7
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT: mov z2.b, p0/m, #0 // =0x0
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
  %a = call { <vscale x 16 x i8>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y)
  %b = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i1> } %a, 0
  %c = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i1> } %a, 1
  %d = select <vscale x 16 x i1> %c, <vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> %b
  ret <vscale x 16 x i8> %d
}

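; nxv32i8 is twice the maximum legal vector width, so the operation is split
; across two Z registers, applying the same mul/smulh/asr/cmpne recipe per
; half with a second overflow predicate in p1.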
declare { <vscale x 32 x i8>, <vscale x 32 x i1> } @llvm.smul.with.overflow.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>)

define <vscale x 32 x i8> @smulo_nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i8> %y) {
; CHECK-LABEL: smulo_nxv32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: movprfx z4, z1
; CHECK-NEXT: mul z4.b, p0/m, z4.b, z3.b
; CHECK-NEXT: movprfx z5, z0
; CHECK-NEXT: mul z5.b, p0/m, z5.b, z2.b
; CHECK-NEXT: smulh z1.b, p0/m, z1.b, z3.b
; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT: asr z2.b, z4.b, #7
; CHECK-NEXT: asr z3.b, z5.b, #7
; CHECK-NEXT: cmpne p1.b, p0/z, z1.b, z2.b
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z3.b
; CHECK-NEXT: mov z5.b, p0/m, #0 // =0x0
; CHECK-NEXT: mov z4.b, p1/m, #0 // =0x0
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: mov z1.d, z4.d
; CHECK-NEXT: ret
  %a = call { <vscale x 32 x i8>, <vscale x 32 x i1> } @llvm.smul.with.overflow.nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i8> %y)
  %b = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i1> } %a, 0
  %c = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i1> } %a, 1
  %d = select <vscale x 32 x i1> %c, <vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> %b
  ret <vscale x 32 x i8> %d
}

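; nxv64i8 splits four ways; the per-quarter pattern is unchanged, with
; temporaries allocated in z24-z27 and the overflow predicates in p1-p3.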
declare { <vscale x 64 x i8>, <vscale x 64 x i1> } @llvm.smul.with.overflow.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>)

define <vscale x 64 x i8> @smulo_nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %y) {
; CHECK-LABEL: smulo_nxv64i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: movprfx z24, z3
; CHECK-NEXT: mul z24.b, p0/m, z24.b, z7.b
; CHECK-NEXT: movprfx z25, z0
; CHECK-NEXT: mul z25.b, p0/m, z25.b, z4.b
; CHECK-NEXT: movprfx z26, z2
; CHECK-NEXT: mul z26.b, p0/m, z26.b, z6.b
; CHECK-NEXT: movprfx z27, z1
; CHECK-NEXT: mul z27.b, p0/m, z27.b, z5.b
; CHECK-NEXT: smulh z3.b, p0/m, z3.b, z7.b
; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z4.b
; CHECK-NEXT: asr z4.b, z25.b, #7
; CHECK-NEXT: smulh z2.b, p0/m, z2.b, z6.b
; CHECK-NEXT: smulh z1.b, p0/m, z1.b, z5.b
; CHECK-NEXT: asr z5.b, z24.b, #7
; CHECK-NEXT: asr z6.b, z26.b, #7
; CHECK-NEXT: asr z7.b, z27.b, #7
; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, z4.b
; CHECK-NEXT: cmpne p2.b, p0/z, z3.b, z5.b
; CHECK-NEXT: cmpne p3.b, p0/z, z2.b, z6.b
; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, z7.b
; CHECK-NEXT: mov z25.b, p1/m, #0 // =0x0
; CHECK-NEXT: mov z24.b, p2/m, #0 // =0x0
; CHECK-NEXT: mov z27.b, p0/m, #0 // =0x0
; CHECK-NEXT: mov z26.b, p3/m, #0 // =0x0
; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: mov z3.d, z24.d
; CHECK-NEXT: mov z1.d, z27.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: ret
  %a = call { <vscale x 64 x i8>, <vscale x 64 x i1> } @llvm.smul.with.overflow.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %y)
  %b = extractvalue { <vscale x 64 x i8>, <vscale x 64 x i1> } %a, 0
  %c = extractvalue { <vscale x 64 x i8>, <vscale x 64 x i1> } %a, 1
  %d = select <vscale x 64 x i1> %c, <vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> %b
  ret <vscale x 64 x i8> %d
}

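; The i16 variants repeat both shapes: nxv2i16 and nxv4i16 use the promoted
; pattern with sxth, while nxv8i16 and wider use the native mul/smulh check
; with an arithmetic shift of #15.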
declare { <vscale x 2 x i16>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>)

define <vscale x 2 x i16> @smulo_nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y) {
; CHECK-LABEL: smulo_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z1.d, p0/m, z1.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: sxth z1.d, p0/m, z0.d
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, z0.d
; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT: ret
  %a = call { <vscale x 2 x i16>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y)
  %b = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i1> } %a, 0
  %c = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i1> } %a, 1
  %d = select <vscale x 2 x i1> %c, <vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> %b
  ret <vscale x 2 x i16> %d
}

declare { <vscale x 4 x i16>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>)

define <vscale x 4 x i16> @smulo_nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
; CHECK-LABEL: smulo_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sxth z1.s, p0/m, z1.s
; CHECK-NEXT: sxth z0.s, p0/m, z0.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: sxth z1.s, p0/m, z0.s
; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, z0.s
; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0
; CHECK-NEXT: ret
  %a = call { <vscale x 4 x i16>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y)
  %b = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i1> } %a, 0
  %c = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i1> } %a, 1
  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> %b
  ret <vscale x 4 x i16> %d
}

declare { <vscale x 8 x i16>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)

define <vscale x 8 x i16> @smulo_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
; CHECK-LABEL: smulo_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.h, p0/m, z2.h, z1.h
; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: asr z1.h, z2.h, #15
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z2.h, p0/m, #0 // =0x0
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
  %a = call { <vscale x 8 x i16>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y)
  %b = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i1> } %a, 0
  %c = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i1> } %a, 1
  %d = select <vscale x 8 x i1> %c, <vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> %b
  ret <vscale x 8 x i16> %d
}

declare { <vscale x 16 x i16>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>)

define <vscale x 16 x i16> @smulo_nxv16i16(<vscale x 16 x i16> %x, <vscale x 16 x i16> %y) {
; CHECK-LABEL: smulo_nxv16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: movprfx z4, z1
; CHECK-NEXT: mul z4.h, p0/m, z4.h, z3.h
; CHECK-NEXT: movprfx z5, z0
; CHECK-NEXT: mul z5.h, p0/m, z5.h, z2.h
; CHECK-NEXT: smulh z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: asr z2.h, z4.h, #15
; CHECK-NEXT: asr z3.h, z5.h, #15
; CHECK-NEXT: cmpne p1.h, p0/z, z1.h, z2.h
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, z3.h
; CHECK-NEXT: mov z5.h, p0/m, #0 // =0x0
; CHECK-NEXT: mov z4.h, p1/m, #0 // =0x0
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: mov z1.d, z4.d
; CHECK-NEXT: ret
  %a = call { <vscale x 16 x i16>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i16(<vscale x 16 x i16> %x, <vscale x 16 x i16> %y)
  %b = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i1> } %a, 0
  %c = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i1> } %a, 1
  %d = select <vscale x 16 x i1> %c, <vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> %b
  ret <vscale x 16 x i16> %d
}

declare { <vscale x 32 x i16>, <vscale x 32 x i1> } @llvm.smul.with.overflow.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>)

define <vscale x 32 x i16> @smulo_nxv32i16(<vscale x 32 x i16> %x, <vscale x 32 x i16> %y) {
; CHECK-LABEL: smulo_nxv32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: movprfx z24, z3
; CHECK-NEXT: mul z24.h, p0/m, z24.h, z7.h
; CHECK-NEXT: movprfx z25, z0
; CHECK-NEXT: mul z25.h, p0/m, z25.h, z4.h
; CHECK-NEXT: movprfx z26, z2
; CHECK-NEXT: mul z26.h, p0/m, z26.h, z6.h
; CHECK-NEXT: movprfx z27, z1
; CHECK-NEXT: mul z27.h, p0/m, z27.h, z5.h
; CHECK-NEXT: smulh z3.h, p0/m, z3.h, z7.h
; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z4.h
; CHECK-NEXT: asr z4.h, z25.h, #15
; CHECK-NEXT: smulh z2.h, p0/m, z2.h, z6.h
; CHECK-NEXT: smulh z1.h, p0/m, z1.h, z5.h
; CHECK-NEXT: asr z5.h, z24.h, #15
; CHECK-NEXT: asr z6.h, z26.h, #15
; CHECK-NEXT: asr z7.h, z27.h, #15
; CHECK-NEXT: cmpne p1.h, p0/z, z0.h, z4.h
; CHECK-NEXT: cmpne p2.h, p0/z, z3.h, z5.h
; CHECK-NEXT: cmpne p3.h, p0/z, z2.h, z6.h
; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, z7.h
; CHECK-NEXT: mov z25.h, p1/m, #0 // =0x0
; CHECK-NEXT: mov z24.h, p2/m, #0 // =0x0
; CHECK-NEXT: mov z27.h, p0/m, #0 // =0x0
; CHECK-NEXT: mov z26.h, p3/m, #0 // =0x0
; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: mov z3.d, z24.d
; CHECK-NEXT: mov z1.d, z27.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: ret
  %a = call { <vscale x 32 x i16>, <vscale x 32 x i1> } @llvm.smul.with.overflow.nxv32i16(<vscale x 32 x i16> %x, <vscale x 32 x i16> %y)
  %b = extractvalue { <vscale x 32 x i16>, <vscale x 32 x i1> } %a, 0
  %c = extractvalue { <vscale x 32 x i16>, <vscale x 32 x i1> } %a, 1
  %d = select <vscale x 32 x i1> %c, <vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> %b
  ret <vscale x 32 x i16> %d
}

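; Likewise for i32: nxv2i32 promotes with sxtw, and nxv4i32 and wider use
; the native check with an arithmetic shift of #31.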
declare { <vscale x 2 x i32>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>)

define <vscale x 2 x i32> @smulo_nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y) {
; CHECK-LABEL: smulo_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z1.d, p0/m, z1.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: sxtw z1.d, p0/m, z0.d
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, z0.d
; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT: ret
  %a = call { <vscale x 2 x i32>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y)
  %b = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i1> } %a, 0
  %c = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i1> } %a, 1
  %d = select <vscale x 2 x i1> %c, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %b
  ret <vscale x 2 x i32> %d
}

declare { <vscale x 4 x i32>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)

define <vscale x 4 x i32> @smulo_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
; CHECK-LABEL: smulo_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: asr z1.s, z2.s, #31
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT: mov z2.s, p0/m, #0 // =0x0
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
  %a = call { <vscale x 4 x i32>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
  %b = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } %a, 0
  %c = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } %a, 1
  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> %b
  ret <vscale x 4 x i32> %d
}

declare { <vscale x 8 x i32>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>)

define <vscale x 8 x i32> @smulo_nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y) {
; CHECK-LABEL: smulo_nxv8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z4, z1
; CHECK-NEXT: mul z4.s, p0/m, z4.s, z3.s
; CHECK-NEXT: movprfx z5, z0
; CHECK-NEXT: mul z5.s, p0/m, z5.s, z2.s
; CHECK-NEXT: smulh z1.s, p0/m, z1.s, z3.s
; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: asr z2.s, z4.s, #31
; CHECK-NEXT: asr z3.s, z5.s, #31
; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, z2.s
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, z3.s
; CHECK-NEXT: mov z5.s, p0/m, #0 // =0x0
; CHECK-NEXT: mov z4.s, p1/m, #0 // =0x0
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: mov z1.d, z4.d
; CHECK-NEXT: ret
  %a = call { <vscale x 8 x i32>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y)
  %b = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i1> } %a, 0
  %c = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i1> } %a, 1
  %d = select <vscale x 8 x i1> %c, <vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> %b
  ret <vscale x 8 x i32> %d
}

declare { <vscale x 16 x i32>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>)

define <vscale x 16 x i32> @smulo_nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y) {
; CHECK-LABEL: smulo_nxv16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z24, z3
; CHECK-NEXT: mul z24.s, p0/m, z24.s, z7.s
; CHECK-NEXT: movprfx z25, z0
; CHECK-NEXT: mul z25.s, p0/m, z25.s, z4.s
; CHECK-NEXT: movprfx z26, z2
; CHECK-NEXT: mul z26.s, p0/m, z26.s, z6.s
; CHECK-NEXT: movprfx z27, z1
; CHECK-NEXT: mul z27.s, p0/m, z27.s, z5.s
; CHECK-NEXT: smulh z3.s, p0/m, z3.s, z7.s
; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z4.s
; CHECK-NEXT: asr z4.s, z25.s, #31
; CHECK-NEXT: smulh z2.s, p0/m, z2.s, z6.s
; CHECK-NEXT: smulh z1.s, p0/m, z1.s, z5.s
; CHECK-NEXT: asr z5.s, z24.s, #31
; CHECK-NEXT: asr z6.s, z26.s, #31
; CHECK-NEXT: asr z7.s, z27.s, #31
; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, z4.s
; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, z5.s
; CHECK-NEXT: cmpne p3.s, p0/z, z2.s, z6.s
; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, z7.s
; CHECK-NEXT: mov z25.s, p1/m, #0 // =0x0
; CHECK-NEXT: mov z24.s, p2/m, #0 // =0x0
; CHECK-NEXT: mov z27.s, p0/m, #0 // =0x0
; CHECK-NEXT: mov z26.s, p3/m, #0 // =0x0
; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: mov z3.d, z24.d
; CHECK-NEXT: mov z1.d, z27.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: ret
  %a = call { <vscale x 16 x i32>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y)
  %b = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i1> } %a, 0
  %c = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i1> } %a, 1
  %d = select <vscale x 16 x i1> %c, <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> %b
  ret <vscale x 16 x i32> %d
}

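; i64 is the widest SVE element type, so there is no promoted form; all i64
; variants use the native smulh check with an arithmetic shift of #63.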
declare { <vscale x 2 x i64>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)

define <vscale x 2 x i64> @smulo_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
; CHECK-LABEL: smulo_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: asr z1.d, z2.d, #63
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, z1.d
; CHECK-NEXT: mov z2.d, p0/m, #0 // =0x0
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
  %a = call { <vscale x 2 x i64>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
  %b = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i1> } %a, 0
  %c = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i1> } %a, 1
  %d = select <vscale x 2 x i1> %c, <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> %b
  ret <vscale x 2 x i64> %d
}

declare { <vscale x 4 x i64>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>)

define <vscale x 4 x i64> @smulo_nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i64> %y) {
; CHECK-LABEL: smulo_nxv4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z4, z1
; CHECK-NEXT: mul z4.d, p0/m, z4.d, z3.d
; CHECK-NEXT: movprfx z5, z0
; CHECK-NEXT: mul z5.d, p0/m, z5.d, z2.d
; CHECK-NEXT: smulh z1.d, p0/m, z1.d, z3.d
; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT: asr z2.d, z4.d, #63
; CHECK-NEXT: asr z3.d, z5.d, #63
; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, z2.d
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, z3.d
; CHECK-NEXT: mov z5.d, p0/m, #0 // =0x0
; CHECK-NEXT: mov z4.d, p1/m, #0 // =0x0
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: mov z1.d, z4.d
; CHECK-NEXT: ret
  %a = call { <vscale x 4 x i64>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i64> %y)
  %b = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i1> } %a, 0
  %c = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i1> } %a, 1
  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> %b
  ret <vscale x 4 x i64> %d
}

declare { <vscale x 8 x i64>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>)

define <vscale x 8 x i64> @smulo_nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y) {
; CHECK-LABEL: smulo_nxv8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z24, z3
; CHECK-NEXT: mul z24.d, p0/m, z24.d, z7.d
; CHECK-NEXT: movprfx z25, z0
; CHECK-NEXT: mul z25.d, p0/m, z25.d, z4.d
; CHECK-NEXT: movprfx z26, z2
; CHECK-NEXT: mul z26.d, p0/m, z26.d, z6.d
; CHECK-NEXT: movprfx z27, z1
; CHECK-NEXT: mul z27.d, p0/m, z27.d, z5.d
; CHECK-NEXT: smulh z3.d, p0/m, z3.d, z7.d
; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z4.d
; CHECK-NEXT: asr z4.d, z25.d, #63
; CHECK-NEXT: smulh z2.d, p0/m, z2.d, z6.d
; CHECK-NEXT: smulh z1.d, p0/m, z1.d, z5.d
; CHECK-NEXT: asr z5.d, z24.d, #63
; CHECK-NEXT: asr z6.d, z26.d, #63
; CHECK-NEXT: asr z7.d, z27.d, #63
; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, z4.d
; CHECK-NEXT: cmpne p2.d, p0/z, z3.d, z5.d
; CHECK-NEXT: cmpne p3.d, p0/z, z2.d, z6.d
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, z7.d
; CHECK-NEXT: mov z25.d, p1/m, #0 // =0x0
; CHECK-NEXT: mov z24.d, p2/m, #0 // =0x0
; CHECK-NEXT: mov z27.d, p0/m, #0 // =0x0
; CHECK-NEXT: mov z26.d, p3/m, #0 // =0x0
; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: mov z3.d, z24.d
; CHECK-NEXT: mov z1.d, z27.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: ret
  %a = call { <vscale x 8 x i64>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y)
  %b = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i1> } %a, 0
  %c = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i1> } %a, 1
  %d = select <vscale x 8 x i1> %c, <vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> %b
  ret <vscale x 8 x i64> %d
}