[PowerPC] Add BCD add/sub/cmp builtins

Add support for builtins that use the bcdadd./bcdsub. instructions to
add and subtract Binary Coded Decimal (BCD) values, to determine
whether a BCD value is valid, and to compare BCD values.

Differential revision: https://reviews.llvm.org/D114088
This commit is contained in:
Nemanja Ivanovic
2021-11-23 07:32:45 -06:00
parent 0a00d64e32
commit c933c2eb33
6 changed files with 334 additions and 19 deletions

View File

@@ -524,6 +524,20 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
def int_ppc_altivec_vprtybq : GCCBuiltin<"__builtin_altivec_vprtybq">,
Intrinsic<[llvm_v1i128_ty],[llvm_v1i128_ty],[IntrNoMem]>;
// BCD intrinsics.
// int_ppc_bcdadd / int_ppc_bcdsub: take two v16i8 operands followed by an i32
// that must be an immediate (argument index 2) and return a v16i8 result.
// int_ppc_bcdadd_p / int_ppc_bcdsub_p: predicate forms. Argument index 0 is an
// i32 immediate, followed by the two v16i8 operands; the result is an i32.
// All four are marked IntrNoMem.
def int_ppc_bcdadd : GCCBuiltin<"__builtin_ppc_bcdadd">, Intrinsic<
[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<2>>]>;
def int_ppc_bcdadd_p : GCCBuiltin<"__builtin_ppc_bcdadd_p">, Intrinsic<
[llvm_i32_ty], [llvm_i32_ty, llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem, ImmArg<ArgIndex<0>>]>;
def int_ppc_bcdsub : GCCBuiltin<"__builtin_ppc_bcdsub">, Intrinsic<
[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<2>>]>;
def int_ppc_bcdsub_p : GCCBuiltin<"__builtin_ppc_bcdsub_p">, Intrinsic<
[llvm_i32_ty], [llvm_i32_ty, llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem, ImmArg<ArgIndex<0>>]>;
// P10 Vector Extract with Mask
def int_ppc_altivec_vextractbm : GCCBuiltin<"__builtin_altivec_vextractbm">,
Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;

View File

@@ -626,7 +626,9 @@ def : InstRW<[P10W_DX_5C, P10W_DISP_ANY, P10DX_Read, P10DX_Read],
// 5 Cycles Fixed-Point and BCD operations, 3 input operands
def : InstRW<[P10W_DX_5C, P10W_DISP_ANY, P10DX_Read, P10DX_Read, P10DX_Read],
(instrs
BCDADD_rec,
BCDS_rec,
BCDSUB_rec,
BCDTRUNC_rec,
VADDECUQ,
VADDEUQM,

View File

@@ -624,7 +624,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C],
BCDS_rec,
BCDTRUNC_rec,
BCDUS_rec,
BCDUTRUNC_rec
BCDUTRUNC_rec,
BCDADD_rec,
BCDSUB_rec
)>;
// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole

View File

@@ -5049,16 +5049,94 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// value for the comparison. When selecting through a .td file, a type
// error is raised. Must check this first so we never break on the
// !Subtarget->isISA3_1() check.
if (N->getConstantOperandVal(0) == Intrinsic::ppc_fsels) {
auto IntID = N->getConstantOperandVal(0);
if (IntID == Intrinsic::ppc_fsels) {
SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3)};
CurDAG->SelectNodeTo(N, PPC::FSELS, MVT::f32, Ops);
return;
}
// Predicate forms of the BCD add/subtract intrinsics: emit the record-form
// instruction, then materialize a single CR6 bit (optionally inverted) as
// the i32 result of the intrinsic.
if (IntID == Intrinsic::ppc_bcdadd_p || IntID == Intrinsic::ppc_bcdsub_p) {
// Operand 0 is the intrinsic ID, so operand 1 is the immediate predicate.
auto Pred = N->getConstantOperandVal(1);
unsigned Opcode =
IntID == Intrinsic::ppc_bcdadd_p ? PPC::BCDADD_rec : PPC::BCDSUB_rec;
// SubReg: the CR6 bit to read (used on the SETBC[R] path).
// ShiftVal: extra rotate amount used on the MFOCRF path (see below).
// Reverse: return the negation of the selected bit.
unsigned SubReg = 0;
unsigned ShiftVal = 0;
bool Reverse = false;
// Each even predicate reads a bit directly; the next odd value reads the
// same bit inverted. NOTE(review): there is no default case, so a predicate
// outside 0-7 leaves SubReg/ShiftVal/Reverse at their initial values.
switch (Pred) {
case 0:
SubReg = PPC::sub_eq;
ShiftVal = 1;
break;
case 1:
SubReg = PPC::sub_eq;
ShiftVal = 1;
Reverse = true;
break;
case 2:
SubReg = PPC::sub_lt;
ShiftVal = 3;
break;
case 3:
SubReg = PPC::sub_lt;
ShiftVal = 3;
Reverse = true;
break;
case 4:
SubReg = PPC::sub_gt;
ShiftVal = 2;
break;
case 5:
SubReg = PPC::sub_gt;
ShiftVal = 2;
Reverse = true;
break;
case 6:
// ShiftVal intentionally stays 0 for the 'un' bit.
SubReg = PPC::sub_un;
break;
case 7:
SubReg = PPC::sub_un;
Reverse = true;
break;
}
// The machine node yields the vector result (value 0) and glue (value 1);
// the glue is passed to the CR6 consumer created below.
EVT VTs[] = {MVT::v16i8, MVT::Glue};
// Inputs are the two vector operands of the intrinsic; the third
// instruction operand is always the constant 0 for the predicate forms.
SDValue Ops[] = {N->getOperand(2), N->getOperand(3),
CurDAG->getTargetConstant(0, dl, MVT::i32)};
SDValue BCDOp = SDValue(CurDAG->getMachineNode(Opcode, dl, VTs, Ops), 0);
SDValue CR6Reg = CurDAG->getRegister(PPC::CR6, MVT::i32);
// On Power10, we can use SETBC[R]. On prior architectures, we have to use
// MFOCRF and shift/negate the value.
if (Subtarget->isISA3_1()) {
// Extract the chosen CR6 bit as an i1 and turn it into 0/1 with SETBC, or
// into its negation with SETBCR when Reverse is set.
SDValue SubRegIdx = CurDAG->getTargetConstant(SubReg, dl, MVT::i32);
SDValue CRBit = SDValue(
CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i1,
CR6Reg, SubRegIdx, BCDOp.getValue(1)),
0);
CurDAG->SelectNodeTo(N, Reverse ? PPC::SETBCR : PPC::SETBC, MVT::i32,
CRBit);
} else {
// Copy CR6 into a GPR with MFOCRF.
SDValue Move =
SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR6Reg,
BCDOp.getValue(1)),
0);
// RLWINM operands: rotate left by 32 - (4 + ShiftVal), i.e. rotate right by
// 4 + ShiftVal, then mask with begin = end = 31 so only one bit is kept.
SDValue Ops[] = {Move, getI32Imm((32 - (4 + ShiftVal)) & 31, dl),
getI32Imm(31, dl), getI32Imm(31, dl)};
if (!Reverse)
CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
else {
// Inverted predicate: XOR the isolated bit with 1.
SDValue Shift = SDValue(
CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Shift, getI32Imm(1, dl));
}
}
return;
}
if (!Subtarget->isISA3_1())
break;
unsigned Opcode = 0;
switch (N->getConstantOperandVal(0)) {
switch (IntID) {
default:
break;
case Intrinsic::ppc_altivec_vstribr_p:

View File

@@ -1161,6 +1161,22 @@ def : Pat<(v16i8 (srl (sub v16i8:$vA, (v16i8 (bitconvert(vnot v4i32:$vB)))),
} // end HasAltivec
// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set.
// Operands: vector register output $vD, vector register inputs $vA and $vB,
// and a 1-bit immediate $PS. CR6 is listed in Defs.
class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern>
: VX_RD5_RSp5_PS1_XO9<xo,
(outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u1imm:$PS),
!strconcat(opc, " $vD, $vA, $vB, $PS"), IIC_VecFP, pattern> {
let Defs = [CR6];
}
// [PO VRT VRA VRB 1 / XO]
// Same base class as above but without a $PS operand: the PS field is fixed
// to 0. CR6 is listed in Defs.
class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern>
: VX_RD5_RSp5_PS1_XO9<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
!strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern> {
let Defs = [CR6];
let PS = 0;
}
def HasP8Altivec : Predicate<"Subtarget->hasP8Altivec()">;
def HasP8Crypto : Predicate<"Subtarget->hasP8Crypto()">;
let Predicates = [HasP8Altivec] in {
@@ -1351,6 +1367,13 @@ def VUPKHSW : VX2_Int_Ty2<1614, "vupkhsw", int_ppc_altivec_vupkhsw,
v2i64, v4i32>;
def VUPKLSW : VX2_Int_Ty2<1742, "vupklsw", int_ppc_altivec_vupklsw,
v2i64, v4i32>;
// BCD add/subtract, record forms ("bcdadd." / "bcdsub."). The instruction
// definitions carry no selection pattern of their own; the vector-result
// intrinsics are mapped onto them by the standalone patterns below, passing
// the immediate $PS operand through unchanged.
def BCDADD_rec : VX_VT5_VA5_VB5_PS1_XO9_o<1, "bcdadd." , []>;
def BCDSUB_rec : VX_VT5_VA5_VB5_PS1_XO9_o<65, "bcdsub." , []>;
def : Pat<(v16i8 (int_ppc_bcdadd v16i8:$vA, v16i8:$vB, timm:$PS)),
(BCDADD_rec $vA, $vB, $PS)>;
def : Pat<(v16i8 (int_ppc_bcdsub v16i8:$vA, v16i8:$vB, timm:$PS)),
(BCDSUB_rec $vA, $vB, $PS)>;
// Shuffle patterns for unary and swapped (LE) vector pack modulo.
def:Pat<(vpkudum_unary_shuffle v16i8:$vA, undef),
@@ -1598,22 +1621,6 @@ def BCDCPSGN_rec : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", []>;
def BCDSETSGN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>;
// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set.
class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern>
: VX_RD5_RSp5_PS1_XO9<xo,
(outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u1imm:$PS),
!strconcat(opc, " $vD, $vA, $vB, $PS"), IIC_VecFP, pattern> {
let Defs = [CR6];
}
// [PO VRT VRA VRB 1 / XO]
class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern>
: VX_RD5_RSp5_PS1_XO9<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
!strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern> {
let Defs = [CR6];
let PS = 0;
}
// Decimal Shift/Unsigned-Shift/Shift-and-Round
def BCDS_rec : VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>;
def BCDUS_rec : VX_VT5_VA5_VB5_XO9_o <129, "bcdus.", []>;

View File

@@ -0,0 +1,212 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-P9
; bcdsub.p with predicate 6 and %a as both operands: the expected code reads
; the CR6 "un" bit (setbc on pwr10, mfocrf + rlwinm on pwr9). Going by the
; function name, this presumably serves as a validity check of %a.
define dso_local i64 @test_invalid(<16 x i8> %a) local_unnamed_addr #0 {
; CHECK-LABEL: test_invalid:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: bcdsub. v2, v2, v2, 0
; CHECK-NEXT: setbc r3, 4*cr6+un
; CHECK-NEXT: extsw r3, r3
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: test_invalid:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: bcdsub. v2, v2, v2, 0
; CHECK-P9-NEXT: mfocrf r3, 2
; CHECK-P9-NEXT: rlwinm r3, r3, 28, 31, 31
; CHECK-P9-NEXT: extsw r3, r3
; CHECK-P9-NEXT: blr
entry:
%0 = tail call i32 @llvm.ppc.bcdsub.p(i32 6, <16 x i8> %a, <16 x i8> %a) #2
%conv.i = sext i32 %0 to i64
ret i64 %conv.i
}
; Vector-result bcdadd with immediate 1: expected to select to a single
; bcdadd. with PS = 1 on both CPUs. The i64 %ps argument is unused.
define dso_local <16 x i8> @test_add(<16 x i8> %a, <16 x i8> %b, i64 %ps) local_unnamed_addr #0 {
; CHECK-LABEL: test_add:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: bcdadd. v2, v2, v3, 1
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: test_add:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: bcdadd. v2, v2, v3, 1
; CHECK-P9-NEXT: blr
entry:
%0 = tail call <16 x i8> @llvm.ppc.bcdadd(<16 x i8> %a, <16 x i8> %b, i32 1)
ret <16 x i8> %0
}
; bcdadd.p with predicate 6: the expected code reads the CR6 "un" bit after
; bcdadd. (presumably the overflow indication, going by the function name).
; The i64 %ps argument is unused.
define dso_local i64 @test_add_ofl(<16 x i8> %a, <16 x i8> %b, i64 %ps) local_unnamed_addr #0 {
; CHECK-LABEL: test_add_ofl:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: bcdadd. v2, v2, v3, 0
; CHECK-NEXT: setbc r3, 4*cr6+un
; CHECK-NEXT: extsw r3, r3
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: test_add_ofl:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: bcdadd. v2, v2, v3, 0
; CHECK-P9-NEXT: mfocrf r3, 2
; CHECK-P9-NEXT: rlwinm r3, r3, 28, 31, 31
; CHECK-P9-NEXT: extsw r3, r3
; CHECK-P9-NEXT: blr
entry:
%0 = tail call i32 @llvm.ppc.bcdadd.p(i32 6, <16 x i8> %a, <16 x i8> %b) #2
%conv.i = sext i32 %0 to i64
ret i64 %conv.i
}
; Vector-result bcdsub with immediate 0: expected to select to a single
; bcdsub. with PS = 0 on both CPUs. The i64 %ps argument is unused.
define dso_local <16 x i8> @test_sub(<16 x i8> %a, <16 x i8> %b, i64 %ps) local_unnamed_addr #0 {
; CHECK-LABEL: test_sub:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: test_sub:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-P9-NEXT: blr
entry:
%0 = tail call <16 x i8> @llvm.ppc.bcdsub(<16 x i8> %a, <16 x i8> %b, i32 0)
ret <16 x i8> %0
}
; bcdsub.p with predicate 6: the expected code reads the CR6 "un" bit after
; bcdsub. (presumably the overflow indication, going by the function name).
; The i64 %ps argument is unused.
define dso_local i64 @test_sub_ofl(<16 x i8> %a, <16 x i8> %b, i64 %ps) local_unnamed_addr #0 {
; CHECK-LABEL: test_sub_ofl:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-NEXT: setbc r3, 4*cr6+un
; CHECK-NEXT: extsw r3, r3
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: test_sub_ofl:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-P9-NEXT: mfocrf r3, 2
; CHECK-P9-NEXT: rlwinm r3, r3, 28, 31, 31
; CHECK-P9-NEXT: extsw r3, r3
; CHECK-P9-NEXT: blr
entry:
%0 = tail call i32 @llvm.ppc.bcdsub.p(i32 6, <16 x i8> %a, <16 x i8> %b) #2
%conv.i = sext i32 %0 to i64
ret i64 %conv.i
}
; bcdsub.p with predicate 2: the expected code reads the CR6 "lt" bit
; (rotate amount 25 on the pwr9 path).
define dso_local i64 @test_cmplt(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_cmplt:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-NEXT: setbc r3, 4*cr6+lt
; CHECK-NEXT: extsw r3, r3
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: test_cmplt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-P9-NEXT: mfocrf r3, 2
; CHECK-P9-NEXT: rlwinm r3, r3, 25, 31, 31
; CHECK-P9-NEXT: extsw r3, r3
; CHECK-P9-NEXT: blr
entry:
%0 = tail call i32 @llvm.ppc.bcdsub.p(i32 2, <16 x i8> %a, <16 x i8> %b) #2
%conv.i = sext i32 %0 to i64
ret i64 %conv.i
}
; bcdsub.p with predicate 4: the expected code reads the CR6 "gt" bit
; (rotate amount 26 on the pwr9 path).
define dso_local i64 @test_cmpgt(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_cmpgt:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-NEXT: setbc r3, 4*cr6+gt
; CHECK-NEXT: extsw r3, r3
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: test_cmpgt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-P9-NEXT: mfocrf r3, 2
; CHECK-P9-NEXT: rlwinm r3, r3, 26, 31, 31
; CHECK-P9-NEXT: extsw r3, r3
; CHECK-P9-NEXT: blr
entry:
%0 = tail call i32 @llvm.ppc.bcdsub.p(i32 4, <16 x i8> %a, <16 x i8> %b) #2
%conv.i = sext i32 %0 to i64
ret i64 %conv.i
}
; bcdsub.p with predicate 0: the expected code reads the CR6 "eq" bit
; (rotate amount 27 on the pwr9 path).
define dso_local i64 @test_cmpeq(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_cmpeq:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-NEXT: setbc r3, 4*cr6+eq
; CHECK-NEXT: extsw r3, r3
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: test_cmpeq:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-P9-NEXT: mfocrf r3, 2
; CHECK-P9-NEXT: rlwinm r3, r3, 27, 31, 31
; CHECK-P9-NEXT: extsw r3, r3
; CHECK-P9-NEXT: blr
entry:
%0 = tail call i32 @llvm.ppc.bcdsub.p(i32 0, <16 x i8> %a, <16 x i8> %b) #2
%conv.i = sext i32 %0 to i64
ret i64 %conv.i
}
; bcdsub.p with predicate 3: the expected code returns the inverted CR6 "lt"
; bit (setbcr on pwr10; mfocrf + rlwinm + xori 1 on pwr9).
define dso_local i64 @test_cmpge(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_cmpge:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-NEXT: setbcr r3, 4*cr6+lt
; CHECK-NEXT: extsw r3, r3
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: test_cmpge:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-P9-NEXT: mfocrf r3, 2
; CHECK-P9-NEXT: rlwinm r3, r3, 25, 31, 31
; CHECK-P9-NEXT: xori r3, r3, 1
; CHECK-P9-NEXT: extsw r3, r3
; CHECK-P9-NEXT: blr
entry:
%0 = tail call i32 @llvm.ppc.bcdsub.p(i32 3, <16 x i8> %a, <16 x i8> %b) #2
%conv.i = sext i32 %0 to i64
ret i64 %conv.i
}
; bcdsub.p with predicate 5: the expected code returns the inverted CR6 "gt"
; bit (setbcr on pwr10; mfocrf + rlwinm + xori 1 on pwr9).
define dso_local i64 @test_cmple(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_cmple:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-NEXT: setbcr r3, 4*cr6+gt
; CHECK-NEXT: extsw r3, r3
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: test_cmple:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: bcdsub. v2, v2, v3, 0
; CHECK-P9-NEXT: mfocrf r3, 2
; CHECK-P9-NEXT: rlwinm r3, r3, 26, 31, 31
; CHECK-P9-NEXT: xori r3, r3, 1
; CHECK-P9-NEXT: extsw r3, r3
; CHECK-P9-NEXT: blr
entry:
%0 = tail call i32 @llvm.ppc.bcdsub.p(i32 5, <16 x i8> %a, <16 x i8> %b) #2
%conv.i = sext i32 %0 to i64
ret i64 %conv.i
}
; Intrinsics under test. The predicate forms take the immediate first and
; return i32; the vector forms take the immediate last and return <16 x i8>.
declare i32 @llvm.ppc.bcdsub.p(i32 immarg, <16 x i8>, <16 x i8>) #1
declare i32 @llvm.ppc.bcdadd.p(i32 immarg, <16 x i8>, <16 x i8>) #1
declare <16 x i8> @llvm.ppc.bcdadd(<16 x i8>, <16 x i8>, i32 immarg) #1
declare <16 x i8> @llvm.ppc.bcdsub(<16 x i8>, <16 x i8>, i32 immarg) #1