[X86] combineCMP - attempt to simplify KSHIFTR mask element extractions when just comparing against zero
We can bitcast the pre-shifted mask to an integer and use TEST/BT directly. This can be extended further to better handle sub-i8 mask cases, but just getting rid of the KSHIFTR nodes already makes a notable difference.
This commit is contained in:
@@ -53487,6 +53487,7 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
|
||||
SDLoc dl(N);
|
||||
SDValue Op = N->getOperand(0);
|
||||
EVT VT = Op.getValueType();
|
||||
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
|
||||
|
||||
// If we have a constant logical shift that's only used in a comparison
|
||||
// against zero turn it into an equivalent AND. This allows turning it into
|
||||
@@ -53510,12 +53511,42 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
|
||||
}
|
||||
}
|
||||
|
||||
// If we're extracting from an AVX512 bool vector and comparing against zero,
|
||||
// then try to just bitcast the vector to an integer to use TEST/BT directly.
|
||||
// TODO: Handle v2i1/v4i1 bool vector cases.
|
||||
// (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
|
||||
if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
|
||||
Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
|
||||
SDValue Src = Op.getOperand(0);
|
||||
if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
|
||||
isNullConstant(Src.getOperand(1)) &&
|
||||
Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
|
||||
SDValue BoolVec = Src.getOperand(0);
|
||||
EVT VecVT = BoolVec.getValueType();
|
||||
unsigned BitWidth = VecVT.getVectorNumElements();
|
||||
EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
|
||||
if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
|
||||
unsigned ShAmt = 0;
|
||||
if (BoolVec.getOpcode() == X86ISD::KSHIFTR &&
|
||||
BoolVec.getConstantOperandAPInt(1).ult(BitWidth)) {
|
||||
ShAmt = BoolVec.getConstantOperandVal(1);
|
||||
BoolVec = BoolVec.getOperand(0);
|
||||
}
|
||||
APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
|
||||
Op = DAG.getBitcast(BCVT, BoolVec);
|
||||
Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
|
||||
DAG.getConstant(Mask, dl, BCVT));
|
||||
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
|
||||
DAG.getConstant(0, dl, VT));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Peek through any zero-extend if we're only testing for a zero result.
|
||||
if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
|
||||
SDValue Src = Op.getOperand(0);
|
||||
EVT SrcVT = Src.getValueType();
|
||||
if (SrcVT.getScalarSizeInBits() >= 8 &&
|
||||
DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
|
||||
if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
|
||||
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
|
||||
DAG.getConstant(0, dl, SrcVT));
|
||||
}
|
||||
|
||||
@@ -175,9 +175,8 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) nounwind {
|
||||
; KNL-LABEL: test11:
|
||||
; KNL: ## %bb.0:
|
||||
; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0
|
||||
; KNL-NEXT: kshiftrw $4, %k0, %k0
|
||||
; KNL-NEXT: kmovw %k0, %eax
|
||||
; KNL-NEXT: testb $1, %al
|
||||
; KNL-NEXT: testb $16, %al
|
||||
; KNL-NEXT: je LBB10_2
|
||||
; KNL-NEXT: ## %bb.1: ## %A
|
||||
; KNL-NEXT: vmovdqa64 %zmm1, %zmm0
|
||||
@@ -189,9 +188,8 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) nounwind {
|
||||
; SKX-LABEL: test11:
|
||||
; SKX: ## %bb.0:
|
||||
; SKX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
|
||||
; SKX-NEXT: kshiftrw $4, %k0, %k0
|
||||
; SKX-NEXT: kmovd %k0, %eax
|
||||
; SKX-NEXT: testb $1, %al
|
||||
; SKX-NEXT: testb $16, %al
|
||||
; SKX-NEXT: je LBB10_2
|
||||
; SKX-NEXT: ## %bb.1: ## %A
|
||||
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
|
||||
@@ -276,9 +274,8 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) nounwind {
|
||||
; KNL: ## %bb.0:
|
||||
; KNL-NEXT: movq %rdi, %rax
|
||||
; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
|
||||
; KNL-NEXT: kshiftrw $4, %k0, %k0
|
||||
; KNL-NEXT: kmovw %k0, %ecx
|
||||
; KNL-NEXT: testb $1, %cl
|
||||
; KNL-NEXT: testb $16, %cl
|
||||
; KNL-NEXT: cmoveq %rsi, %rax
|
||||
; KNL-NEXT: vzeroupper
|
||||
; KNL-NEXT: retq
|
||||
@@ -287,9 +284,8 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) nounwind {
|
||||
; SKX: ## %bb.0:
|
||||
; SKX-NEXT: movq %rdi, %rax
|
||||
; SKX-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
|
||||
; SKX-NEXT: kshiftrb $4, %k0, %k0
|
||||
; SKX-NEXT: kmovd %k0, %ecx
|
||||
; SKX-NEXT: testb $1, %cl
|
||||
; SKX-NEXT: testb $16, %cl
|
||||
; SKX-NEXT: cmoveq %rsi, %rax
|
||||
; SKX-NEXT: vzeroupper
|
||||
; SKX-NEXT: retq
|
||||
|
||||
@@ -4350,10 +4350,9 @@ define i32 @PR39665_c_ray(<2 x double> %x, <2 x double> %y) {
|
||||
; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
|
||||
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
|
||||
; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k0
|
||||
; KNL-NEXT: kshiftrw $1, %k0, %k1
|
||||
; KNL-NEXT: kmovw %k1, %eax
|
||||
; KNL-NEXT: kmovw %k0, %ecx
|
||||
; KNL-NEXT: testb $1, %al
|
||||
; KNL-NEXT: kmovw %k0, %eax
|
||||
; KNL-NEXT: testb $2, %al
|
||||
; KNL-NEXT: movl $42, %eax
|
||||
; KNL-NEXT: movl $99, %edx
|
||||
; KNL-NEXT: cmovel %edx, %eax
|
||||
@@ -4365,10 +4364,9 @@ define i32 @PR39665_c_ray(<2 x double> %x, <2 x double> %y) {
|
||||
; SKX-LABEL: PR39665_c_ray:
|
||||
; SKX: # %bb.0:
|
||||
; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0
|
||||
; SKX-NEXT: kshiftrb $1, %k0, %k1
|
||||
; SKX-NEXT: kmovd %k1, %eax
|
||||
; SKX-NEXT: kmovd %k0, %ecx
|
||||
; SKX-NEXT: testb $1, %al
|
||||
; SKX-NEXT: kmovd %k0, %eax
|
||||
; SKX-NEXT: testb $2, %al
|
||||
; SKX-NEXT: movl $42, %eax
|
||||
; SKX-NEXT: movl $99, %edx
|
||||
; SKX-NEXT: cmovel %edx, %eax
|
||||
|
||||
@@ -11,32 +11,30 @@ target triple = "x86_64-unknown-linux-gnu"
|
||||
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
|
||||
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
|
||||
; KNL-NEXT: kshiftrw $2, %k0, %k1
|
||||
; KNL-NEXT: kshiftrw $1, %k1, %k2
|
||||
; KNL-NEXT: kmovw %k1, %eax
|
||||
; KNL-NEXT: testb $1, %al
|
||||
; KNL-NEXT: kmovw %k1, %ecx
|
||||
; KNL-NEXT: testb $1, %cl
|
||||
; KNL-NEXT: fld1
|
||||
; KNL-NEXT: fldz
|
||||
; KNL-NEXT: fld %st(0)
|
||||
; KNL-NEXT: fcmovne %st(2), %st
|
||||
; KNL-NEXT: kmovw %k2, %eax
|
||||
; KNL-NEXT: testb $1, %al
|
||||
; KNL-NEXT: testb $2, %al
|
||||
; KNL-NEXT: fld %st(1)
|
||||
; KNL-NEXT: fcmovne %st(3), %st
|
||||
; KNL-NEXT: kshiftrw $1, %k0, %k1
|
||||
; KNL-NEXT: kmovw %k1, %eax
|
||||
; KNL-NEXT: testb $1, %al
|
||||
; KNL-NEXT: kmovw %k0, %eax
|
||||
; KNL-NEXT: kmovw %k0, %ecx
|
||||
; KNL-NEXT: testb $1, %cl
|
||||
; KNL-NEXT: fld %st(2)
|
||||
; KNL-NEXT: fcmovne %st(4), %st
|
||||
; KNL-NEXT: kmovw %k0, %eax
|
||||
; KNL-NEXT: testb $1, %al
|
||||
; KNL-NEXT: testb $2, %al
|
||||
; KNL-NEXT: fxch %st(3)
|
||||
; KNL-NEXT: fcmovne %st(4), %st
|
||||
; KNL-NEXT: fstp %st(4)
|
||||
; KNL-NEXT: fxch %st(3)
|
||||
; KNL-NEXT: fstpt (%rdi)
|
||||
; KNL-NEXT: fxch %st(1)
|
||||
; KNL-NEXT: fstpt 10(%rdi)
|
||||
; KNL-NEXT: fxch %st(1)
|
||||
; KNL-NEXT: fstpt (%rdi)
|
||||
; KNL-NEXT: fxch %st(1)
|
||||
; KNL-NEXT: fstpt 30(%rdi)
|
||||
; KNL-NEXT: fstpt 20(%rdi)
|
||||
; KNL-NEXT: vzeroupper
|
||||
@@ -47,32 +45,30 @@ target triple = "x86_64-unknown-linux-gnu"
|
||||
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
|
||||
; SKX-NEXT: vpmovd2m %xmm0, %k0
|
||||
; SKX-NEXT: kshiftrb $2, %k0, %k1
|
||||
; SKX-NEXT: kshiftrb $1, %k1, %k2
|
||||
; SKX-NEXT: kmovd %k1, %eax
|
||||
; SKX-NEXT: testb $1, %al
|
||||
; SKX-NEXT: kmovd %k1, %ecx
|
||||
; SKX-NEXT: testb $1, %cl
|
||||
; SKX-NEXT: fld1
|
||||
; SKX-NEXT: fldz
|
||||
; SKX-NEXT: fld %st(0)
|
||||
; SKX-NEXT: fcmovne %st(2), %st
|
||||
; SKX-NEXT: kmovd %k2, %eax
|
||||
; SKX-NEXT: testb $1, %al
|
||||
; SKX-NEXT: testb $2, %al
|
||||
; SKX-NEXT: fld %st(1)
|
||||
; SKX-NEXT: fcmovne %st(3), %st
|
||||
; SKX-NEXT: kshiftrb $1, %k0, %k1
|
||||
; SKX-NEXT: kmovd %k1, %eax
|
||||
; SKX-NEXT: testb $1, %al
|
||||
; SKX-NEXT: kmovd %k0, %eax
|
||||
; SKX-NEXT: kmovd %k0, %ecx
|
||||
; SKX-NEXT: testb $1, %cl
|
||||
; SKX-NEXT: fld %st(2)
|
||||
; SKX-NEXT: fcmovne %st(4), %st
|
||||
; SKX-NEXT: kmovd %k0, %eax
|
||||
; SKX-NEXT: testb $1, %al
|
||||
; SKX-NEXT: testb $2, %al
|
||||
; SKX-NEXT: fxch %st(3)
|
||||
; SKX-NEXT: fcmovne %st(4), %st
|
||||
; SKX-NEXT: fstp %st(4)
|
||||
; SKX-NEXT: fxch %st(3)
|
||||
; SKX-NEXT: fstpt (%rdi)
|
||||
; SKX-NEXT: fxch %st(1)
|
||||
; SKX-NEXT: fstpt 10(%rdi)
|
||||
; SKX-NEXT: fxch %st(1)
|
||||
; SKX-NEXT: fstpt (%rdi)
|
||||
; SKX-NEXT: fxch %st(1)
|
||||
; SKX-NEXT: fstpt 30(%rdi)
|
||||
; SKX-NEXT: fstpt 20(%rdi)
|
||||
; SKX-NEXT: retq
|
||||
|
||||
@@ -49,24 +49,22 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
|
||||
; AVX512VL-LABEL: test:
|
||||
; AVX512VL: # %bb.0:
|
||||
; AVX512VL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
|
||||
; AVX512VL-NEXT: kshiftrb $1, %k0, %k1
|
||||
; AVX512VL-NEXT: kshiftrb $2, %k0, %k2
|
||||
; AVX512VL-NEXT: kmovd %k0, %eax
|
||||
; AVX512VL-NEXT: testb $1, %al
|
||||
; AVX512VL-NEXT: kshiftrb $2, %k0, %k1
|
||||
; AVX512VL-NEXT: kmovd %k0, %ecx
|
||||
; AVX512VL-NEXT: testb $2, %cl
|
||||
; AVX512VL-NEXT: fld1
|
||||
; AVX512VL-NEXT: fldz
|
||||
; AVX512VL-NEXT: fld %st(0)
|
||||
; AVX512VL-NEXT: fcmovne %st(2), %st
|
||||
; AVX512VL-NEXT: kmovd %k1, %eax
|
||||
; AVX512VL-NEXT: testb $1, %al
|
||||
; AVX512VL-NEXT: fld %st(1)
|
||||
; AVX512VL-NEXT: fcmovne %st(3), %st
|
||||
; AVX512VL-NEXT: kshiftrb $1, %k2, %k0
|
||||
; AVX512VL-NEXT: kmovd %k0, %eax
|
||||
; AVX512VL-NEXT: testb $1, %al
|
||||
; AVX512VL-NEXT: kmovd %k1, %eax
|
||||
; AVX512VL-NEXT: kmovd %k1, %ecx
|
||||
; AVX512VL-NEXT: testb $2, %cl
|
||||
; AVX512VL-NEXT: fld %st(2)
|
||||
; AVX512VL-NEXT: fcmovne %st(4), %st
|
||||
; AVX512VL-NEXT: kmovd %k2, %eax
|
||||
; AVX512VL-NEXT: testb $1, %al
|
||||
; AVX512VL-NEXT: fxch %st(3)
|
||||
; AVX512VL-NEXT: fcmovne %st(4), %st
|
||||
@@ -81,10 +79,10 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
|
||||
; AVX512VL-NEXT: fstpt 10(%rdi)
|
||||
; AVX512VL-NEXT: fxch %st(1)
|
||||
; AVX512VL-NEXT: fadd %st, %st(0)
|
||||
; AVX512VL-NEXT: fstpt (%rdi)
|
||||
; AVX512VL-NEXT: fadd %st, %st(0)
|
||||
; AVX512VL-NEXT: fstpt 20(%rdi)
|
||||
; AVX512VL-NEXT: fadd %st, %st(0)
|
||||
; AVX512VL-NEXT: fstpt (%rdi)
|
||||
; AVX512VL-NEXT: fadd %st, %st(0)
|
||||
; AVX512VL-NEXT: fstpt 60(%rdi)
|
||||
; AVX512VL-NEXT: fadd %st, %st(0)
|
||||
; AVX512VL-NEXT: fstpt 40(%rdi)
|
||||
|
||||
Reference in New Issue
Block a user