[InstCombine] try to canonicalize logical shift after bswap

When shifting by a constant amount that is a multiple of 8 bits (whole bytes):
bswap (shl X, C) --> lshr (bswap X), C
bswap (lshr X, C) --> shl (bswap X), C

This is an IR implementation of a transform suggested in D120648.
The "swaps cancel" test models the motivating optimization from
that proposal.

Alive2 proofs are linked below. As noted in the other review, we
could use knownbits to handle shift-by-variable-amount, but that
can be an enhancement patch:
https://alive2.llvm.org/ce/z/pXUaRf
https://alive2.llvm.org/ce/z/ZnaMLf

Differential Revision: https://reviews.llvm.org/D122010
This commit is contained in:
Sanjay Patel
2022-03-22 09:02:28 -04:00
parent 91ea247039
commit 60820e53ec
2 changed files with 46 additions and 18 deletions

View File

@@ -1349,6 +1349,24 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Value *IIOperand = II->getArgOperand(0);
Value *X = nullptr;
// Try to canonicalize bswap-of-logical-shift-by-8-bit-multiple as
// inverse-shift-of-bswap:
// bswap (shl X, C) --> lshr (bswap X), C
// bswap (lshr X, C) --> shl (bswap X), C
// TODO: Use knownbits to allow variable shift and non-splat vector match.
BinaryOperator *BO;
if (match(IIOperand, m_OneUse(m_BinOp(BO)))) {
const APInt *C;
if (match(BO, m_LogicalShift(m_Value(X), m_APIntAllowUndef(C))) &&
(*C & 7) == 0) {
Value *NewSwap = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, X);
BinaryOperator::BinaryOps InverseShift =
BO->getOpcode() == Instruction::Shl ? Instruction::LShr
: Instruction::Shl;
return BinaryOperator::Create(InverseShift, NewSwap, BO->getOperand(1));
}
}
KnownBits Known = computeKnownBits(IIOperand, 0, II);
uint64_t LZ = alignDown(Known.countMinLeadingZeros(), 8);
uint64_t TZ = alignDown(Known.countMinTrailingZeros(), 8);

View File

@@ -26,8 +26,8 @@ define i32 @test6(i32 %a) {
define i32 @lshr8_i32(i32 %x) {
; CHECK-LABEL: @lshr8_i32(
; CHECK-NEXT: [[S:%.*]] = lshr i32 [[X:%.*]], 8
; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.bswap.i32(i32 [[S]])
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[X:%.*]])
; CHECK-NEXT: [[R:%.*]] = shl i32 [[TMP1]], 8
; CHECK-NEXT: ret i32 [[R]]
;
%s = lshr i32 %x, 8
@@ -37,8 +37,8 @@ define i32 @lshr8_i32(i32 %x) {
define <2 x i32> @lshr16_v2i32(<2 x i32> %x) {
; CHECK-LABEL: @lshr16_v2i32(
; CHECK-NEXT: [[S:%.*]] = lshr <2 x i32> [[X:%.*]], <i32 16, i32 16>
; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[S]])
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[X:%.*]])
; CHECK-NEXT: [[R:%.*]] = shl <2 x i32> [[TMP1]], <i32 16, i32 16>
; CHECK-NEXT: ret <2 x i32> [[R]]
;
%s = lshr <2 x i32> %x, <i32 16, i32 16>
@@ -48,14 +48,16 @@ define <2 x i32> @lshr16_v2i32(<2 x i32> %x) {
define i32 @lshr24_i32(i32 %x) {
; CHECK-LABEL: @lshr24_i32(
; CHECK-NEXT: [[S:%.*]] = and i32 [[X:%.*]], -16777216
; CHECK-NEXT: ret i32 [[S]]
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], -16777216
; CHECK-NEXT: ret i32 [[TMP1]]
;
%s = lshr i32 %x, 24
%r = call i32 @llvm.bswap.i32(i32 %s)
ret i32 %r
}
; negative test - need shift-by-8-bit-multiple
define i32 @lshr12_i32(i32 %x) {
; CHECK-LABEL: @lshr12_i32(
; CHECK-NEXT: [[S:%.*]] = lshr i32 [[X:%.*]], 12
@@ -67,6 +69,8 @@ define i32 @lshr12_i32(i32 %x) {
ret i32 %r
}
; negative test - uses
define i32 @lshr8_i32_use(i32 %x, i32* %p) {
; CHECK-LABEL: @lshr8_i32_use(
; CHECK-NEXT: [[S:%.*]] = lshr i32 [[X:%.*]], 12
@@ -82,8 +86,8 @@ define i32 @lshr8_i32_use(i32 %x, i32* %p) {
define i64 @shl16_i64(i64 %x) {
; CHECK-LABEL: @shl16_i64(
; CHECK-NEXT: [[S:%.*]] = shl i64 [[X:%.*]], 16
; CHECK-NEXT: [[R:%.*]] = call i64 @llvm.bswap.i64(i64 [[S]])
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[X:%.*]])
; CHECK-NEXT: [[R:%.*]] = lshr i64 [[TMP1]], 16
; CHECK-NEXT: ret i64 [[R]]
;
%s = shl i64 %x, 16
@@ -91,10 +95,12 @@ define i64 @shl16_i64(i64 %x) {
ret i64 %r
}
; poison vector element propagates
define <2 x i64> @shl16_v2i64(<2 x i64> %x) {
; CHECK-LABEL: @shl16_v2i64(
; CHECK-NEXT: [[S:%.*]] = shl <2 x i64> [[X:%.*]], <i64 poison, i64 24>
; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[S]])
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[X:%.*]])
; CHECK-NEXT: [[R:%.*]] = lshr <2 x i64> [[TMP1]], <i64 poison, i64 24>
; CHECK-NEXT: ret <2 x i64> [[R]]
;
%s = shl <2 x i64> %x, <i64 poison, i64 24>
@@ -104,14 +110,16 @@ define <2 x i64> @shl16_v2i64(<2 x i64> %x) {
define i64 @shl56_i64(i64 %x) {
; CHECK-LABEL: @shl56_i64(
; CHECK-NEXT: [[S:%.*]] = and i64 [[X:%.*]], 255
; CHECK-NEXT: ret i64 [[S]]
; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[X:%.*]], 255
; CHECK-NEXT: ret i64 [[TMP1]]
;
%s = shl i64 %x, 56
%r = call i64 @llvm.bswap.i64(i64 %s)
ret i64 %r
}
; negative test - need shift-by-8-bit-multiple
define i64 @shl42_i64(i64 %x) {
; CHECK-LABEL: @shl42_i64(
; CHECK-NEXT: [[S:%.*]] = shl i64 [[X:%.*]], 42
@@ -123,6 +131,8 @@ define i64 @shl42_i64(i64 %x) {
ret i64 %r
}
; negative test - uses
define i32 @shl8_i32_use(i32 %x, i32* %p) {
; CHECK-LABEL: @shl8_i32_use(
; CHECK-NEXT: [[S:%.*]] = shl i32 [[X:%.*]], 8
@@ -136,11 +146,11 @@ define i32 @shl8_i32_use(i32 %x, i32* %p) {
ret i32 %r
}
; swaps cancel
define i64 @swap_shl16_i64(i64 %x) {
; CHECK-LABEL: @swap_shl16_i64(
; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.bswap.i64(i64 [[X:%.*]])
; CHECK-NEXT: [[S:%.*]] = shl i64 [[B]], 16
; CHECK-NEXT: [[R:%.*]] = call i64 @llvm.bswap.i64(i64 [[S]])
; CHECK-NEXT: [[R:%.*]] = lshr i64 [[X:%.*]], 16
; CHECK-NEXT: ret i64 [[R]]
;
%b = call i64 @llvm.bswap.i64(i64 %x)
@@ -536,11 +546,11 @@ define <2 x i64> @bs_active_high_different_negative(<2 x i64> %0) {
ret <2 x i64> %3
}
; negative test
; TODO: This should fold to 'and'.
define <2 x i64> @bs_active_high_undef(<2 x i64> %0) {
; CHECK-LABEL: @bs_active_high_undef(
; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP0:%.*]], <i64 56, i64 undef>
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP2]])
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP0:%.*]])
; CHECK-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], <i64 56, i64 undef>
; CHECK-NEXT: ret <2 x i64> [[TMP3]]
;
%2 = shl <2 x i64> %0, <i64 56, i64 undef>