[ARM] Revert WLSTP to DLSTP if the target block is out of range

If the block target for a WLSTP instruction is known to be out of range,
and cannot be fixed by the ARMBlockPlacementPass, we can relax it to a
DLSTP (and cmp/branch) to still allow the creation of tail predicated
loops. That is what this patch does, adding extra revert code to the
fallback path of ARMBlockPlacementPass.

Because of the code produced when reverting, the DLSTP ends up between a
Bcc and a Br. As a DLS isn't necessarily a terminator, we need to split
the block and move the DLS/Br into the new block.

Differential Revision: https://reviews.llvm.org/D104709
This commit is contained in:
David Green
2021-08-02 10:59:52 +01:00
parent 58cc5a4c9f
commit 2829391840
4 changed files with 136 additions and 75 deletions

View File

@@ -41,6 +41,7 @@ public:
bool blockIsBefore(MachineBasicBlock *BB, MachineBasicBlock *Other);
bool fixBackwardsWLS(MachineLoop *ML);
bool processPostOrderLoops(MachineLoop *ML);
bool revertWhileToDo(MachineInstr *WLS, MachineLoop *ML);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -82,6 +83,66 @@ static MachineInstr *findWLS(MachineLoop *ML) {
return nullptr;
}
// Rewrite a WhileLoopStart as a compare-and-branch followed by an equivalent
// DoLoopStart. The DoLoopStart and the trailing unconditional branch are
// placed in a newly created block inserted directly after the preheader.
//
// Before:
//     lr = t2WhileLoopStartTP r0, r1, TgtBB
//     t2Br Ph
// After:
//     cmp r0, 0
//     brcc TgtBB
//   block2:
//     LR = t2DoLoopStartTP r0, r1
//     t2Br Ph
//
// WLS must be immediately followed by an unconditional t2B that is the last
// instruction of its block (asserted below). ML is not used by this body.
// Always returns true.
bool ARMBlockPlacement::revertWhileToDo(MachineInstr *WLS, MachineLoop *ML) {
  MachineBasicBlock *Preheader = WLS->getParent();
  auto *MF = Preheader->getParent();

  // The WLS has to be the second-to-last instruction, followed by a t2B whose
  // predicate operand is 14 (CC::al).
  MachineInstr *Br = &Preheader->back();
  assert(WLS != Br);
  assert(WLS->getNextNode() == Br);
  assert(Br->getOpcode() == ARM::t2B);
  assert(Br->getOperand(1).getImm() == 14);

  // Captured up front; WLS is handed to RevertWhileLoopStartLR further down.
  const bool IsTailPredicated = WLS->getOpcode() == ARM::t2WhileLoopStartTP;
  const unsigned DoLoopOpc =
      IsTailPredicated ? ARM::t2DoLoopStartTP : ARM::t2DoLoopStart;
  MachineBasicBlock *LoopEntry = Br->getOperand(0).getMBB();

  // The cmp/bcc will no longer kill any operands, so drop the kill flags
  // before the operands are reused for the DLS.
  WLS->getOperand(1).setIsKill(false);
  if (IsTailPredicated)
    WLS->getOperand(2).setIsKill(false);

  // Make a fresh block and place it right after the preheader.
  MachineBasicBlock *NewBlock =
      MF->CreateMachineBasicBlock(Preheader->getBasicBlock());
  MF->insert(++Preheader->getIterator(), NewBlock);

  // Relocate the unconditional branch to the end of the new block.
  Br->removeFromParent();
  NewBlock->insert(NewBlock->end(), Br);

  // Re-route the CFG edge: Preheader -> NewBlock -> branch target.
  Preheader->replaceSuccessor(LoopEntry, NewBlock);
  NewBlock->addSuccessor(LoopEntry);

  // Emit the DLS in front of the moved branch, reusing the WLS operands.
  MachineInstrBuilder MIB =
      BuildMI(*NewBlock, Br, WLS->getDebugLoc(), TII->get(DoLoopOpc));
  MIB.add(WLS->getOperand(0));
  MIB.add(WLS->getOperand(1));
  if (IsTailPredicated)
    MIB.add(WLS->getOperand(2));

  LLVM_DEBUG(dbgs() << DEBUG_PREFIX
                    << "Reverting While Loop to Do Loop: " << *WLS << "\n");

  // Replace the WLS itself (the cmp/brcc pair in the diagram above).
  RevertWhileLoopStartLR(WLS, TII, ARM::t2Bcc, true);

  // The new block needs its live-in list computed.
  LivePhysRegs LiveRegs;
  computeAndAddLiveIns(LiveRegs, *NewBlock);

  // A block was added, so renumber and refresh the cached sizes/offsets.
  MF->RenumberBlocks();
  BBUtils->computeAllBlockSizes();
  BBUtils->adjustBBOffsetsAfter(Preheader);

  return true;
}
/// Checks if loop has a backwards branching WLS, and if possible, fixes it.
/// This requires checking the predecessor (ie. preheader or its predecessor)
/// for a WLS and if its loopExit/target is before it.
@@ -130,7 +191,7 @@ bool ARMBlockPlacement::fixBackwardsWLS(MachineLoop *ML) {
<< "Can't move Predecessor"
"block as it would convert a WLS from forward to a "
"backwards branching WLS\n");
return false;
return revertWhileToDo(WlsInstr, ML);
}
}
}
@@ -225,5 +286,5 @@ void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB,
F->RenumberBlocks();
BBUtils->computeAllBlockSizes();
BBUtils->adjustBBOffsetsAfter(&F->front());
BBUtils->adjustBBOffsetsAfter(BB);
}

View File

@@ -12,7 +12,7 @@ define i32 @a(i8 zeroext %b, [3 x i8]* nocapture readonly %c, [3 x i32]* nocaptu
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: cmp r0, #1
; CHECK-NEXT: bls.w .LBB0_11
; CHECK-NEXT: bls.w .LBB0_12
; CHECK-NEXT: @ %bb.1: @ %for.body.us.preheader
; CHECK-NEXT: movw r5, :lower16:arr_183
; CHECK-NEXT: movs r3, #0
@@ -31,7 +31,7 @@ define i32 @a(i8 zeroext %b, [3 x i8]* nocapture readonly %c, [3 x i32]* nocaptu
; CHECK-NEXT: @ Child Loop BB0_4 Depth 2
; CHECK-NEXT: @ Child Loop BB0_6 Depth 2
; CHECK-NEXT: @ Child Loop BB0_8 Depth 2
; CHECK-NEXT: @ Child Loop BB0_10 Depth 2
; CHECK-NEXT: @ Child Loop BB0_11 Depth 2
; CHECK-NEXT: ldr.w r0, [r2, r3, lsl #2]
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: ite ne
@@ -87,28 +87,26 @@ define i32 @a(i8 zeroext %b, [3 x i8]* nocapture readonly %c, [3 x i32]* nocaptu
; CHECK-NEXT: add.w r4, r0, #15
; CHECK-NEXT: adds r3, #19
; CHECK-NEXT: lsrs r4, r4, #4
; CHECK-NEXT: subs.w lr, r4, #0
; CHECK-NEXT: cmp.w r4, #0
; CHECK-NEXT: beq .LBB0_2
; CHECK-NEXT: b .LBB0_10
; CHECK-NEXT: .LBB0_10: @ Parent Loop BB0_3 Depth=1
; CHECK-NEXT: @ %bb.10: @ %land.end.us.2
; CHECK-NEXT: @ in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: dlstp.8 lr, r0
; CHECK-NEXT: .LBB0_11: @ Parent Loop BB0_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.8 r0
; CHECK-NEXT: subs r0, #16
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrbt.8 q3, [r3], #16
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: bne .LBB0_10
; CHECK-NEXT: vstrb.8 q3, [r3], #16
; CHECK-NEXT: letp lr, .LBB0_11
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_11:
; CHECK-NEXT: .LBB0_12:
; CHECK-NEXT: movw r12, :lower16:arr_183
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: movt r12, :upper16:arr_183
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: vmov.i32 q2, #0x0
; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: b .LBB0_13
; CHECK-NEXT: .LBB0_12: @ %for.body.lr.ph.3
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: b .LBB0_14
; CHECK-NEXT: .LBB0_13: @ %for.body.lr.ph.3
; CHECK-NEXT: @ in Loop: Header=BB0_14 Depth=1
; CHECK-NEXT: ldr r3, [r2, #4]
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: ite ne
@@ -117,18 +115,18 @@ define i32 @a(i8 zeroext %b, [3 x i8]* nocapture readonly %c, [3 x i32]* nocaptu
; CHECK-NEXT: add.w r5, r12, r3
; CHECK-NEXT: rsb.w r3, r3, #108
; CHECK-NEXT: add.w r4, r5, #19
; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_13
; CHECK-NEXT: b .LBB0_23
; CHECK-NEXT: .LBB0_13: @ %for.cond
; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_14
; CHECK-NEXT: b .LBB0_24
; CHECK-NEXT: .LBB0_14: @ %for.cond
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_15 Depth 2
; CHECK-NEXT: @ Child Loop BB0_18 Depth 2
; CHECK-NEXT: @ Child Loop BB0_21 Depth 2
; CHECK-NEXT: @ Child Loop BB0_23 Depth 2
; CHECK-NEXT: @ Child Loop BB0_16 Depth 2
; CHECK-NEXT: @ Child Loop BB0_19 Depth 2
; CHECK-NEXT: @ Child Loop BB0_22 Depth 2
; CHECK-NEXT: @ Child Loop BB0_24 Depth 2
; CHECK-NEXT: cmp r0, #2
; CHECK-NEXT: blo .LBB0_16
; CHECK-NEXT: @ %bb.14: @ %for.body.lr.ph
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: blo .LBB0_17
; CHECK-NEXT: @ %bb.15: @ %for.body.lr.ph
; CHECK-NEXT: @ in Loop: Header=BB0_14 Depth=1
; CHECK-NEXT: ldr r3, [r2, #4]
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: ite ne
@@ -137,17 +135,17 @@ define i32 @a(i8 zeroext %b, [3 x i8]* nocapture readonly %c, [3 x i32]* nocaptu
; CHECK-NEXT: add.w r5, r12, r3
; CHECK-NEXT: rsb.w r3, r3, #108
; CHECK-NEXT: add.w r4, r5, #19
; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_16
; CHECK-NEXT: .LBB0_15: @ Parent Loop BB0_13 Depth=1
; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_17
; CHECK-NEXT: .LBB0_16: @ Parent Loop BB0_14 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q0, [r4], #16
; CHECK-NEXT: letp lr, .LBB0_15
; CHECK-NEXT: .LBB0_16: @ %for.cond.backedge
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: letp lr, .LBB0_16
; CHECK-NEXT: .LBB0_17: @ %for.cond.backedge
; CHECK-NEXT: @ in Loop: Header=BB0_14 Depth=1
; CHECK-NEXT: cmp r0, #2
; CHECK-NEXT: blo .LBB0_19
; CHECK-NEXT: @ %bb.17: @ %for.body.lr.ph.1
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: blo .LBB0_20
; CHECK-NEXT: @ %bb.18: @ %for.body.lr.ph.1
; CHECK-NEXT: @ in Loop: Header=BB0_14 Depth=1
; CHECK-NEXT: ldr r3, [r2, #4]
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: ite ne
@@ -156,17 +154,17 @@ define i32 @a(i8 zeroext %b, [3 x i8]* nocapture readonly %c, [3 x i32]* nocaptu
; CHECK-NEXT: add.w r5, r12, r3
; CHECK-NEXT: rsb.w r3, r3, #108
; CHECK-NEXT: add.w r4, r5, #19
; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_19
; CHECK-NEXT: .LBB0_18: @ Parent Loop BB0_13 Depth=1
; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_20
; CHECK-NEXT: .LBB0_19: @ Parent Loop BB0_14 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q1, [r4], #16
; CHECK-NEXT: letp lr, .LBB0_18
; CHECK-NEXT: .LBB0_19: @ %for.cond.backedge.1
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: letp lr, .LBB0_19
; CHECK-NEXT: .LBB0_20: @ %for.cond.backedge.1
; CHECK-NEXT: @ in Loop: Header=BB0_14 Depth=1
; CHECK-NEXT: cmp r0, #2
; CHECK-NEXT: blo .LBB0_22
; CHECK-NEXT: @ %bb.20: @ %for.body.lr.ph.2
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: blo .LBB0_23
; CHECK-NEXT: @ %bb.21: @ %for.body.lr.ph.2
; CHECK-NEXT: @ in Loop: Header=BB0_14 Depth=1
; CHECK-NEXT: ldr r3, [r2, #4]
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: ite ne
@@ -175,21 +173,21 @@ define i32 @a(i8 zeroext %b, [3 x i8]* nocapture readonly %c, [3 x i32]* nocaptu
; CHECK-NEXT: add.w r5, r12, r3
; CHECK-NEXT: rsb.w r3, r3, #108
; CHECK-NEXT: add.w r4, r5, #19
; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_22
; CHECK-NEXT: .LBB0_21: @ Parent Loop BB0_13 Depth=1
; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_23
; CHECK-NEXT: .LBB0_22: @ Parent Loop BB0_14 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q2, [r4], #16
; CHECK-NEXT: letp lr, .LBB0_21
; CHECK-NEXT: .LBB0_22: @ %for.cond.backedge.2
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: letp lr, .LBB0_22
; CHECK-NEXT: .LBB0_23: @ %for.cond.backedge.2
; CHECK-NEXT: @ in Loop: Header=BB0_14 Depth=1
; CHECK-NEXT: cmp r0, #2
; CHECK-NEXT: blo .LBB0_13
; CHECK-NEXT: b .LBB0_12
; CHECK-NEXT: .LBB0_23: @ Parent Loop BB0_13 Depth=1
; CHECK-NEXT: blo .LBB0_14
; CHECK-NEXT: b .LBB0_13
; CHECK-NEXT: .LBB0_24: @ Parent Loop BB0_14 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q3, [r4], #16
; CHECK-NEXT: letp lr, .LBB0_23
; CHECK-NEXT: b .LBB0_13
; CHECK-NEXT: letp lr, .LBB0_24
; CHECK-NEXT: b .LBB0_14
entry:
%cmp = icmp ugt i8 %b, 1
br i1 %cmp, label %for.body.us.preheader, label %for.cond.preheader

View File

@@ -475,13 +475,13 @@ define void @multilooped_exit(i32 %b) {
; CHECK-NEXT: @ in Loop: Header=BB18_3 Depth=1
; CHECK-NEXT: adds r4, #1
; CHECK-NEXT: cmp.w r4, #1024
; CHECK-NEXT: bge .LBB18_11
; CHECK-NEXT: bge .LBB18_12
; CHECK-NEXT: .LBB18_3: @ %loop
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB18_4 Depth 2
; CHECK-NEXT: @ Child Loop BB18_6 Depth 2
; CHECK-NEXT: @ Child Loop BB18_8 Depth 2
; CHECK-NEXT: @ Child Loop BB18_10 Depth 2
; CHECK-NEXT: @ Child Loop BB18_11 Depth 2
; CHECK-NEXT: movw r3, :lower16:arr_56
; CHECK-NEXT: add.w r1, r0, #15
; CHECK-NEXT: movt r3, :upper16:arr_56
@@ -510,20 +510,17 @@ define void @multilooped_exit(i32 %b) {
; CHECK-NEXT: letp lr, .LBB18_8
; CHECK-NEXT: .LBB18_9: @ %loop
; CHECK-NEXT: @ in Loop: Header=BB18_3 Depth=1
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: subs.w lr, r12, #0
; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: beq .LBB18_2
; CHECK-NEXT: b .LBB18_10
; CHECK-NEXT: .LBB18_10: @ Parent Loop BB18_3 Depth=1
; CHECK-NEXT: @ %bb.10: @ %loop
; CHECK-NEXT: @ in Loop: Header=BB18_3 Depth=1
; CHECK-NEXT: dlstp.8 lr, r0
; CHECK-NEXT: .LBB18_11: @ Parent Loop BB18_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.8 r1
; CHECK-NEXT: subs r1, #16
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrbt.8 q0, [r3], #16
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: bne .LBB18_10
; CHECK-NEXT: vstrb.8 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB18_11
; CHECK-NEXT: b .LBB18_2
; CHECK-NEXT: .LBB18_11: @ %exit
; CHECK-NEXT: .LBB18_12: @ %exit
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %b, 0

View File

@@ -168,24 +168,29 @@ body: |
; CHECK: bb.1:
; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
; CHECK: bb.2:
; CHECK: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; CHECK: successors: %bb.3(0x40000000), %bb.6(0x40000000)
; CHECK: liveins: $r0, $r1, $r2
; CHECK: $lr = t2WhileLoopStartLR killed renamable $r0, %bb.3, implicit-def dead $cpsr
; CHECK: t2B %bb.5, 14 /* CC::al */, $noreg
; CHECK: t2B %bb.6, 14 /* CC::al */, $noreg
; CHECK: bb.3:
; CHECK: successors: %bb.1(0x7c000000), %bb.4(0x04000000)
; CHECK: liveins: $lr, $r0, $r1, $r2
; CHECK: $lr = t2WhileLoopStartLR killed renamable $r0, %bb.1, implicit-def dead $cpsr
; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
; CHECK: t2CMPri renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK: t2Bcc %bb.1, 0 /* CC::eq */, $cpsr
; CHECK: bb.4:
; CHECK: successors: %bb.1(0x40000000), %bb.4(0x40000000)
; CHECK: liveins: $lr, $r1, $r2
; CHECK: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.4, implicit-def dead $cpsr
; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg
; CHECK: successors: %bb.5(0x80000000)
; CHECK: liveins: $r2, $r1, $r0
; CHECK: $lr = t2DoLoopStart renamable $r0
; CHECK: t2B %bb.5, 14 /* CC::al */, $noreg
; CHECK: bb.5:
; CHECK: successors: %bb.5(0x40000000), %bb.3(0x40000000)
; CHECK: successors: %bb.1(0x40000000), %bb.5(0x40000000)
; CHECK: liveins: $lr, $r1, $r2
; CHECK: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.5, implicit-def dead $cpsr
; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg
; CHECK: bb.6:
; CHECK: successors: %bb.6(0x40000000), %bb.3(0x40000000)
; CHECK: liveins: $lr, $r1, $r2
; CHECK: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.6, implicit-def dead $cpsr
; CHECK: t2B %bb.3, 14 /* CC::al */, $noreg
bb.0:
successors: %bb.2(0x80000000)