[MachineLICM] Correctly Apply Register Masks (#95746)
Fix regression introduced in d4b8b72
This commit is contained in:
committed by
GitHub
parent
c2d9f253e5
commit
770393bb99
@@ -426,38 +426,29 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
|
||||
static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
|
||||
BitVector &RUs,
|
||||
const uint32_t *Mask) {
|
||||
// Iterate over the RegMask raw to avoid constructing a BitVector, which is
|
||||
// expensive as it implies dynamically allocating memory.
|
||||
//
|
||||
// We also work backwards.
|
||||
BitVector ClobberedRUs(TRI.getNumRegUnits(), true);
|
||||
const unsigned NumRegs = TRI.getNumRegs();
|
||||
const unsigned MaskWords = (NumRegs + 31) / 32;
|
||||
for (unsigned K = 0; K < MaskWords; ++K) {
|
||||
// We want to set the bits that aren't in RegMask, so flip it.
|
||||
uint32_t Word = ~Mask[K];
|
||||
|
||||
// Iterate all set bits, starting from the right.
|
||||
while (Word) {
|
||||
const unsigned SetBitIdx = countr_zero(Word);
|
||||
|
||||
// The bits are numbered from the LSB in each word.
|
||||
const unsigned PhysReg = (K * 32) + SetBitIdx;
|
||||
|
||||
// Clear the bit at SetBitIdx. Doing it this way appears to generate fewer
|
||||
// instructions on x86. This works because negating a number will flip all
|
||||
// the bits after SetBitIdx. So (Word & -Word) == (1 << SetBitIdx), but
|
||||
// faster.
|
||||
Word ^= Word & -Word;
|
||||
const uint32_t Word = Mask[K];
|
||||
if (!Word)
|
||||
continue;
|
||||
|
||||
for (unsigned Bit = 0; Bit < 32; ++Bit) {
|
||||
const unsigned PhysReg = (K * 32) + Bit;
|
||||
if (PhysReg == NumRegs)
|
||||
return;
|
||||
break;
|
||||
|
||||
if (PhysReg) {
|
||||
// Check if we have a valid PhysReg that is set in the mask.
|
||||
// FIXME: We shouldn't have to check for PhysReg.
|
||||
if (PhysReg && ((Word >> Bit) & 1)) {
|
||||
for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
|
||||
RUs.set(*RUI);
|
||||
ClobberedRUs.reset(*RUI);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
RUs |= ClobberedRUs;
|
||||
}
|
||||
|
||||
/// Examine the instruction for potential LICM candidate. Also
|
||||
|
||||
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
|
||||
; GCN-NEXT: v_writelane_b32 v40, s62, 30
|
||||
; GCN-NEXT: v_writelane_b32 v40, s63, 31
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], exec
|
||||
; GCN-NEXT: s_movk_i32 s4, 0x7b
|
||||
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s8, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s9, v1
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
|
||||
; GCN-NEXT: s_movk_i32 s4, 0x7b
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
|
||||
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[10:11]
|
||||
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s62, 30
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s63, 31
|
||||
; GISEL-NEXT: s_mov_b64 s[6:7], exec
|
||||
; GISEL-NEXT: s_movk_i32 s4, 0x7b
|
||||
; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s8, v0
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s9, v1
|
||||
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
|
||||
; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
|
||||
; GISEL-NEXT: s_movk_i32 s4, 0x7b
|
||||
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
|
||||
; GISEL-NEXT: ; implicit-def: $vgpr0
|
||||
; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]
|
||||
|
||||
Reference in New Issue
Block a user