diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index 12e9383b6cd4..a5660432603f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -81,6 +81,7 @@ public:
   void constrainAsLaneMask(Incoming &In) override;
 
   bool lowerTemporalDivergence();
+  bool lowerTemporalDivergenceI1();
 };
 
 DivergenceLoweringHelper::DivergenceLoweringHelper(
@@ -228,6 +229,63 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() {
   return false;
 }
 
+bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
+  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
+  initializeLaneMaskRegisterAttributes(BoolS1);
+  MachineSSAUpdater SSAUpdater(*MF);
+
+  // In case of a use outside multiple nested cycles, or multiple uses, we
+  // only need to merge the lane mask across the largest relevant cycle.
+  SmallDenseMap<Register, std::pair<const MachineCycle *, Register>> LRCCache;
+  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
+    if (MRI->getType(Reg) != LLT::scalar(1))
+      continue;
+
+    auto [LRCCacheIter, RegNotCached] = LRCCache.try_emplace(Reg);
+    auto &CycleMergedMask = LRCCacheIter->getSecond();
+    const MachineCycle *&CachedLRC = CycleMergedMask.first;
+    if (RegNotCached || LRC->contains(CachedLRC)) {
+      CachedLRC = LRC;
+    }
+  }
+
+  for (auto &LRCCacheEntry : LRCCache) {
+    Register Reg = LRCCacheEntry.first;
+    auto &CycleMergedMask = LRCCacheEntry.getSecond();
+    const MachineCycle *Cycle = CycleMergedMask.first;
+
+    Register MergedMask = MRI->createVirtualRegister(BoolS1);
+    SSAUpdater.Initialize(MergedMask);
+
+    MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
+    SSAUpdater.AddAvailableValue(MBB, MergedMask);
+
+    for (auto Entry : Cycle->getEntries()) {
+      for (MachineBasicBlock *Pred : Entry->predecessors()) {
+        if (!Cycle->contains(Pred)) {
+          B.setInsertPt(*Pred, Pred->getFirstTerminator());
+          auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
+          SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
+        }
+      }
+    }
+
+    buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
+                        SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);
+
+    CycleMergedMask.second = MergedMask;
+  }
+
+  for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
+    if (MRI->getType(Reg) != LLT::scalar(1))
+      continue;
+
+    replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache.lookup(Reg).second);
+  }
+
+  return false;
+}
+
 } // End anonymous namespace.
 
 INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
@@ -267,6 +325,12 @@ bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
 
   // Non-i1 temporal divergence lowering.
   Changed |= Helper.lowerTemporalDivergence();
+  // This covers both uniform and divergent i1s. Lane masks are in SGPRs and
+  // need to be updated in each iteration.
+  Changed |= Helper.lowerTemporalDivergenceI1();
+  // Temporal divergence lowering of a divergent i1 phi used outside of the
+  // cycle could also be handled by lowerPhis, but we do it in
+  // lowerTemporalDivergenceI1 since in some cases lowerPhis does unnecessary
+  // lane mask merging.
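For reference, the merged lane mask that buildMergeLaneMasks materializes in the block defining Reg follows the S_ANDN2_B32 / S_AND_B32 / S_OR_B32 pattern that shows up throughout the test updates below: lanes active in exec take the current iteration's i1 value, while lanes that already left the cycle keep the value from the iteration in which they were last active. Below is a minimal standalone model of that update, assuming wave32 masks represented as uint32_t; the mergeLaneMasks helper and the sample mask values are illustrative only, not part of the patch.

#include <cassert>
#include <cstdint>

// Per-iteration lane-mask update: Merged = (Prev & ~Exec) | (Cur & Exec),
// i.e. the S_ANDN2_B32 + S_AND_B32 + S_OR_B32 sequence on a wave32 mask.
static uint32_t mergeLaneMasks(uint32_t Prev, uint32_t Cur, uint32_t Exec) {
  return (Prev & ~Exec) | (Cur & Exec);
}

int main() {
  uint32_t Merged = 0; // stands in for the IMPLICIT_DEF initial value
  // Iteration 1: lanes 0-3 active; the i1 value is set on lanes 1 and 3.
  Merged = mergeLaneMasks(Merged, /*Cur=*/0b1010, /*Exec=*/0b1111);
  // Iteration 2: lanes 2-3 have exited; the i1 value is set on lane 0.
  Merged = mergeLaneMasks(Merged, /*Cur=*/0b0001, /*Exec=*/0b0011);
  assert(Merged == 0b1001); // lanes 2-3 kept their iteration-1 bits
  return 0;
}

This is why the exited lanes read the correct value after the loop: each lane's bit in the merged mask is frozen by exec the moment that lane stops executing the cycle.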
Changed |= Helper.lowerPhis(); return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index 65c96a3db5bb..11acd451d98d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -104,20 +104,25 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) { ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s5, 1 ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: .LBB2_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6 ; GFX10-NEXT: s_xor_b32 s5, s5, 1 -; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: s_and_b32 s8, s5, 1 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0 +; GFX10-NEXT: s_cselect_b32 s8, exec_lo, 0 +; GFX10-NEXT: s_add_i32 s6, s6, 1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo +; GFX10-NEXT: s_and_b32 s8, exec_lo, s8 +; GFX10-NEXT: s_or_b32 s7, s7, s8 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cmp_lg_u32 s5, 0 -; GFX10-NEXT: s_cselect_b32 s4, exec_lo, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s7 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -147,29 +152,34 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa ; GFX10-NEXT: v_mov_b32_e32 v1, 0x3e8 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: s_branch .LBB3_2 ; GFX10-NEXT: .LBB3_1: ; %loop_body ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v8, s6 -; GFX10-NEXT: s_xor_b32 s4, s4, exec_lo +; GFX10-NEXT: s_mov_b32 s8, exec_lo ; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v0 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo +; GFX10-NEXT: s_and_b32 s8, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s7, s7, s8 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execz .LBB3_6 ; GFX10-NEXT: .LBB3_2: ; %loop_start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_cmpk_le_i32 s6, 0x3e8 -; GFX10-NEXT: s_mov_b32 s7, 1 +; GFX10-NEXT: s_mov_b32 s8, 1 ; GFX10-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10-NEXT: ; %bb.3: ; %else ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: flat_store_dword v[6:7], v1 ; GFX10-NEXT: .LBB3_4: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX10-NEXT: s_xor_b32 s7, s7, 1 -; GFX10-NEXT: s_cmp_lg_u32 s7, 0 +; GFX10-NEXT: s_xor_b32 s8, s8, 1 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.5: ; %if ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 @@ -177,7 +187,7 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa ; GFX10-NEXT: s_branch .LBB3_1 ; GFX10-NEXT: .LBB3_6: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s7 ; 
GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir index f244639ed976..c0fbdb541ab9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir @@ -201,20 +201,27 @@ body: | ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C1]](s1), %bb.0, %11(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C1]](s1), %bb.0, %11(s1), %bb.1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]] - ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32) + ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]] + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) + ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32) ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]] - ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32) + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]] + ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} @@ -222,7 +229,7 @@ body: | ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s1), [[C5]], [[C4]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[C5]], [[C4]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -285,17 +292,20 @@ body: | ; GFX10-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C]] ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %37(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C1]](s32), %bb.0 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %17(s32), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %44(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %36(s1), %bb.5 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C1]](s32), %bb.0 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %17(s32), %bb.5 ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000 - ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C2]] + ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI3]](s32), [[C2]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: G_BRCOND [[ICMP]](s1), %bb.4 ; GFX10-NEXT: G_BR %bb.2 @@ -303,9 +313,9 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C3]](s1), %bb.1 + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C3]](s1), %bb.1 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C4]] + ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI4]], [[C4]] ; GFX10-NEXT: G_BRCOND [[XOR]](s1), %bb.5 ; GFX10-NEXT: G_BR %bb.3 ; GFX10-NEXT: {{ $}} @@ -329,12 +339,16 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[COPY9]], [[C8]] - ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32) + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1) + ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32) ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C9]] - ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32) - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1) + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C9]] + ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit 
$exec ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} @@ -342,7 +356,7 @@ body: | ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) ; GFX10-NEXT: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR1]](s1), [[C11]], [[C10]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[C11]], [[C10]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index 687452f58bd6..be90b02a6dd6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -13,32 +13,30 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-LABEL: divergent_i1_phi_used_outside_loop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_andn2_b32 s5, s4, exec_lo -; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s6, s5, s6 -; GFX10-NEXT: ; implicit-def: $sgpr5 +; GFX10-NEXT: ; implicit-def: $sgpr6 +; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: .LBB0_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v1 -; GFX10-NEXT: s_xor_b32 s7, s6, -1 +; GFX10-NEXT: s_xor_b32 s8, s5, -1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 s8, s6, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, s7 -; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo -; GFX10-NEXT: s_and_b32 s6, exec_lo, s6 -; GFX10-NEXT: s_or_b32 s7, s8, s7 -; GFX10-NEXT: s_or_b32 s5, s5, s6 -; GFX10-NEXT: s_mov_b32 s6, s7 +; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo +; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 +; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo +; GFX10-NEXT: s_or_b32 s7, s7, s5 +; GFX10-NEXT: s_mov_b32 s5, s8 +; GFX10-NEXT: s_and_b32 s9, exec_lo, s7 +; GFX10-NEXT: s_or_b32 s6, s6, s9 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -141,6 +139,7 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: ; implicit-def: $sgpr6 +; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: .LBB2_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v1 @@ -148,9 +147,12 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo +; GFX10-NEXT: s_and_b32 s8, exec_lo, s5 ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo -; 
GFX10-NEXT: s_and_b32 s7, exec_lo, s5 -; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_or_b32 s7, s7, s8 +; GFX10-NEXT: s_and_b32 s8, exec_lo, s7 +; GFX10-NEXT: s_or_b32 s6, s6, s8 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %exit @@ -186,6 +188,7 @@ define void @divergent_i1_xor_used_outside_loop_twice(float %val, float %pre.con ; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: ; implicit-def: $sgpr6 +; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: .LBB3_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v1 @@ -193,9 +196,12 @@ define void @divergent_i1_xor_used_outside_loop_twice(float %val, float %pre.con ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo +; GFX10-NEXT: s_and_b32 s8, exec_lo, s5 ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, s5 -; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_or_b32 s7, s7, s8 +; GFX10-NEXT: s_and_b32 s8, exec_lo, s7 +; GFX10-NEXT: s_or_b32 s6, s6, s8 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %exit @@ -248,47 +254,51 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader ; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: ; implicit-def: $sgpr6 -; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: ; implicit-def: $sgpr8 +; GFX10-NEXT: ; implicit-def: $sgpr9 +; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: s_branch .LBB4_3 ; GFX10-NEXT: .LBB4_2: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_xor_b32 s9, s8, -1 -; GFX10-NEXT: s_and_b32 s10, exec_lo, s7 -; GFX10-NEXT: s_or_b32 s5, s10, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s10 +; GFX10-NEXT: s_xor_b32 s10, s9, -1 +; GFX10-NEXT: s_and_b32 s11, exec_lo, s8 +; GFX10-NEXT: s_or_b32 s5, s11, s5 +; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo +; GFX10-NEXT: s_and_b32 s10, exec_lo, s10 ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo -; GFX10-NEXT: s_and_b32 s9, exec_lo, s9 -; GFX10-NEXT: s_or_b32 s6, s6, s9 +; GFX10-NEXT: s_or_b32 s7, s7, s10 +; GFX10-NEXT: s_and_b32 s10, exec_lo, s7 +; GFX10-NEXT: s_or_b32 s6, s6, s10 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execz .LBB4_5 ; GFX10-NEXT: .LBB4_3: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo +; GFX10-NEXT: s_and_b32 s10, exec_lo, -1 ; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo -; GFX10-NEXT: s_and_b32 s9, exec_lo, -1 -; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo -; GFX10-NEXT: s_or_b32 s8, s8, s9 +; GFX10-NEXT: s_or_b32 s9, s9, s10 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6] -; GFX10-NEXT: s_or_b32 s7, s7, s9 +; GFX10-NEXT: s_or_b32 s8, s8, s10 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo ; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX10-NEXT: s_and_saveexec_b32 s9, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s10, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB4_2 ; GFX10-NEXT: ; %bb.4: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v5 ; 
GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0 +; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo +; GFX10-NEXT: s_and_b32 s11, exec_lo, 0 ; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo -; GFX10-NEXT: s_and_b32 s10, exec_lo, 0 -; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: s_and_b32 s11, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s8, s8, s10 -; GFX10-NEXT: s_or_b32 s7, s7, s11 +; GFX10-NEXT: s_and_b32 s12, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s9, s9, s11 +; GFX10-NEXT: s_or_b32 s8, s8, s12 ; GFX10-NEXT: s_branch .LBB4_2 ; GFX10-NEXT: .LBB4_5: ; %loop.exit.guard ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 @@ -345,52 +355,56 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10-LABEL: divergent_i1_icmp_used_outside_loop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: ; implicit-def: $sgpr6 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_branch .LBB5_2 ; GFX10-NEXT: .LBB5_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_and_b32 s4, exec_lo, s7 -; GFX10-NEXT: s_or_b32 s5, s4, s5 -; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo -; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s6, s4, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s7, exec_lo, s7 +; GFX10-NEXT: s_or_b32 s4, s7, s4 +; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo +; GFX10-NEXT: s_and_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB5_6 ; GFX10-NEXT: .LBB5_2: ; %cond.block.0 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: s_and_b32 s7, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s6, s6, s7 ; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB5_4 ; GFX10-NEXT: ; %bb.3: ; %if.block.0 ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], 2, v[4:5] -; GFX10-NEXT: v_add_co_u32 v8, s4, v2, v8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v3, v9, s4 +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo ; GFX10-NEXT: global_store_dword v[8:9], v4, off ; GFX10-NEXT: .LBB5_4: ; %loop.break.block ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v4 ; GFX10-NEXT: s_mov_b32 s7, -1 ; GFX10-NEXT: ; implicit-def: $vgpr5 -; GFX10-NEXT: s_and_saveexec_b32 s8, s4 +; GFX10-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB5_1 ; GFX10-NEXT: ; %bb.5: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4 -; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, 0 -; GFX10-NEXT: s_or_b32 s7, s4, s7 +; GFX10-NEXT: s_andn2_b32 s7, -1, exec_lo +; GFX10-NEXT: s_and_b32 s9, exec_lo, 0 +; GFX10-NEXT: s_or_b32 s7, s7, s9 ; GFX10-NEXT: s_branch .LBB5_1 ; GFX10-NEXT: .LBB5_6: ; %cond.block.1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_and_saveexec_b32 s4, s6 +; GFX10-NEXT: 
s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s5 ; GFX10-NEXT: s_cbranch_execz .LBB5_8 ; GFX10-NEXT: ; %bb.7: ; %if.block.1 ; GFX10-NEXT: global_store_dword v[6:7], v4, off @@ -455,48 +469,52 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa ; GFX10-LABEL: divergent_i1_freeze_used_outside_loop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_mov_b32 s3, -1 +; GFX10-NEXT: s_mov_b32 s4, -1 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: ; implicit-def: $sgpr3 ; GFX10-NEXT: ; implicit-def: $sgpr1 ; GFX10-NEXT: ; implicit-def: $sgpr2 ; GFX10-NEXT: s_branch .LBB6_2 ; GFX10-NEXT: .LBB6_1: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v5 ; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo +; GFX10-NEXT: s_and_b32 s5, exec_lo, s1 +; GFX10-NEXT: s_andn2_b32 s4, s4, exec_lo +; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, s2 -; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: s_or_b32 s1, s1, s4 +; GFX10-NEXT: s_and_b32 s6, exec_lo, s2 +; GFX10-NEXT: s_or_b32 s4, s4, s5 +; GFX10-NEXT: s_or_b32 s3, s3, s6 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB6_4 ; GFX10-NEXT: .LBB6_2: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 -; GFX10-NEXT: s_or_b32 s2, s2, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s3 +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-NEXT: s_and_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s1, s1, s5 +; GFX10-NEXT: s_and_saveexec_b32 s5, s4 ; GFX10-NEXT: s_cbranch_execz .LBB6_1 ; GFX10-NEXT: ; %bb.3: ; %is.eq.zero ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6] ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo ; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6 -; GFX10-NEXT: s_and_b32 s3, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: ; implicit-def: $sgpr3 +; GFX10-NEXT: s_and_b32 s4, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s1, s1, s4 +; GFX10-NEXT: ; implicit-def: $sgpr4 ; GFX10-NEXT: s_branch .LBB6_1 ; GFX10-NEXT: .LBB6_4: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s3 ; GFX10-NEXT: flat_store_dword v[3:4], v0 ; GFX10-NEXT: s_endpgm entry: @@ -532,36 +550,40 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: ; implicit-def: $sgpr1 -; GFX10-NEXT: ; implicit-def: $sgpr2 ; GFX10-NEXT: ; implicit-def: $sgpr3 +; GFX10-NEXT: ; implicit-def: $sgpr4 +; GFX10-NEXT: ; implicit-def: $sgpr2 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_branch .LBB7_2 ; GFX10-NEXT: .LBB7_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_and_b32 s4, 
exec_lo, s2 -; GFX10-NEXT: s_or_b32 s0, s4, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s5, exec_lo, s3 +; GFX10-NEXT: s_or_b32 s0, s5, s0 +; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo +; GFX10-NEXT: s_and_b32 s5, exec_lo, s4 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 -; GFX10-NEXT: s_or_b32 s1, s1, s4 +; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_and_b32 s5, exec_lo, s2 +; GFX10-NEXT: s_or_b32 s1, s1, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-NEXT: .LBB7_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX10-NEXT: s_andn2_b32 s4, s4, exec_lo +; GFX10-NEXT: s_and_b32 s5, exec_lo, -1 ; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 -; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_or_b32 s4, s4, s5 ; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] -; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB7_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1 @@ -569,14 +591,14 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6 -; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo +; GFX10-NEXT: s_andn2_b32 s4, s4, exec_lo ; GFX10-NEXT: global_load_dword v9, v[7:8], off -; GFX10-NEXT: s_and_b32 s5, exec_lo, 0 +; GFX10-NEXT: s_and_b32 s6, exec_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v6, v10 -; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s3, s3, s5 -; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo +; GFX10-NEXT: s_and_b32 s7, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s4, s4, s6 +; GFX10-NEXT: s_or_b32 s3, s3, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; GFX10-NEXT: global_store_dword v[7:8], v9, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir index 108286b148f5..e800cb2e24a7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir @@ -18,33 +18,31 @@ body: | ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C]] - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1) - ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, 
[[COPY4]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.0, %22(s1), %bb.1 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C1]](s32), %bb.0 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %11(s32), %bb.1 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, %29(s1), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %21(s1), %bb.1 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C1]](s32), %bb.0 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %11(s32), %bb.1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[COPY5]](s1) + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY6]], [[C2]] - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) - ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32) + ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY5]], [[C2]] + ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32) ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]] - ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32) - ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C3]] + ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} @@ -52,7 +50,7 @@ body: | ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C5]], [[C4]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[C5]], [[C4]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: 
(store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -242,23 +240,30 @@ body: | ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C]] ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, %22(s1), %bb.1 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C1]](s32), %bb.0 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %11(s32), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, %29(s1), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %21(s1), %bb.1 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C1]](s32), %bb.0 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %11(s32), %bb.1 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY5]], [[C2]] - ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32) + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) + ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32) ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]] - ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32) - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C3]] + ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} @@ -266,7 +271,7 @@ body: | ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s1), [[C5]], [[C4]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[C5]], [[C4]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -329,23 +334,30 @@ body: | ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C]] ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: 
[[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.0, %28(s1), %bb.1 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.1, [[C1]](s32), %bb.0 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %14(s32), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.0, %35(s1), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %27(s1), %bb.1 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.1, [[C1]](s32), %bb.0 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %14(s32), %bb.1 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY7]], [[C2]] - ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32) + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) + ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32) ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]] - ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32) - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C3]] + ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} @@ -353,11 +365,11 @@ body: | ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s1), [[C5]], [[C4]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[C5]], [[C4]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.000000e+00 ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 - ; GFX10-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s1), [[C7]], [[C6]] + ; GFX10-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[C7]], [[C6]] ; GFX10-NEXT: G_STORE [[SELECT1]](s32), [[MV1]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -437,12 +449,13 @@ body: | ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: G_BR %bb.3 ; 
GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %38(s1), %bb.8 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %45(s1), %bb.8 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec @@ -451,13 +464,15 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %60(s1), %bb.7 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %47(s1), %bb.7 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7 - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.1, %67(s1), %bb.7 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %54(s1), %bb.7 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %37(s1), %bb.7 + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) - ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32) + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) + ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI5]](s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C3]](s32) ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64) @@ -465,16 +480,16 @@ body: | ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C4]] ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1) ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1) + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc - ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = 
COPY [[S_OR_B32_1]](s1) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1) ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} @@ -482,16 +497,16 @@ body: | ; GFX10-NEXT: successors: %bb.7(0x80000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C6]] - ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI4]](s32), [[COPY]] - ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI5]], [[C6]] + ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI5]](s32), [[COPY]] + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s1) = G_CONSTANT i1 false - ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[C7]](s1) - ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[C7]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY14]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc ; GFX10-NEXT: G_BR %bb.7 ; GFX10-NEXT: {{ $}} @@ -508,16 +523,20 @@ body: | ; GFX10-NEXT: bb.7: ; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.3(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4 - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4 - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3 - ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4 + ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32) ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY17]], [[C9]] - ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[XOR]](s1) - ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), 
[[COPY16]](s1), [[PHI3]](s32) + ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY18]], [[C9]] + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) + ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY17]](s1), [[PHI4]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY19]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.8 ; GFX10-NEXT: {{ $}} @@ -525,9 +544,9 @@ body: | ; GFX10-NEXT: successors: %bb.2(0x80000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) - ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc ; GFX10-NEXT: G_BR %bb.2 bb.0: successors: %bb.1(0x40000000), %bb.2(0x40000000) @@ -636,38 +655,45 @@ body: | ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[PHI1]](s32), implicit $exec_lo + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %29(s1), %bb.6 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[PHI2]](s32), implicit $exec_lo ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[PHI1]] + ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[PHI2]] + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc + ; GFX10-NEXT: 
[[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.4(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32) + ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32) ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64) - ; GFX10-NEXT: G_STORE [[PHI1]](s32), [[PTR_ADD]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_STORE [[PHI2]](s32), [[PTR_ADD]](p1) :: (store (s32), addrspace 1) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) - ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI1]] + ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI2]] ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1) - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1) + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1) + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[COPY9]](s1) ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} @@ -675,21 +701,21 @@ body: | ; GFX10-NEXT: successors: %bb.6(0x80000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]] + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]] ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 false - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1) - ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.6: ; GFX10-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY7]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4 - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[COPY9]](s1), %bb.4, [[S_OR_B32_1]](s1), %bb.5 + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY 
[[PHI3]](s1) ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) - ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI]](s32) + ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY12]](s1), [[PHI1]](s32) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.7 ; GFX10-NEXT: {{ $}} @@ -697,13 +723,13 @@ body: | ; GFX10-NEXT: successors: %bb.8(0x40000000), %bb.9(0x40000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) - ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[S_OR_B32_]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.8 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.8: ; GFX10-NEXT: successors: %bb.9(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: G_STORE [[COPY6]](s32), [[MV1]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_STORE [[COPY7]](s32), [[MV1]](p1) :: (store (s32), addrspace 1) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.9: ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32) @@ -807,59 +833,66 @@ body: | ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C]](s1) ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C]](s1) + ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %41(s1), %bb.3 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %31(s1), %bb.3 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C1]](s32), %bb.0 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %12(s32), %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %48(s1), %bb.3 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %38(s1), %bb.3 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %29(s1), %bb.3 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C1]](s32), %bb.0 + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %12(s32), %bb.3 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1) + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) ; 
GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.3(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI3]](s32) + ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32) ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64) ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C3]] - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) - ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF - ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) + ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2 - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[PHI1]](s1), %bb.1, [[DEF1]](s1), %bb.2 - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1) - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[PHI1]](s1), %bb.1, [[DEF2]](s1), %bb.2 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1) ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) - ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY11]] - ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[FREEZE]](s1) + ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY12]] + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[FREEZE]](s1) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C4]] - ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI3]](s32), [[COPY]] - ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI2]](s32) - ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) 
= S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C4]] + ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI4]](s32), [[COPY]] + ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} @@ -867,7 +900,7 @@ body: | ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FREEZE]](s1), [[C6]], [[C5]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_2]](s1), [[C6]], [[C5]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32)) ; GFX10-NEXT: S_ENDPGM 0 bb.0: @@ -947,19 +980,22 @@ body: | ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF - ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %54(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %41(s1), %bb.5 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.0, %61(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF2]](s1), %bb.0, %48(s1), %bb.5 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.5 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]](s1) - ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI3]](s32) + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) + ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32) ; GFX10-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32) ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64) @@ -967,16 +1003,16 @@ body: | ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C2]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C3]](s1) + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C3]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_]](s1) + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1) + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1) ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.3 ; GFX10-NEXT: {{ $}} @@ -997,17 +1033,17 @@ body: | ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C6]] ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1) - ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C6]] + ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C6]] ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 - ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI3]](s32), [[C7]] - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) + ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI4]](s32), [[C7]] + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 false - ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C8]](s1) - ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C8]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc ; GFX10-NEXT: 
[[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} @@ -1018,13 +1054,17 @@ body: | ; GFX10-NEXT: bb.5: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3 - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3 - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1 - ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1) - ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1 + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[COPY16]](s1) ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) - ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY14]](s1), [[PHI2]](s32) + ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} @@ -1032,7 +1072,7 @@ body: | ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) - ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY15]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[S_OR_B32_4]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 bb.0: successors: %bb.1(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll index b1cc9ef0c573..e31077dd1986 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -406,32 
+406,36 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: ; implicit-def: $sgpr6 +; GFX10-NEXT: ; implicit-def: $sgpr7 +; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB5_2 ; GFX10-NEXT: .LBB5_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-NEXT: s_and_b32 s1, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s1, exec_lo, s6 ; GFX10-NEXT: s_or_b32 s4, s1, s4 +; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo +; GFX10-NEXT: s_and_b32 s2, exec_lo, s7 +; GFX10-NEXT: s_or_b32 s5, s1, s2 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB5_4 ; GFX10-NEXT: .LBB5_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 -; GFX10-NEXT: s_mov_b32 s7, exec_lo +; GFX10-NEXT: s_mov_b32 s8, exec_lo ; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 -; GFX10-NEXT: s_andn2_b32 s1, s6, exec_lo +; GFX10-NEXT: s_andn2_b32 s1, s7, exec_lo ; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-NEXT: s_and_b32 s6, exec_lo, s7 -; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, exec_lo -; GFX10-NEXT: s_or_b32 s6, s1, s6 +; GFX10-NEXT: s_and_b32 s7, exec_lo, s8 +; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo +; GFX10-NEXT: s_and_b32 s8, exec_lo, exec_lo +; GFX10-NEXT: s_or_b32 s7, s1, s7 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo -; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_or_b32 s6, s6, s8 ; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 @@ -446,13 +450,13 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo -; GFX10-NEXT: s_andn2_b32 s3, s6, exec_lo -; GFX10-NEXT: s_and_b32 s6, exec_lo, 0 -; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo +; GFX10-NEXT: s_andn2_b32 s3, s7, exec_lo +; GFX10-NEXT: s_and_b32 s7, exec_lo, 0 +; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: global_load_dword v8, v[6:7], off ; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_or_b32 s6, s3, s6 -; GFX10-NEXT: s_or_b32 s5, s5, s0 +; GFX10-NEXT: s_or_b32 s7, s3, s7 +; GFX10-NEXT: s_or_b32 s6, s6, s0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 @@ -460,7 +464,7 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; GFX10-NEXT: s_branch .LBB5_1 ; GFX10-NEXT: .LBB5_4: ; %loop.exit.guard ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s0, s6 +; GFX10-NEXT: s_and_saveexec_b32 s0, s5 ; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB5_6 ; GFX10-NEXT: ; %bb.5: ; %break.body @@ -520,70 +524,71 @@ define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2, ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: s_mov_b32 s0, exec_lo -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_and_b32 s3, s0, 1 -; GFX10-NEXT: s_xor_b32 s1, vcc_lo, s0 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 +; GFX10-NEXT: s_mov_b32 s1, 0 +; 
GFX10-NEXT: s_and_b32 s2, s0, 1 +; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-NEXT: ; implicit-def: $sgpr2 +; GFX10-NEXT: s_cselect_b32 s3, exec_lo, 0 ; GFX10-NEXT: s_branch .LBB6_2 ; GFX10-NEXT: .LBB6_1: ; %Flow2 ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_and_b32 s3, exec_lo, s4 -; GFX10-NEXT: s_mov_b32 s1, exec_lo -; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_and_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_mov_b32 s0, exec_lo +; GFX10-NEXT: s_or_b32 s1, s4, s1 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_cbranch_execz .LBB6_8 ; GFX10-NEXT: .LBB6_2: ; %irr.guard ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB6_6 Depth 2 -; GFX10-NEXT: s_andn2_b32 s4, s0, exec_lo -; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_mov_b32 s3, exec_lo -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_and_saveexec_b32 s4, s1 -; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, s0 +; GFX10-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX10-NEXT: ; %bb.3: ; %.loopexit ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: v_cmp_gt_i32_e64 s1, v5, v0 -; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: v_cmp_gt_i32_e64 s0, v5, v0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: s_xor_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo -; GFX10-NEXT: s_or_b32 s5, s1, s5 -; GFX10-NEXT: s_and_b32 s1, exec_lo, s1 -; GFX10-NEXT: s_xor_b32 s5, s5, s6 +; GFX10-NEXT: s_mov_b32 s7, exec_lo +; GFX10-NEXT: s_xor_b32 s6, vcc_lo, s6 ; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_or_b32 s6, s0, s6 +; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 +; GFX10-NEXT: s_xor_b32 s6, s6, s7 +; GFX10-NEXT: s_andn2_b32 s4, s4, exec_lo +; GFX10-NEXT: s_and_b32 s6, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s3, s3, s0 +; GFX10-NEXT: s_or_b32 s4, s4, s6 ; GFX10-NEXT: ; %bb.4: ; %Flow1 ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_mov_b32 s4, exec_lo -; GFX10-NEXT: s_and_saveexec_b32 s5, s3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_andn2_b32 s0, s2, exec_lo +; GFX10-NEXT: s_and_b32 s2, exec_lo, s3 +; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_or_b32 s2, s0, s2 +; GFX10-NEXT: s_and_saveexec_b32 s6, s4 ; GFX10-NEXT: s_cbranch_execz .LBB6_1 ; GFX10-NEXT: ; %bb.5: ; %.preheader ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: v_cmp_le_i32_e64 s1, v4, v0 -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_cmp_le_i32_e64 s0, v4, v0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB6_6: ; %.inner_loop ; GFX10-NEXT: ; Parent Loop BB6_2 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: s_and_b32 s6, exec_lo, s1 -; GFX10-NEXT: s_or_b32 s3, s6, s3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: s_and_b32 s7, exec_lo, s0 +; GFX10-NEXT: s_or_b32 s4, s7, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB6_6 ; GFX10-NEXT: ; %bb.7: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: s_andn2_b32 s1, s4, exec_lo -; GFX10-NEXT: s_and_b32 s3, exec_lo, 0 -; GFX10-NEXT: s_or_b32 s4, s1, s3 +; GFX10-NEXT: s_or_b32 
exec_lo, exec_lo, s4 +; GFX10-NEXT: s_andn2_b32 s0, s5, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, 0 +; GFX10-NEXT: s_or_b32 s5, s0, s4 ; GFX10-NEXT: s_branch .LBB6_1 ; GFX10-NEXT: .LBB6_8: ; %.exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v3, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v3, s2 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog .entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir index bd7140de5d61..b76d421c1617 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir @@ -771,19 +771,22 @@ body: | ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF - ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %54(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %41(s1), %bb.5 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.0, %61(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF2]](s1), %bb.0, %48(s1), %bb.5 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.5 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]](s1) - ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI3]](s32) + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) + ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32) ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64) @@ -791,16 +794,16 @@ body: | ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C2]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C3]](s1) + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C3]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc + ; GFX10-NEXT: 
[[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_]](s1) + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1) + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1) ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.3 ; GFX10-NEXT: {{ $}} @@ -821,17 +824,17 @@ body: | ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C6]] ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1) - ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C6]] + ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C6]] ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 - ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI3]](s32), [[C7]] - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) + ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI4]](s32), [[C7]] + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 false - ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C8]](s1) - ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C8]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} @@ -842,13 +845,17 @@ body: | ; GFX10-NEXT: bb.5: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), 
%bb.3 - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3 - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1 - ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1) - ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1 + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[COPY16]](s1) ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) - ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY14]](s1), [[PHI2]](s32) + ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} @@ -856,7 +863,7 @@ body: | ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) - ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY15]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[S_OR_B32_4]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 bb.0: successors: %bb.1(0x80000000) @@ -965,14 +972,18 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %51(s1), %bb.5, %50(s1), %bb.8 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %39(s1), %bb.5, %38(s1), %bb.8 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %52(s1), %bb.5, %51(s1), %bb.8 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %45(s1), %bb.5, %44(s1), %bb.8 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[COPY9]](s1) ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1) - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[COPY10]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %40(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; 
GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1) + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[COPY11]](s1) ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY8]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} @@ -986,13 +997,13 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.8(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY10]](s1), %bb.1, %58(s1), %bb.7 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY11]](s1), %bb.1, %59(s1), %bb.7 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %23(s1), %bb.7, [[DEF]](s1), %bb.1 - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) - ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY12]](s1), %27(s32) - ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[COPY9]](s1) - ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) + ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), %27(s32) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY9]](s1) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.8, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} @@ -1008,24 +1019,24 @@ body: | ; GFX10-NEXT: successors: %bb.1(0x80000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY5]](s32), [[COPY]] - ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[ICMP]] ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[FREEZE]], [[C4]] ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP2]], [[XOR1]] ; GFX10-NEXT: [[XOR2:%[0-9]+]]:_(s1) = G_XOR [[OR]], [[C4]] - ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[XOR2]](s1) - ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %46(s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 %52(s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[XOR2]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %46(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 %53(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: 
[[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc ; GFX10-NEXT: G_BR %bb.1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.6: ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY9]](s1), [[COPY3]], [[COPY2]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[COPY3]], [[COPY2]] ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32) ; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 @@ -1035,32 +1046,29 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s32) ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false - ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1) ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc ; GFX10-NEXT: G_BR %bb.3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.8: ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY7]](s1), %bb.0, [[COPY14]](s1), %bb.3 - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.0, [[COPY13]](s1), %bb.3 - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI1]](s1), %bb.3 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY7]](s1), %bb.0, [[COPY15]](s1), %bb.3 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.0, [[COPY14]](s1), %bb.3 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.3 ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3, [[C1]](s32), %bb.0 - ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI5]](s1) - ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) - ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[COPY19]](s1) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) ; GFX10-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY21]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc - ; GFX10-NEXT: 
[[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_3]](s1) - ; GFX10-NEXT: [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[C7]](s1) - ; GFX10-NEXT: [[COPY24:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY23]](s1) - ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[COPY20]](s1) + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[COPY22]](s1) + ; GFX10-NEXT: [[COPY24:%[0-9]+]]:sreg_32(s1) = COPY [[C7]](s1) + ; GFX10-NEXT: [[COPY25:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY24]](s1) + ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY19]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.5 bb.0: successors: %bb.8(0x80000000) @@ -1173,6 +1181,7 @@ body: | ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY4]](s32), [[COPY1]] ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) ; GFX10-NEXT: G_BR %bb.7 ; GFX10-NEXT: {{ $}} @@ -1186,8 +1195,8 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %45(s1), %bb.6, %49(s1), %bb.7 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %34(s1), %bb.6, %33(s1), %bb.7 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %52(s1), %bb.6, %56(s1), %bb.7 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %41(s1), %bb.6, %40(s1), %bb.7 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI %12(s1), %bb.6, [[DEF]](s1), %bb.7 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]](s1) ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) @@ -1214,23 +1223,27 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY5]](s32), [[COPY]] + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[ICMP]] ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C2]](s1) + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C2]](s1) ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[FREEZE]], [[C2]] ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP2]], [[XOR]] ; GFX10-NEXT: [[INT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %26(s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF - ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %34(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), 
implicit-def $scc + ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc ; GFX10-NEXT: SI_LOOP [[INT2]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.5: ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT2]](s32) - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[COPY3]], [[COPY2]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_1]](s1), [[COPY3]], [[COPY2]] ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32) ; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 @@ -1240,29 +1253,31 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 false - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1) - ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %41(s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc - ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %48(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc + ; GFX10-NEXT: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.7: ; GFX10-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY6]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI1]](s1), %bb.2, [[DEF2]](s1), %bb.4 - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INT2]](s32), %bb.4, [[PHI6]](s32), %bb.2, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[C3]](s32), %bb.4, [[INT]](s32), %bb.2, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI4]](s1) - ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY6]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, [[PHI1]](s1), %bb.2, [[DEF3]](s1), %bb.4 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI6]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT2]](s32), %bb.4, [[PHI7]](s32), %bb.2, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI 
[[C3]](s32), %bb.4, [[INT]](s32), %bb.2, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI4]](s1) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1) - ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY14]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc - ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_3]](s1) - ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY13]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY15]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_4]](s1) + ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY14]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.1 bb.0: successors: %bb.7(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index 1caecb599ffe..0da25260e231 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -6,23 +6,27 @@ define void @temporal_divergent_i1_phi(float %val, ptr %addr) { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s6, 1 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s5, 1 +; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: .LBB0_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s5 -; GFX10-NEXT: s_mov_b32 s7, s6 -; GFX10-NEXT: s_add_i32 s5, s5, 1 -; GFX10-NEXT: s_xor_b32 s6, s6, 1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX10-NEXT: s_and_b32 s8, s5, 1 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: s_cselect_b32 s8, exec_lo, 0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0 +; GFX10-NEXT: s_xor_b32 s5, s5, 1 +; GFX10-NEXT: s_add_i32 s6, s6, 1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo +; GFX10-NEXT: s_and_b32 s8, exec_lo, s8 +; GFX10-NEXT: s_or_b32 s7, s7, s8 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cmp_lg_u32 s7, 0 -; GFX10-NEXT: s_cselect_b32 s4, exec_lo, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s7 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -51,20 +55,25 @@ define void @temporal_divergent_i1_non_phi(float %val, ptr %addr) { ; GFX10-NEXT: s_mov_b32 
s4, 0 ; GFX10-NEXT: s_mov_b32 s5, 1 ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: .LBB1_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6 ; GFX10-NEXT: s_xor_b32 s5, s5, 1 -; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: s_and_b32 s8, s5, 1 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0 +; GFX10-NEXT: s_cselect_b32 s8, exec_lo, 0 +; GFX10-NEXT: s_add_i32 s6, s6, 1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo +; GFX10-NEXT: s_and_b32 s8, exec_lo, s8 +; GFX10-NEXT: s_or_b32 s7, s7, s8 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cmp_lg_u32 s5, 0 -; GFX10-NEXT: s_cselect_b32 s4, exec_lo, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s7 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -94,13 +103,14 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: ; implicit-def: $sgpr10 ; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: s_branch .LBB2_3 ; GFX10-NEXT: .LBB2_1: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB2_3 Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v4, s6 ; GFX10-NEXT: v_mov_b32_e32 v5, s7 -; GFX10-NEXT: s_andn2_b32 s6, s9, exec_lo +; GFX10-NEXT: s_andn2_b32 s6, s10, exec_lo ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo @@ -108,7 +118,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-NEXT: s_add_i32 s4, s4, 1 ; GFX10-NEXT: global_load_dword v6, v[4:5], off ; GFX10-NEXT: s_and_b32 s7, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s9, s6, s7 +; GFX10-NEXT: s_or_b32 s10, s6, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v6 ; GFX10-NEXT: global_store_dword v[4:5], v6, off @@ -116,8 +126,11 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-NEXT: ; in Loop: Header=BB2_3 Depth=1 ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_cselect_b32 s5, exec_lo, 0 -; GFX10-NEXT: s_and_b32 s6, exec_lo, s9 +; GFX10-NEXT: s_and_b32 s6, exec_lo, s10 ; GFX10-NEXT: s_or_b32 s8, s6, s8 +; GFX10-NEXT: s_andn2_b32 s6, s9, exec_lo +; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s9, s6, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execz .LBB2_5 @@ -125,16 +138,16 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_ashr_i32 s5, s4, 31 ; GFX10-NEXT: s_lshl_b64 s[6:7], s[4:5], 2 -; GFX10-NEXT: s_add_u32 s10, s0, s6 -; GFX10-NEXT: s_addc_u32 s11, s1, s7 -; GFX10-NEXT: global_load_dword v4, v3, s[10:11] +; GFX10-NEXT: s_add_u32 s12, s0, s6 +; GFX10-NEXT: s_addc_u32 s13, s1, s7 +; GFX10-NEXT: global_load_dword v4, v3, s[12:13] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s5, v4 ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo -; GFX10-NEXT: s_and_b32 s10, exec_lo, exec_lo -; GFX10-NEXT: s_or_b32 s9, s9, s10 +; GFX10-NEXT: s_andn2_b32 
s10, s10, exec_lo +; GFX10-NEXT: s_and_b32 s11, exec_lo, exec_lo +; GFX10-NEXT: s_or_b32 s10, s10, s11 ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB2_3 Depth=1 @@ -143,7 +156,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-NEXT: s_branch .LBB2_2 ; GFX10-NEXT: .LBB2_5: ; %loop.exit.guard ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_and_saveexec_b32 s0, s5 +; GFX10-NEXT: s_and_saveexec_b32 s0, s9 ; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB2_7 ; GFX10-NEXT: ; %bb.6: ; %break.body @@ -191,29 +204,34 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n. ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB3_2 Depth 2 ; GFX10-NEXT: s_ashr_i32 s7, s6, 31 -; GFX10-NEXT: s_mov_b32 s9, s8 +; GFX10-NEXT: s_mov_b32 s4, s8 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: v_mov_b32_e32 v6, s10 ; GFX10-NEXT: v_mov_b32_e32 v7, s11 ; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: s_mov_b32 s11, 0 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo ; GFX10-NEXT: flat_load_dword v0, v[6:7] ; GFX10-NEXT: .LBB3_2: ; %InnerHeader ; GFX10-NEXT: ; Parent Loop BB3_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v6, s10 -; GFX10-NEXT: s_add_i32 s10, s10, 1 -; GFX10-NEXT: s_xor_b32 s9, s9, exec_lo +; GFX10-NEXT: v_cvt_f32_u32_e32 v6, s11 +; GFX10-NEXT: s_mov_b32 s12, exec_lo +; GFX10-NEXT: s_add_i32 s11, s11, 1 +; GFX10-NEXT: s_xor_b32 s4, s4, s12 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s10, vcc_lo, s10 +; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo +; GFX10-NEXT: s_and_b32 s12, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s9, s9, s12 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s10 ; GFX10-NEXT: s_cbranch_execnz .LBB3_2 ; GFX10-NEXT: ; %bb.3: ; %UseInst ; GFX10-NEXT: ; in Loop: Header=BB3_1 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s10 ; GFX10-NEXT: v_mov_b32_e32 v6, s6 ; GFX10-NEXT: v_mov_b32_e32 v7, s7 ; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, s6, v1 @@ -277,29 +295,34 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n. 
 ; GFX10-NEXT: ; =>This Loop Header: Depth=1
 ; GFX10-NEXT: ; Child Loop BB4_2 Depth 2
 ; GFX10-NEXT: s_ashr_i32 s7, s6, 31
-; GFX10-NEXT: s_mov_b32 s9, s8
+; GFX10-NEXT: s_mov_b32 s4, s8
 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: ; implicit-def: $sgpr9
 ; GFX10-NEXT: v_mov_b32_e32 v6, s10
 ; GFX10-NEXT: v_mov_b32_e32 v7, s11
 ; GFX10-NEXT: s_mov_b32 s10, 0
+; GFX10-NEXT: s_mov_b32 s11, 0
 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo
 ; GFX10-NEXT: flat_load_dword v0, v[6:7]
 ; GFX10-NEXT: .LBB4_2: ; %InnerHeader
 ; GFX10-NEXT: ; Parent Loop BB4_1 Depth=1
 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX10-NEXT: v_cvt_f32_u32_e32 v6, s10
-; GFX10-NEXT: s_add_i32 s10, s10, 1
-; GFX10-NEXT: s_xor_b32 s9, s9, exec_lo
+; GFX10-NEXT: v_cvt_f32_u32_e32 v6, s11
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: s_add_i32 s11, s11, 1
+; GFX10-NEXT: s_xor_b32 s4, s4, s12
 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_or_b32 s10, vcc_lo, s10
+; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo
+; GFX10-NEXT: s_and_b32 s12, exec_lo, s4
+; GFX10-NEXT: s_or_b32 s9, s9, s12
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s10
 ; GFX10-NEXT: s_cbranch_execnz .LBB4_2
 ; GFX10-NEXT: ; %bb.3: ; %UseInst
 ; GFX10-NEXT: ; in Loop: Header=BB4_1 Depth=1
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s10
 ; GFX10-NEXT: v_mov_b32_e32 v6, s6
 ; GFX10-NEXT: v_mov_b32_e32 v7, s7
 ; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, s6, v1
@@ -359,33 +382,38 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i
 ; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0
 ; GFX10-NEXT: s_mov_b32 s5, 0
 ; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: ; implicit-def: $sgpr9
 ; GFX10-NEXT: .LBB5_1: ; %OuterHeader
 ; GFX10-NEXT: ; =>This Loop Header: Depth=1
 ; GFX10-NEXT: ; Child Loop BB5_2 Depth 2
 ; GFX10-NEXT: s_ashr_i32 s7, s6, 31
-; GFX10-NEXT: s_mov_b32 s9, s8
+; GFX10-NEXT: s_mov_b32 s4, s8
 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2
-; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: v_mov_b32_e32 v8, s10
 ; GFX10-NEXT: v_mov_b32_e32 v9, s11
 ; GFX10-NEXT: s_mov_b32 s10, 0
+; GFX10-NEXT: s_mov_b32 s11, 0
 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo
 ; GFX10-NEXT: flat_load_dword v0, v[8:9]
 ; GFX10-NEXT: .LBB5_2: ; %InnerHeader
 ; GFX10-NEXT: ; Parent Loop BB5_1 Depth=1
 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX10-NEXT: v_cvt_f32_u32_e32 v8, s10
-; GFX10-NEXT: s_add_i32 s10, s10, 1
-; GFX10-NEXT: s_xor_b32 s9, s9, exec_lo
+; GFX10-NEXT: v_cvt_f32_u32_e32 v8, s11
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: s_add_i32 s11, s11, 1
+; GFX10-NEXT: s_xor_b32 s4, s4, s12
 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v0
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_or_b32 s10, vcc_lo, s10
+; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo
+; GFX10-NEXT: s_and_b32 s12, exec_lo, s4
+; GFX10-NEXT: s_or_b32 s9, s9, s12
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s10
 ; GFX10-NEXT: s_cbranch_execnz .LBB5_2
 ; GFX10-NEXT: ; %bb.3: ; %UseInst
 ; GFX10-NEXT: ; in Loop: Header=BB5_1 Depth=1
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s10
 ; GFX10-NEXT: v_mov_b32_e32 v9, s7
 ; GFX10-NEXT: v_mov_b32_e32 v8, s6
 ; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, s6, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
index d50e32906c7a..30c1c9f51c62 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
@@ -17,20 +17,27 @@ body: |
   ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
   ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{ $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{ $}}
-  ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C]](s32), %bb.0
-  ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1
-  ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C1]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.1
+  ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C]](s32), %bb.0
+  ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1
+  ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C1]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+  ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
-  ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
+  ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
   ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
-  ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+  ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
+  ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY3]](s1), implicit-def $scc
+  ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT: G_BR %bb.2
   ; GFX10-NEXT: {{ $}}
@@ -38,7 +45,7 @@ body: |
   ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32)
   ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI2]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[C5]], [[C4]]
   ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT: SI_RETURN
   bb.0:
@@ -93,20 +100,27 @@ body: |
   ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
   ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{ $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{ $}}
-  ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C]](s32), %bb.0
-  ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1
-  ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C1]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.1
+  ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C]](s32), %bb.0
+  ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1
+  ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C1]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
-  ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
+  ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
   ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
-  ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+  ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
+  ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
+  ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT: G_BR %bb.2
   ; GFX10-NEXT: {{ $}}
@@ -114,7 +128,7 @@ body: |
   ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32)
   ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[C5]], [[C4]]
   ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT: SI_RETURN
   bb.0:
@@ -176,15 +190,18 @@ body: |
   ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{ $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT: successors: %bb.3(0x50000000), %bb.5(0x30000000)
   ; GFX10-NEXT: {{ $}}
-  ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %40(s1), %bb.5
-  ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0
-  ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5
+  ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %47(s1), %bb.5
+  ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %37(s1), %bb.5
+  ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0
+  ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5
   ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
-  ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
+  ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI3]](s32)
   ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32)
   ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
@@ -192,11 +209,11 @@ body: |
   ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C2]]
   ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
+  ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+  ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
   ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-  ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT: G_BRCOND [[ICMP]](s1), %bb.3
   ; GFX10-NEXT: G_BR %bb.5
   ; GFX10-NEXT: {{ $}}
@@ -217,12 +234,12 @@ body: |
   ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C6]]
   ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
-  ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C6]]
-  ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[COPY2]]
-  ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
+  ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C6]]
+  ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI3]](s32), [[COPY2]]
+  ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
   ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
-  ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
   ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT: G_BR %bb.5
   ; GFX10-NEXT: {{ $}}
@@ -233,11 +250,15 @@ body: |
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{ $}}
-  ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.3
-  ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[C7]](s1), %bb.3, [[C3]](s1), %bb.1
-  ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
-  ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.3
+  ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[C7]](s1), %bb.3, [[C3]](s1), %bb.1
+  ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1)
+  ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
+  ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY12]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT: G_BR %bb.6
   ; GFX10-NEXT: {{ $}}
@@ -245,7 +266,7 @@ body: |
   ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{ $}}
   ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32)
-  ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[S_OR_B32_2]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT: G_BR %bb.2
   bb.0:
     successors: %bb.1(0x80000000)
@@ -357,23 +378,30 @@ body: |
   ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[MV]], [[SHL]](s64)
   ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32))
   ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
   ; GFX10-NEXT: {{ $}}
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT: successors: %bb.3(0x04000000), %bb.2(0x7c000000)
   ; GFX10-NEXT: {{ $}}
-  ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.1, %36(s1), %bb.2
-  ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C3]](s32), %bb.1, %22(s32), %bb.2
-  ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %24(s32), %bb.2, [[C3]](s32), %bb.1
+  ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.1, %43(s1), %bb.2
+  ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.1, %35(s1), %bb.2
+  ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C3]](s32), %bb.1, %22(s32), %bb.2
+  ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI %24(s32), %bb.2, [[C3]](s32), %bb.1
   ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
+  ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY7]], [[C4]]
-  ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI4]](s32)
+  ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI5]](s32)
   ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C5]]
+  ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI5]], [[C5]]
   ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[LOAD]]
-  ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI3]](s32)
-  ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI4]](s32)
+  ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+  ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
   ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT: G_BR %bb.3
   ; GFX10-NEXT: {{ $}}
@@ -382,7 +410,7 @@ body: |
   ; GFX10-NEXT: {{ $}}
   ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32)
   ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[MV1]], [[SEXT]](s64)
-  ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1)
+  ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[S_OR_B32_]](s1)
   ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT: G_STORE [[ZEXT]](s32), [[PTR_ADD1]](p0) :: (store (s8))
   ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C6]]
@@ -487,23 +515,30 @@ body: |
   ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[MV]], [[SHL]](s64)
   ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32))
   ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
   ; GFX10-NEXT: {{ $}}
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT: successors: %bb.3(0x04000000), %bb.2(0x7c000000)
   ; GFX10-NEXT: {{ $}}
-  ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.1, %36(s1), %bb.2
-  ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C3]](s32), %bb.1, %22(s32), %bb.2
-  ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %24(s32), %bb.2, [[C3]](s32), %bb.1
+  ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.1, %43(s1), %bb.2
+  ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.1, %35(s1), %bb.2
+  ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C3]](s32), %bb.1, %22(s32), %bb.2
+  ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI %24(s32), %bb.2, [[C3]](s32), %bb.1
   ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
+  ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY7]], [[C4]]
-  ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI4]](s32)
+  ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI5]](s32)
   ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C5]]
+  ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI5]], [[C5]]
   ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[LOAD]]
-  ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI3]](s32)
-  ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI4]](s32)
+  ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+  ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
   ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT: G_BR %bb.3
   ; GFX10-NEXT: {{ $}}
@@ -512,7 +547,7 @@ body: |
   ; GFX10-NEXT: {{ $}}
   ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32)
   ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[MV1]], [[SEXT]](s64)
-  ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1)
+  ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[S_OR_B32_]](s1)
   ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT: G_STORE [[ZEXT]](s32), [[PTR_ADD1]](p0) :: (store (s8))
   ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C6]]
@@ -608,13 +643,15 @@ body: |
   ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY]](s32), [[C]]
   ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{ $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT: successors: %bb.2(0x80000000)
   ; GFX10-NEXT: {{ $}}
-  ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C1]](s32), %bb.0
-  ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %17(s32), %bb.3
-  ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.3
+  ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C1]](s32), %bb.0
+  ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %17(s32), %bb.3
+  ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
   ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
   ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[MV]], [[SHL]](s64)
@@ -625,18 +662,24 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT: successors: %bb.3(0x04000000), %bb.2(0x7c000000)
   ; GFX10-NEXT: {{ $}}
-  ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.1, %40(s1), %bb.2
-  ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C3]](s32), %bb.1, %25(s32), %bb.2
-  ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %27(s32), %bb.2, [[C3]](s32), %bb.1
-  ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
+  ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.1, %48(s1), %bb.2
+  ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[PHI]](s1), %bb.1, %39(s1), %bb.2
+  ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[C3]](s32), %bb.1, %25(s32), %bb.2
+  ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI %27(s32), %bb.2, [[C3]](s32), %bb.1
+  ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+  ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1)
   ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY9]], [[C4]]
-  ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI4]](s32)
+  ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI6]](s32)
   ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C5]]
+  ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI6]], [[C5]]
   ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[LOAD]]
-  ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI3]](s32)
-  ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI5]](s32)
+  ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+  ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
   ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT: G_BR %bb.3
   ; GFX10-NEXT: {{ $}}
@@ -645,18 +688,18 @@ body: |
   ; GFX10-NEXT: {{ $}}
   ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32)
   ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[MV1]], [[SEXT]](s64)
-  ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1)
+  ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[S_OR_B32_]](s1)
   ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT: G_STORE [[ZEXT]](s32), [[PTR_ADD1]](p0) :: (store (s8))
-  ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C6]]
-  ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[COPY1]]
-  ; GFX10-NEXT: [[INT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ICMP]](s1), [[PHI]](s32)
+  ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C6]]
+  ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[COPY1]]
+  ; GFX10-NEXT: [[INT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ICMP]](s1), [[PHI1]](s32)
   ; GFX10-NEXT: SI_LOOP [[INT1]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT: G_BR %bb.4
   ; GFX10-NEXT: {{ $}}
   ; GFX10-NEXT: bb.4:
   ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s32)
-  ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1)
+  ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[S_OR_B32_]](s1)
   ; GFX10-NEXT: G_STORE [[ZEXT1]](s32), [[MV2]](p0) :: (store (s8))
   ; GFX10-NEXT: SI_RETURN
   bb.0: