Files
clang-p2996/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
Christudasan Devadasan 7a98f084c4 [AMDGPU][SILowerSGPRSpills] Spill SGPRs to virtual VGPRs
Currently, the custom SGPR spill lowering pass spills
SGPRs into physical VGPR lanes and the remaining VGPRs
are used by regalloc for vector regclass allocation.
This imposes many restrictions that we ended up with
unsuccessful SGPR spilling when there won't be enough
VGPRs and we are forced to spill the leftover into
memory during PEI. The custom spill handling during PEI
has many edge cases and often breaks the compiler time
to time.

This patch implements spilling SGPRs into virtual VGPR
lanes. Since we now split the register allocation for
SGPRs and VGPRs, the virtual registers introduced for
the spill lanes would get allocated automatically in
the subsequent regalloc invocation for VGPRs.

Spill to virtual registers will always be successful,
even in the high-pressure situations, and hence it avoids
most of the edge cases during PEI. We are now left with
only the custom SGPR spills during PEI for special registers
like the frame pointer which is an unproblematic case.

Differential Revision: https://reviews.llvm.org/D124196
2023-07-07 23:14:32 +05:30

1430 lines
64 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Disabled endcf collapse at -O0.
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -O0 -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN-O0 %s
; Note: Breaking large PHIs is disabled to branches from being eliminated (in scc_liveness)
define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: simple_nested_if:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB0_3
; GCN-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_and_b64 exec, exec, vcc
; GCN-NEXT: s_cbranch_execz .LBB0_3
; GCN-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: v_mov_b32_e32 v2, 1
; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:4
; GCN-NEXT: .LBB0_3: ; %bb.outer.end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 3
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: simple_nested_if:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: ; implicit-def: $vgpr1
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB0_4
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0
; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB0_3
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
; GCN-O0-NEXT: s_mov_b32 s2, 2
; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT: .LBB0_3: ; %Flow
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
; GCN-O0-NEXT: s_mov_b32 m0, -1
; GCN-O0-NEXT: ds_write_b32 v1, v2
; GCN-O0-NEXT: ; kill: killed $vgpr0
; GCN-O0-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = icmp ugt i32 %tmp, 1
br i1 %tmp1, label %bb.outer.then, label %bb.outer.end
bb.outer.then: ; preds = %bb
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp4, align 4
%tmp5 = icmp eq i32 %tmp, 2
br i1 %tmp5, label %bb.outer.end, label %bb.inner.then
bb.inner.then: ; preds = %bb.outer.then
%tmp7 = add i32 %tmp, 1
%tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
store i32 1, ptr addrspace(1) %tmp9, align 4
br label %bb.outer.end
bb.outer.end: ; preds = %bb.outer.then, %bb.inner.then, %bb
store i32 3, ptr addrspace(3) null
ret void
}
define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: uncollapsable_nested_if:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB1_4
; GCN-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_cbranch_execz .LBB1_3
; GCN-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:4
; GCN-NEXT: .LBB1_3: ; %bb.inner.end
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
; GCN-NEXT: .LBB1_4: ; %Flow
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 3
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: uncollapsable_nested_if:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: ; implicit-def: $vgpr1
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB1_3
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0
; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB1_4
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
; GCN-O0-NEXT: s_mov_b32 s2, 2
; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB1_4
; GCN-O0-NEXT: .LBB1_3: ; %Flow
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_branch .LBB1_5
; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s2, v0, 4
; GCN-O0-NEXT: v_readlane_b32 s3, v0, 5
; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB1_3
; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
; GCN-O0-NEXT: s_mov_b32 m0, -1
; GCN-O0-NEXT: ds_write_b32 v1, v2
; GCN-O0-NEXT: ; kill: killed $vgpr0
; GCN-O0-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = icmp ugt i32 %tmp, 1
br i1 %tmp1, label %bb.outer.then, label %bb.outer.end
bb.outer.then: ; preds = %bb
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp4, align 4
%tmp5 = icmp eq i32 %tmp, 2
br i1 %tmp5, label %bb.inner.end, label %bb.inner.then
bb.inner.then: ; preds = %bb.outer.then
%tmp7 = add i32 %tmp, 1
%tmp8 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
store i32 1, ptr addrspace(1) %tmp8, align 4
br label %bb.inner.end
bb.inner.end: ; preds = %bb.inner.then, %bb.outer.then
%tmp9 = add i32 %tmp, 2
%tmp10 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp9
store i32 2, ptr addrspace(1) %tmp10, align 4
br label %bb.outer.end
bb.outer.end: ; preds = %bb.inner.then, %bb
store i32 3, ptr addrspace(3) null
ret void
}
define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: nested_if_if_else:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GCN-NEXT: s_cbranch_execz .LBB2_5
; GCN-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v1
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB2_3
; GCN-NEXT: ; %bb.2: ; %bb.else
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:8
; GCN-NEXT: ; implicit-def: $vgpr3_vgpr4
; GCN-NEXT: .LBB2_3: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB2_5
; GCN-NEXT: ; %bb.4: ; %bb.then
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:4
; GCN-NEXT: .LBB2_5: ; %bb.outer.end
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 3
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v2, v0
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: nested_if_if_else:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: ; implicit-def: $vgpr1
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v4, 0
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s4, 2
; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s4
; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_6
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4
; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_2
; GCN-O0-NEXT: s_branch .LBB2_4
; GCN-O0-NEXT: .LBB2_2: ; %Flow
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_5
; GCN-O0-NEXT: ; %bb.3: ; %bb.then
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
; GCN-O0-NEXT: s_mov_b32 s2, 2
; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB2_5
; GCN-O0-NEXT: .LBB2_4: ; %bb.else
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB2_2
; GCN-O0-NEXT: .LBB2_5: ; %Flow1
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
; GCN-O0-NEXT: s_mov_b32 m0, -1
; GCN-O0-NEXT: ds_write_b32 v1, v2
; GCN-O0-NEXT: ; kill: killed $vgpr0
; GCN-O0-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp1, align 4
%tmp2 = icmp ugt i32 %tmp, 1
br i1 %tmp2, label %bb.outer.then, label %bb.outer.end
bb.outer.then: ; preds = %bb
%tmp5 = icmp eq i32 %tmp, 2
br i1 %tmp5, label %bb.then, label %bb.else
bb.then: ; preds = %bb.outer.then
%tmp3 = add i32 %tmp, 1
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp3
store i32 1, ptr addrspace(1) %tmp4, align 4
br label %bb.outer.end
bb.else: ; preds = %bb.outer.then
%tmp7 = add i32 %tmp, 2
%tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
store i32 2, ptr addrspace(1) %tmp9, align 4
br label %bb.outer.end
bb.outer.end: ; preds = %bb, %bb.then, %bb.else
store i32 3, ptr addrspace(3) null
ret void
}
define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: nested_if_else_if:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 2, v0
; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB3_4
; GCN-NEXT: ; %bb.1: ; %bb.outer.else
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: v_mov_b32_e32 v3, 3
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:12
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT: s_cbranch_execz .LBB3_3
; GCN-NEXT: ; %bb.2: ; %bb.inner.then2
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s8, s10
; GCN-NEXT: s_mov_b32 s9, s10
; GCN-NEXT: v_mov_b32_e32 v0, 4
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 offset:16
; GCN-NEXT: .LBB3_3: ; %Flow
; GCN-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: .LBB3_4: ; %Flow2
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB3_8
; GCN-NEXT: ; %bb.5: ; %bb.outer.then
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, 1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:4
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_cbranch_execz .LBB3_7
; GCN-NEXT: ; %bb.6: ; %bb.inner.then
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
; GCN-NEXT: .LBB3_7: ; %Flow1
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: .LBB3_8: ; %bb.outer.end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 3
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: nested_if_else_if:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: ; implicit-def: $vgpr1
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: v_mov_b32_e32 v4, 0
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: s_mov_b32 s1, s0
; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s1
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v6, v4
; GCN-O0-NEXT: v_add_i32_e64 v5, s[2:3], s2, v2
; GCN-O0-NEXT: v_mov_b32_e32 v2, s1
; GCN-O0-NEXT: v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3]
; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v6, v2
; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_1
; GCN-O0-NEXT: s_branch .LBB3_4
; GCN-O0-NEXT: .LBB3_1: ; %Flow2
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_8
; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: v_mov_b32_e32 v2, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 offset:4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_7
; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then
; GCN-O0-NEXT: s_waitcnt expcnt(1)
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
; GCN-O0-NEXT: s_branch .LBB3_7
; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s4, s0
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 offset:12
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_6
; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2
; GCN-O0-NEXT: s_waitcnt expcnt(1)
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v0, 4
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:16
; GCN-O0-NEXT: .LBB3_6: ; %Flow
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_branch .LBB3_1
; GCN-O0-NEXT: .LBB3_7: ; %Flow1
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
; GCN-O0-NEXT: s_mov_b32 m0, -1
; GCN-O0-NEXT: ds_write_b32 v1, v2
; GCN-O0-NEXT: ; kill: killed $vgpr0
; GCN-O0-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp1, align 4
%cc1 = icmp ugt i32 %tmp, 1
br i1 %cc1, label %bb.outer.then, label %bb.outer.else
bb.outer.then:
%tmp2 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 1
store i32 1, ptr addrspace(1) %tmp2, align 4
%cc2 = icmp eq i32 %tmp, 2
br i1 %cc2, label %bb.inner.then, label %bb.outer.end
bb.inner.then:
%tmp3 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 2
store i32 2, ptr addrspace(1) %tmp3, align 4
br label %bb.outer.end
bb.outer.else:
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 3
store i32 3, ptr addrspace(1) %tmp4, align 4
%cc3 = icmp eq i32 %tmp, 0 ; avoid being optimized away through the domination
br i1 %cc3, label %bb.inner.then2, label %bb.outer.end
bb.inner.then2:
%tmp5 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 4
store i32 4, ptr addrspace(1) %tmp5, align 4
br label %bb.outer.end
bb.outer.end:
store i32 3, ptr addrspace(3) null
ret void
}
define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: s_endpgm_unsafe_barrier:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GCN-NEXT: s_cbranch_execz .LBB4_2
; GCN-NEXT: ; %bb.1: ; %bb.then
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v1, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: .LBB4_2: ; %bb.end
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_barrier
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: s_endpgm_unsafe_barrier:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: ; implicit-def: $vgpr1
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB4_2
; GCN-O0-NEXT: ; %bb.1: ; %bb.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: v_ashrrev_i32_e64 v2, 31, v0
; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v1, v2
; GCN-O0-NEXT: s_mov_b32 s4, 2
; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[0:1], s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT: .LBB4_2: ; %bb.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_barrier
; GCN-O0-NEXT: ; kill: killed $vgpr0
; GCN-O0-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = icmp ugt i32 %tmp, 1
br i1 %tmp1, label %bb.then, label %bb.end
bb.then: ; preds = %bb
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp4, align 4
br label %bb.end
bb.end: ; preds = %bb.then, %bb
call void @llvm.amdgcn.s.barrier()
ret void
}
define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-LABEL: scc_liveness:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_movk_i32 s4, 0x207
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GCN-NEXT: s_mov_b32 s8, 0
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GCN-NEXT: s_mov_b64 s[12:13], 0
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_branch .LBB5_3
; GCN-NEXT: .LBB5_1: ; %Flow
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN-NEXT: .LBB5_2: ; %bb10
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[14:15]
; GCN-NEXT: s_and_b64 s[6:7], exec, s[4:5]
; GCN-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13]
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_andn2_b64 exec, exec, s[12:13]
; GCN-NEXT: s_cbranch_execz .LBB5_7
; GCN-NEXT: .LBB5_3: ; %bb1
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_and_b64 s[10:11], exec, vcc
; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GCN-NEXT: s_cbranch_execnz .LBB5_3
; GCN-NEXT: ; %bb.4: ; %bb2
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_mov_b32 s10, s8
; GCN-NEXT: s_mov_b32 s11, s8
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: s_and_saveexec_b64 s[14:15], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB5_2
; GCN-NEXT: ; %bb.5: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[6:7]
; GCN-NEXT: s_cbranch_execz .LBB5_1
; GCN-NEXT: ; %bb.6: ; %bb8
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: s_branch .LBB5_1
; GCN-NEXT: .LBB5_7: ; %bb12
; GCN-NEXT: s_or_b64 exec, exec, s[12:13]
; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: scc_liveness:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: ; implicit-def: $vgpr1
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: s_waitcnt expcnt(1)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[4:5], 0
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_writelane_b32 v0, s6, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1
; GCN-O0-NEXT: v_writelane_b32 v0, s4, 2
; GCN-O0-NEXT: v_writelane_b32 v0, s5, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: .LBB5_1: ; %bb1
; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_readlane_b32 s8, v0, 2
; GCN-O0-NEXT: v_readlane_b32 s9, v0, 3
; GCN-O0-NEXT: v_readlane_b32 s6, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s7, v0, 1
; GCN-O0-NEXT: v_writelane_b32 v0, s6, 4
; GCN-O0-NEXT: v_writelane_b32 v0, s7, 5
; GCN-O0-NEXT: s_mov_b32 s4, 0x207
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, s4
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GCN-O0-NEXT: v_writelane_b32 v0, s4, 6
; GCN-O0-NEXT: v_writelane_b32 v0, s5, 7
; GCN-O0-NEXT: v_writelane_b32 v0, s6, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v0, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v0, s7, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1
; GCN-O0-NEXT: ; %bb.2: ; %bb2
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 6
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 7
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_mov_b32 s6, 0
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v1, s6
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s6
; GCN-O0-NEXT: v_writelane_b32 v0, s4, 8
; GCN-O0-NEXT: v_writelane_b32 v0, s5, 9
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: s_mov_b32 s8, s4
; GCN-O0-NEXT: s_mov_b32 s9, s4
; GCN-O0-NEXT: s_mov_b32 s10, s4
; GCN-O0-NEXT: s_mov_b32 s11, s4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s8
; GCN-O0-NEXT: v_mov_b32_e32 v2, s9
; GCN-O0-NEXT: v_mov_b32_e32 v3, s10
; GCN-O0-NEXT: v_mov_b32_e32 v4, s11
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s4, 10
; GCN-O0-NEXT: v_writelane_b32 v0, s5, 11
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execz .LBB5_5
; GCN-O0-NEXT: ; %bb.3: ; %bb4
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s4
; GCN-O0-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v1, s4
; GCN-O0-NEXT: s_mov_b32 s8, s4
; GCN-O0-NEXT: s_mov_b32 s9, s4
; GCN-O0-NEXT: s_mov_b32 s10, s4
; GCN-O0-NEXT: s_mov_b32 s11, s4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s8
; GCN-O0-NEXT: v_mov_b32_e32 v2, s9
; GCN-O0-NEXT: v_mov_b32_e32 v3, s10
; GCN-O0-NEXT: v_mov_b32_e32 v4, s11
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s4, 12
; GCN-O0-NEXT: v_writelane_b32 v0, s5, 13
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execz .LBB5_6
; GCN-O0-NEXT: ; %bb.4: ; %bb8
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_mov_b32 s10, 0
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: ; implicit-def: $sgpr5
; GCN-O0-NEXT: ; implicit-def: $sgpr9
; GCN-O0-NEXT: ; implicit-def: $sgpr5
; GCN-O0-NEXT: ; implicit-def: $sgpr8
; GCN-O0-NEXT: ; implicit-def: $sgpr5
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b32 s5, s10
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_branch .LBB5_6
; GCN-O0-NEXT: .LBB5_5: ; %Flow2
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 10
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 11
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_branch .LBB5_7
; GCN-O0-NEXT: .LBB5_6: ; %Flow
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 12
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 13
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_branch .LBB5_5
; GCN-O0-NEXT: .LBB5_7: ; %bb10
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: s_waitcnt expcnt(3)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s6, v0, 8
; GCN-O0-NEXT: v_readlane_b32 s7, v0, 9
; GCN-O0-NEXT: s_mov_b64 s[4:5], -1
; GCN-O0-NEXT: v_writelane_b32 v0, s4, 14
; GCN-O0-NEXT: v_writelane_b32 v0, s5, 15
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s4, 16
; GCN-O0-NEXT: v_writelane_b32 v0, s5, 17
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execz .LBB5_9
; GCN-O0-NEXT: ; %bb.8: ; %Flow1
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_mov_b64 s[4:5], 0
; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v0, s4, 14
; GCN-O0-NEXT: v_writelane_b32 v0, s5, 15
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: .LBB5_9: ; %Flow3
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s8, v4, 16
; GCN-O0-NEXT: v_readlane_b32 s9, v4, 17
; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-O0-NEXT: v_readlane_b32 s6, v4, 4
; GCN-O0-NEXT: v_readlane_b32 s7, v4, 5
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 14
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 15
; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5]
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0
; GCN-O0-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v4, s8, 0
; GCN-O0-NEXT: v_writelane_b32 v4, s9, 1
; GCN-O0-NEXT: v_writelane_b32 v4, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v4, s7, 3
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v4, s6, 18
; GCN-O0-NEXT: v_writelane_b32 v4, s7, 19
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1
; GCN-O0-NEXT: ; %bb.10: ; %bb12
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: s_waitcnt expcnt(3)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 18
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 19
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: ; %bb.11: ; %bb12
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v5, v4
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v5, v3
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v5, v2
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: ; kill: killed $vgpr0
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
bb:
br label %bb1
bb1: ; preds = %Flow1, %bb1, %bb
%tmp = icmp slt i32 %arg, 519
br i1 %tmp, label %bb2, label %bb1
bb2: ; preds = %bb1
%tmp3 = icmp eq i32 %arg, 0
br i1 %tmp3, label %bb4, label %bb10
bb4: ; preds = %bb2
%tmp6 = load float, ptr addrspace(5) undef
%tmp7 = fcmp olt float %tmp6, 0.0
br i1 %tmp7, label %bb8, label %Flow
bb8: ; preds = %bb4
%tmp9 = insertelement <4 x float> undef, float 0.0, i32 1
br label %Flow
Flow: ; preds = %bb8, %bb4
%tmp8 = phi <4 x float> [ %tmp9, %bb8 ], [ zeroinitializer, %bb4 ]
br label %bb10
bb10: ; preds = %Flow, %bb2
%tmp11 = phi <4 x float> [ zeroinitializer, %bb2 ], [ %tmp8, %Flow ]
br i1 %tmp3, label %bb12, label %Flow1
Flow1: ; preds = %bb10
br label %bb1
bb12: ; preds = %bb10
store volatile <4 x float> %tmp11, ptr addrspace(5) undef, align 16
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare void @llvm.amdgcn.s.barrier() #1
attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind convergent }
attributes #2 = { nounwind }