Files
clang-p2996/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
Vitaly Buka a496c8be6e Revert "[CodeGen]Allow targets to use target specific COPY instructions for live range splitting"
And dependent commits.

Details in D150388.

This reverts commit 825b7f0ca5.
This reverts commit 7a98f084c4.
This reverts commit b4a62b1fa5.
This reverts commit b7836d8562.

No conflicts in the code, few tests had conflicts in autogenerated CHECKs:
llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll

Reviewed By: alexfh

Differential Revision: https://reviews.llvm.org/D156381
2023-07-26 22:13:32 -07:00

1203 lines
52 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Disabled endcf collapse at -O0.
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -O0 -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN-O0 %s
; Note: Breaking large PHIs is disabled to prevent branches from being eliminated (in scc_liveness)
; Nested if inside if where both "else" edges target the same block
; (bb.outer.end), so the inner exec-mask restore can be folded into the
; outer one: the optimized GCN output contains a single "s_or_b64 exec"
; restore (at .LBB0_3), while the GCN-O0 output (endcf collapse disabled
; at -O0, per the RUN lines above) keeps two restores (.LBB0_3 %Flow and
; .LBB0_4 %bb.outer.end).
define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: simple_nested_if:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB0_3
; GCN-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_and_b64 exec, exec, vcc
; GCN-NEXT: s_cbranch_execz .LBB0_3
; GCN-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: v_mov_b32_e32 v2, 1
; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:4
; GCN-NEXT: .LBB0_3: ; %bb.outer.end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 3
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: simple_nested_if:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB0_4
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 1
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v0
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0
; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB0_3
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s2, 2
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: .LBB0_3: ; %Flow
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
; GCN-O0-NEXT: s_mov_b32 m0, -1
; GCN-O0-NEXT: ds_write_b32 v0, v2
; GCN-O0-NEXT: s_endpgm
; IR: outer guard is tid > 1, inner guard is tid != 2 (the "eq 2" edge goes
; straight to bb.outer.end), so both else-edges share the same successor.
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = icmp ugt i32 %tmp, 1
br i1 %tmp1, label %bb.outer.then, label %bb.outer.end
bb.outer.then: ; preds = %bb
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp4, align 4
%tmp5 = icmp eq i32 %tmp, 2
br i1 %tmp5, label %bb.outer.end, label %bb.inner.then
bb.inner.then: ; preds = %bb.outer.then
%tmp7 = add i32 %tmp, 1
%tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
store i32 1, ptr addrspace(1) %tmp9, align 4
br label %bb.outer.end
; Common exit: store to addrspace(3) null (lowered to the ds_write_b32 seen
; in the checks), executed by all paths.
bb.outer.end: ; preds = %bb.outer.then, %bb.inner.then, %bb
store i32 3, ptr addrspace(3) null
ret void
}
; Same nesting as simple_nested_if, but the inner if rejoins at
; bb.inner.end, which does extra work (stores arg[tid+2] = 2) before
; reaching bb.outer.end, so the two exec-mask restores cannot be collapsed:
; the GCN output keeps both "s_or_b64 exec" restores (.LBB1_3 %bb.inner.end
; and .LBB1_4 %Flow).
define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: uncollapsable_nested_if:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB1_4
; GCN-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_cbranch_execz .LBB1_3
; GCN-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:4
; GCN-NEXT: .LBB1_3: ; %bb.inner.end
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
; GCN-NEXT: .LBB1_4: ; %Flow
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 3
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: uncollapsable_nested_if:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB1_3
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 1
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v0
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0
; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB1_4
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s2, 2
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB1_4
; GCN-O0-NEXT: .LBB1_3: ; %Flow
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_branch .LBB1_5
; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s2, v1, 4
; GCN-O0-NEXT: v_readlane_b32 s3, v1, 5
; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], v0
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB1_3
; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
; GCN-O0-NEXT: s_mov_b32 m0, -1
; GCN-O0-NEXT: ds_write_b32 v0, v2
; GCN-O0-NEXT: s_endpgm
; IR: the inner if rejoins at bb.inner.end (not bb.outer.end), which
; performs its own store before falling through to the outer exit — this is
; what makes the two endcfs uncollapsable.
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = icmp ugt i32 %tmp, 1
br i1 %tmp1, label %bb.outer.then, label %bb.outer.end
bb.outer.then: ; preds = %bb
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp4, align 4
%tmp5 = icmp eq i32 %tmp, 2
br i1 %tmp5, label %bb.inner.end, label %bb.inner.then
bb.inner.then: ; preds = %bb.outer.then
%tmp7 = add i32 %tmp, 1
%tmp8 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
store i32 1, ptr addrspace(1) %tmp8, align 4
br label %bb.inner.end
; Inner join: runs on both inner paths, stores arg[tid+2] = 2.
bb.inner.end: ; preds = %bb.inner.then, %bb.outer.then
%tmp9 = add i32 %tmp, 2
%tmp10 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp9
store i32 2, ptr addrspace(1) %tmp10, align 4
br label %bb.outer.end
bb.outer.end: ; preds = %bb.inner.then, %bb
store i32 3, ptr addrspace(3) null
ret void
}
; Outer if guarding an inner if/else. In the GCN output the else arm runs
; first (%bb.else ending at .LBB2_3), exec is inverted with
; s_andn2_saveexec_b64 for %bb.then, and the single outer restore at
; .LBB2_5 (s_or_b64 exec, exec, s[2:3]) also closes the inner construct.
define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: nested_if_if_else:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GCN-NEXT: s_cbranch_execz .LBB2_5
; GCN-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v1
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB2_3
; GCN-NEXT: ; %bb.2: ; %bb.else
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:8
; GCN-NEXT: ; implicit-def: $vgpr3_vgpr4
; GCN-NEXT: .LBB2_3: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB2_5
; GCN-NEXT: ; %bb.4: ; %bb.then
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:4
; GCN-NEXT: .LBB2_5: ; %bb.outer.end
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 3
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v2, v0
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: nested_if_if_else:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1]
; GCN-O0-NEXT: v_writelane_b32 v1, s2, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s3, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v4, 0
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s4, 2
; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s4
; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_6
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
; GCN-O0-NEXT: v_writelane_b32 v1, s2, 4
; GCN-O0-NEXT: v_writelane_b32 v1, s3, 5
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_2
; GCN-O0-NEXT: s_branch .LBB2_4
; GCN-O0-NEXT: .LBB2_2: ; %Flow
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 6
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 7
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_5
; GCN-O0-NEXT: ; %bb.3: ; %bb.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s2, 2
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB2_5
; GCN-O0-NEXT: .LBB2_4: ; %bb.else
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], v0
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB2_2
; GCN-O0-NEXT: .LBB2_5: ; %Flow1
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 6
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 7
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
; GCN-O0-NEXT: s_mov_b32 m0, -1
; GCN-O0-NEXT: ds_write_b32 v0, v2
; GCN-O0-NEXT: s_endpgm
; IR: the store to arg[tid] happens unconditionally in %bb before the outer
; guard (tid > 1); the inner if/else selects bb.then (tid == 2) or bb.else,
; and all paths rejoin at bb.outer.end.
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp1, align 4
%tmp2 = icmp ugt i32 %tmp, 1
br i1 %tmp2, label %bb.outer.then, label %bb.outer.end
bb.outer.then: ; preds = %bb
%tmp5 = icmp eq i32 %tmp, 2
br i1 %tmp5, label %bb.then, label %bb.else
bb.then: ; preds = %bb.outer.then
%tmp3 = add i32 %tmp, 1
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp3
store i32 1, ptr addrspace(1) %tmp4, align 4
br label %bb.outer.end
bb.else: ; preds = %bb.outer.then
%tmp7 = add i32 %tmp, 2
%tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
store i32 2, ptr addrspace(1) %tmp9, align 4
br label %bb.outer.end
bb.outer.end: ; preds = %bb, %bb.then, %bb.else
store i32 3, ptr addrspace(3) null
ret void
}
; Top-level if/else where each arm contains its own inner if. In the GCN
; output each inner if gets its own exec restore (.LBB3_3 %Flow for
; bb.inner.then2 and .LBB3_7 %Flow1 for bb.inner.then), the outer if/else
; is sequenced via s_andn2_saveexec_b64 at .LBB3_4 %Flow2, and the outer
; restore is at .LBB3_8 %bb.outer.end.
define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: nested_if_else_if:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 2, v0
; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB3_4
; GCN-NEXT: ; %bb.1: ; %bb.outer.else
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: v_mov_b32_e32 v3, 3
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:12
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT: s_cbranch_execz .LBB3_3
; GCN-NEXT: ; %bb.2: ; %bb.inner.then2
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s8, s10
; GCN-NEXT: s_mov_b32 s9, s10
; GCN-NEXT: v_mov_b32_e32 v0, 4
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 offset:16
; GCN-NEXT: .LBB3_3: ; %Flow
; GCN-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: .LBB3_4: ; %Flow2
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB3_8
; GCN-NEXT: ; %bb.5: ; %bb.outer.then
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, 1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:4
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_cbranch_execz .LBB3_7
; GCN-NEXT: ; %bb.6: ; %bb.inner.then
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
; GCN-NEXT: .LBB3_7: ; %Flow1
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: .LBB3_8: ; %bb.outer.end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 3
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: nested_if_else_if:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: v_mov_b32_e32 v4, 0
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: s_mov_b32 s1, s0
; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s1
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v6, v4
; GCN-O0-NEXT: v_add_i32_e64 v5, s[2:3], s2, v2
; GCN-O0-NEXT: v_mov_b32_e32 v2, s1
; GCN-O0-NEXT: v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3]
; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v6, v2
; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
; GCN-O0-NEXT: v_writelane_b32 v1, s2, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s3, 1
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_1
; GCN-O0-NEXT: s_branch .LBB3_4
; GCN-O0-NEXT: .LBB3_1: ; %Flow2
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_8
; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: v_mov_b32_e32 v2, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 offset:4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_7
; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:8
; GCN-O0-NEXT: s_branch .LBB3_7
; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s4, s0
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 offset:12
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 6
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 7
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_6
; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, 4
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:16
; GCN-O0-NEXT: .LBB3_6: ; %Flow
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 6
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 7
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_branch .LBB3_1
; GCN-O0-NEXT: .LBB3_7: ; %Flow1
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
; GCN-O0-NEXT: s_mov_b32 m0, -1
; GCN-O0-NEXT: ds_write_b32 v0, v2
; GCN-O0-NEXT: s_endpgm
; IR: cc1 (tid > 1) selects the outer then/else arm; each arm has its own
; inner condition (cc2: tid == 2 in the then-arm, cc3: tid == 0 in the
; else-arm — kept, per the inline note, so it is not folded away by
; domination). All five paths rejoin at bb.outer.end.
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp1, align 4
%cc1 = icmp ugt i32 %tmp, 1
br i1 %cc1, label %bb.outer.then, label %bb.outer.else
bb.outer.then:
%tmp2 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 1
store i32 1, ptr addrspace(1) %tmp2, align 4
%cc2 = icmp eq i32 %tmp, 2
br i1 %cc2, label %bb.inner.then, label %bb.outer.end
bb.inner.then:
%tmp3 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 2
store i32 2, ptr addrspace(1) %tmp3, align 4
br label %bb.outer.end
bb.outer.else:
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 3
store i32 3, ptr addrspace(1) %tmp4, align 4
%cc3 = icmp eq i32 %tmp, 0 ; avoid being optimized away through the domination
br i1 %cc3, label %bb.inner.then2, label %bb.outer.end
bb.inner.then2:
%tmp5 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 4
store i32 4, ptr addrspace(1) %tmp5, align 4
br label %bb.outer.end
bb.outer.end:
store i32 3, ptr addrspace(3) null
ret void
}
; Test that a barrier placed after a divergent if is only reached once the
; saved exec mask has been restored: the s_or_b64 of exec must stay before
; the s_barrier in %bb.end (the barrier is "unsafe" to hoist/collapse past).
define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: s_endpgm_unsafe_barrier:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GCN-NEXT: s_cbranch_execz .LBB4_2
; GCN-NEXT: ; %bb.1: ; %bb.then
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v1, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: .LBB4_2: ; %bb.end
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_barrier
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: s_endpgm_unsafe_barrier:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB4_2
; GCN-O0-NEXT: ; %bb.1: ; %bb.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_ashrrev_i32_e64 v0, 31, v2
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v0
; GCN-O0-NEXT: s_mov_b32 s4, 2
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: .LBB4_2: ; %bb.end
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: s_barrier
; GCN-O0-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-lane id -> divergent condition below
%tmp1 = icmp ugt i32 %tmp, 1
br i1 %tmp1, label %bb.then, label %bb.end
bb.then: ; preds = %bb
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp4, align 4
br label %bb.end
bb.end: ; preds = %bb.then, %bb
; Barrier must execute with exec fully restored for the whole wavefront.
call void @llvm.amdgcn.s.barrier()
ret void
}
; Reduced testcase (named for an SCC-liveness issue): a loop %bb1 with an
; inner self-loop, containing a divergent if (%bb4/%bb8) whose <4 x float>
; result feeds a phi that is volatile-stored at the loop exit %bb12. The
; compare %tmp3 is reused as both the inner-if and the loop-exit condition,
; so the exec-mask save/restore chains must stay correctly nested.
define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-LABEL: scc_liveness:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_movk_i32 s4, 0x207
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GCN-NEXT: s_mov_b32 s8, 0
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GCN-NEXT: s_mov_b64 s[12:13], 0
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_branch .LBB5_3
; GCN-NEXT: .LBB5_1: ; %Flow
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN-NEXT: .LBB5_2: ; %bb10
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[14:15]
; GCN-NEXT: s_and_b64 s[6:7], exec, s[4:5]
; GCN-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13]
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_andn2_b64 exec, exec, s[12:13]
; GCN-NEXT: s_cbranch_execz .LBB5_7
; GCN-NEXT: .LBB5_3: ; %bb1
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_and_b64 s[10:11], exec, vcc
; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GCN-NEXT: s_cbranch_execnz .LBB5_3
; GCN-NEXT: ; %bb.4: ; %bb2
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_mov_b32 s10, s8
; GCN-NEXT: s_mov_b32 s11, s8
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: s_and_saveexec_b64 s[14:15], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB5_2
; GCN-NEXT: ; %bb.5: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[6:7]
; GCN-NEXT: s_cbranch_execz .LBB5_1
; GCN-NEXT: ; %bb.6: ; %bb8
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: s_branch .LBB5_1
; GCN-NEXT: .LBB5_7: ; %bb12
; GCN-NEXT: s_or_b64 exec, exec, s[12:13]
; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: scc_liveness:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[4:5], 0
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: s_waitcnt expcnt(1)
; GCN-O0-NEXT: v_writelane_b32 v1, s6, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s7, 1
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 3
; GCN-O0-NEXT: .LBB5_1: ; %bb1
; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s8, v1, 2
; GCN-O0-NEXT: v_readlane_b32 s9, v1, 3
; GCN-O0-NEXT: v_readlane_b32 s6, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s7, v1, 1
; GCN-O0-NEXT: v_writelane_b32 v1, s6, 4
; GCN-O0-NEXT: v_writelane_b32 v1, s7, 5
; GCN-O0-NEXT: s_mov_b32 s4, 0x207
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v0, s4
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 6
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 7
; GCN-O0-NEXT: v_writelane_b32 v1, s6, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s7, 1
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v1, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s7, 3
; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1
; GCN-O0-NEXT: ; %bb.2: ; %bb2
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 6
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 7
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_mov_b32 s6, 0
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s6
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 8
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 9
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: s_mov_b32 s8, s4
; GCN-O0-NEXT: s_mov_b32 s9, s4
; GCN-O0-NEXT: s_mov_b32 s10, s4
; GCN-O0-NEXT: s_mov_b32 s11, s4
; GCN-O0-NEXT: v_mov_b32_e32 v2, s8
; GCN-O0-NEXT: v_mov_b32_e32 v3, s9
; GCN-O0-NEXT: v_mov_b32_e32 v4, s10
; GCN-O0-NEXT: v_mov_b32_e32 v5, s11
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 10
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 11
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execz .LBB5_5
; GCN-O0-NEXT: ; %bb.3: ; %bb4
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v0, s4
; GCN-O0-NEXT: s_mov_b32 s8, s4
; GCN-O0-NEXT: s_mov_b32 s9, s4
; GCN-O0-NEXT: s_mov_b32 s10, s4
; GCN-O0-NEXT: s_mov_b32 s11, s4
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, s8
; GCN-O0-NEXT: v_mov_b32_e32 v3, s9
; GCN-O0-NEXT: v_mov_b32_e32 v4, s10
; GCN-O0-NEXT: v_mov_b32_e32 v5, s11
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 12
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 13
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execz .LBB5_6
; GCN-O0-NEXT: ; %bb.4: ; %bb8
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_mov_b32 s10, 0
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: ; implicit-def: $sgpr5
; GCN-O0-NEXT: ; implicit-def: $sgpr9
; GCN-O0-NEXT: ; implicit-def: $sgpr5
; GCN-O0-NEXT: ; implicit-def: $sgpr8
; GCN-O0-NEXT: ; implicit-def: $sgpr5
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b32 s5, s10
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: v_mov_b32_e32 v3, s5
; GCN-O0-NEXT: v_mov_b32_e32 v4, s6
; GCN-O0-NEXT: v_mov_b32_e32 v5, s7
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_branch .LBB5_6
; GCN-O0-NEXT: .LBB5_5: ; %Flow2
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 10
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 11
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_branch .LBB5_7
; GCN-O0-NEXT: .LBB5_6: ; %Flow
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 12
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 13
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_branch .LBB5_5
; GCN-O0-NEXT: .LBB5_7: ; %bb10
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: v_readlane_b32 s6, v1, 8
; GCN-O0-NEXT: v_readlane_b32 s7, v1, 9
; GCN-O0-NEXT: s_mov_b64 s[4:5], -1
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 14
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 15
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 16
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 17
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execz .LBB5_9
; GCN-O0-NEXT: ; %bb.8: ; %Flow1
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_mov_b64 s[4:5], 0
; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 14
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 15
; GCN-O0-NEXT: .LBB5_9: ; %Flow3
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s8, v1, 16
; GCN-O0-NEXT: v_readlane_b32 s9, v1, 17
; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-O0-NEXT: v_readlane_b32 s6, v1, 4
; GCN-O0-NEXT: v_readlane_b32 s7, v1, 5
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 14
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 15
; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5]
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0
; GCN-O0-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v1, s8, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s9, 1
; GCN-O0-NEXT: v_writelane_b32 v1, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s7, 3
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v1, s6, 18
; GCN-O0-NEXT: v_writelane_b32 v1, s7, 19
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1
; GCN-O0-NEXT: ; %bb.10: ; %bb12
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 18
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 19
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: ; %bb.11: ; %bb12
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v5
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v4
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v3
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v2
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
bb:
br label %bb1
bb1: ; preds = %Flow1, %bb1, %bb
; Self-loop header: 0x207 == 519, matches the s_movk_i32 in the checks.
%tmp = icmp slt i32 %arg, 519
br i1 %tmp, label %bb2, label %bb1
bb2: ; preds = %bb1
%tmp3 = icmp eq i32 %arg, 0
br i1 %tmp3, label %bb4, label %bb10
bb4: ; preds = %bb2
%tmp6 = load float, ptr addrspace(5) undef
%tmp7 = fcmp olt float %tmp6, 0.0
br i1 %tmp7, label %bb8, label %Flow
bb8: ; preds = %bb4
%tmp9 = insertelement <4 x float> undef, float 0.0, i32 1
br label %Flow
Flow: ; preds = %bb8, %bb4
%tmp8 = phi <4 x float> [ %tmp9, %bb8 ], [ zeroinitializer, %bb4 ]
br label %bb10
bb10: ; preds = %Flow, %bb2
%tmp11 = phi <4 x float> [ zeroinitializer, %bb2 ], [ %tmp8, %Flow ]
; %tmp3 is deliberately reused as the loop-exit condition too.
br i1 %tmp3, label %bb12, label %Flow1
Flow1: ; preds = %bb10
br label %bb1
bb12: ; preds = %bb10
; Volatile store keeps the phi result (and its control flow) alive.
store volatile <4 x float> %tmp11, ptr addrspace(5) undef, align 16
ret void
}
; Intrinsic declarations and attribute groups shared by the tests above.
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare void @llvm.amdgcn.s.barrier() #1
attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind convergent }
attributes #2 = { nounwind }