Files
clang-p2996/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
Austin Kerbow e501ed84aa [AMDGPU] Don't flush vmcnt for loops with use/def pairs
Conditions for hoisting vmcnt with flat instructions should be similar to VMEM.
If there are use/def pairs in a loop body we cannot guarantee that hoisting the
waitcnt will be profitable. Better heuristics are needed to analyse whether the
gains from avoiding waitcnt in loop bodies outweigh waiting for loads in the
preheader.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D151126
2023-06-02 22:55:12 -07:00

738 lines
19 KiB
YAML

# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
---
# The loop contains a store and a use of a value loaded outside of the loop.
# We expect the waitcnt for the use to be hoisted on GFX9, but not on GFX10+
# because we have the vscnt counter.
#
# Layout: bb.0 is the preheader and issues the load into $vgpr0; bb.1 is a
# single-block loop (it lists itself as a successor) that issues a buffer
# store and then reads $vgpr0 in V_ADD_U32_e32.
# The patterns "S_WAITCNT 39" and "S_WAITCNT 16" look like leading-digit
# matches for the vmcnt(0) operands 3952 (gfx900 run) and 16240 (gfx1010 run)
# that waitcnt_vm_zero in this file checks in full.
# GFX9-LABEL: waitcnt_vm_loop
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop
body: |
bb.0:
successors: %bb.1
$vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Same as before, but the loop preheader has no terminator.
#
# bb.0 ends right after the load (no S_BRANCH) and reaches the loop bb.1 by
# fallthrough. The expectations are the same as for waitcnt_vm_loop: the wait
# is matched in bb.0 for the gfx9 run and in bb.1 for the gfx10 run.
# GFX9-LABEL: waitcnt_vm_loop_noterm
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop_noterm
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop_noterm
body: |
bb.0:
successors: %bb.1
$vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
bb.1:
successors: %bb.1, %bb.2
BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Same as before but there is a preexisting waitcnt in the preheader.
#
# The input already contains "S_WAITCNT 3952" at the end of bb.0. The positive
# match followed by the negative match inside bb.0 requires that exactly one
# matching wait appears before the bb.1 label, i.e. no second one is added.
# Only the gfx9 run has directives for this function; the gfx10 run still
# compiles it but checks nothing here.
# GFX9-LABEL: waitcnt_vm_loop_noterm_wait
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
name: waitcnt_vm_loop_noterm_wait
body: |
bb.0:
successors: %bb.1
$vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_WAITCNT 3952
bb.1:
successors: %bb.1, %bb.2
BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# The loop contains a store, a load, and uses values loaded both inside and
# outside the loop.
# We do not expect the waitcnt to be hoisted out of the loop.
#
# In bb.1 the V_ADD_U32_e32 reads $vgpr0 (loaded in bb.0) and $vgpr7 (loaded
# by the preceding instruction of the same loop iteration), so both runs
# expect the wait inside bb.1 and none in bb.0.
# GFX9-LABEL: waitcnt_vm_loop_load
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop_load
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop_load
body: |
bb.0:
successors: %bb.1
$vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
$vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# The loop contains a use of a value loaded outside of the loop, and no store
# nor load.
# We do not expect the waitcnt to be hoisted out of the loop.
#
# The loop body bb.1 holds only the V_ADD_U32_e32 that reads $vgpr0 plus the
# compare-and-branch; it issues no memory instruction. Both runs expect the
# wait to stay in bb.1.
# GFX9-LABEL: waitcnt_vm_loop_no_store
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop_no_store
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop_no_store
body: |
bb.0:
successors: %bb.1
$vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# The loop contains a store, no load, and doesn't use any value loaded inside
# or outside of the loop. There is only one use of the loaded value in the
# exit block.
# We don't expect any s_waitcnt vmcnt in the loop body or preheader, but expect
# one in the exit block.
#
# The loop's V_ADD_U32_e32 reads only $vgpr2; the loaded $vgpr0 is first read
# in the exit block bb.2.
# NOTE(review): no directive follows the bb.2 label for either run, so the
# exit-block wait described above is not actually verified by this test.
# Consider adding a positive match after the bb.2 label once the expected
# operand has been confirmed with llc.
# GFX9-LABEL: waitcnt_vm_loop_no_use
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop_no_use
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop_no_use
body: |
bb.0:
successors: %bb.1
$vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
S_ENDPGM 0
...
---
# The loop loads a value that is not used in the loop, and uses a value loaded
# outside of the loop.
# We expect the waitcnt to be hoisted out of the loop to wait a single time
# before the loop is executed and avoid waiting for the load to complete on
# each iteration.
#
# In bb.1 the V_ADD_U32_e32 reads $vgpr0 (loaded in bb.0); the loop's own load
# writes $vgpr1, which no instruction in the loop reads. Both runs expect the
# wait in bb.0 and none in bb.1.
# GFX9-LABEL: waitcnt_vm_loop2
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop2
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop2
body: |
bb.0:
successors: %bb.1
$vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
$vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Same as before with an additional store in the loop. We still expect the
# waitcnt instructions to be hoisted.
#
# bb.1 matches waitcnt_vm_loop2 plus a BUFFER_STORE_DWORD_OFFEN_exact placed
# after the loop's load. Both runs expect the wait in bb.0 and none in bb.1.
# GFX9-LABEL: waitcnt_vm_loop2_store
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop2_store
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop2_store
body: |
bb.0:
successors: %bb.1
$vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
$vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Same as loop2 but the value loaded inside the loop is also used in the loop.
# We do not expect the waitcnt to be hoisted out of the loop.
#
# The loop's load writes $vgpr1 and the following V_ADD_U32_e32 reads $vgpr1,
# forming a def/use pair inside bb.1. Both runs expect the wait in bb.1 and
# none in bb.0.
# GFX9-LABEL: waitcnt_vm_loop2_use_in_loop
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop2_use_in_loop
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop2_use_in_loop
body: |
bb.0:
successors: %bb.1
$vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
$vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
$vgpr4 = V_ADD_U32_e32 $vgpr5, $vgpr1, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# The loop contains a use of a value loaded outside of the loop, but we already
# waited for that load to complete. The loop also loads a value that is not used
# in the loop. We do not expect any waitcnt in the loop.
#
# Layout: bb.0 loads $vgpr0 and reads it immediately (hence the wait matched
# in bb.0); bb.1 is straight-line code that does not touch $vgpr0; bb.2 is the
# loop, which reads $vgpr0 and loads into $vgpr1; bb.3 is the exit block.
# GFX9-LABEL: waitcnt_vm_loop2_nowait
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.3:
# GFX10-LABEL: waitcnt_vm_loop2_nowait
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.3:
name: waitcnt_vm_loop2_nowait
body: |
bb.0:
successors: %bb.1
$vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
$vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.2
$vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
$vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
$vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
S_BRANCH %bb.2
bb.2:
successors: %bb.2, %bb.3
$vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
$vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.2, implicit killed $scc
S_BRANCH %bb.3
bb.3:
S_ENDPGM 0
...
---
# Similar test case but for register intervals.
#
# bb.0 loads the four-register tuple $vgpr0_vgpr1_vgpr2_vgpr3; the loop reads
# only $vgpr0 from it and loads $vgpr4_vgpr5_vgpr6_vgpr7 with IMAGE_SAMPLE_V4_V2
# without reading any part of that result. Both runs expect the wait in bb.0
# and none in bb.1.
# GFX9-LABEL: waitcnt_vm_loop2_reginterval
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop2_reginterval
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop2_reginterval
body: |
bb.0:
successors: %bb.1
$vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr10 = COPY $vgpr0
$vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Same as waitcnt_vm_loop2_reginterval, but the loop additionally reads $vgpr7,
# one register of the tuple $vgpr4_vgpr5_vgpr6_vgpr7 loaded by IMAGE_SAMPLE_V4_V2
# inside the loop. With a def/use pair in the loop body, both runs expect the
# wait to stay in bb.1 and none in bb.0.
# GFX9-LABEL: waitcnt_vm_loop2_reginterval2
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop2_reginterval2
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop2_reginterval2
body: |
bb.0:
successors: %bb.1
$vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr10 = COPY $vgpr0
$vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
$vgpr11 = COPY $vgpr7
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# The loop loads a value that is not used in the loop, but uses a value loaded
# outside of it. We expect the s_waitcnt instruction to be hoisted.
# A s_waitcnt vmcnt(0) is generated to flush in the preheader, but for this
# specific test case, it would be better to use vmcnt(1) instead. This is
# currently not implemented.
#
# bb.0 issues two loads ($vgpr0, then $vgpr1) and the loop reads only $vgpr0,
# the result of the first one. Unlike most tests in this file, the bb.0
# patterns spell out the complete operand (3952 for gfx900, 16240 for gfx1010),
# so a wait with any other counter value would fail the match.
# GFX9-LABEL: waitcnt_vm_zero
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 3952
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_zero
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16240
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16240
# GFX10-LABEL: bb.2:
name: waitcnt_vm_zero
body: |
bb.0:
successors: %bb.1
$vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
$vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr3, implicit $exec
$vgpr2 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr3, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# This test case checks that we flush the vmcnt counter only if necessary
# (i.e. if a waitcnt is needed for the vgpr use we find in the loop)
#
# In bb.0 the second load takes $vgpr0 as an operand, and $vgpr0 is part of
# the tuple written by the first load, so a wait is expected before the
# instruction defining $vgpr4. The loop's load also reads $vgpr0; the
# directives require that no further S_WAITCNT (with any operand) appears
# after the $vgpr4 line in bb.0 or anywhere in bb.1.
# The loop block bb.1 lists only itself as successor and ends in S_ENDPGM 0;
# there is no separate exit block.
# GFX10-LABEL: waitcnt_vm_necessary
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16240
# GFX10: $vgpr4
# GFX10-NOT: S_WAITCNT
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT
# GFX9-LABEL: waitcnt_vm_necessary
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 3952
# GFX9: $vgpr4
# GFX9-NOT: S_WAITCNT
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT
name: waitcnt_vm_necessary
body: |
bb.0:
successors: %bb.1(0x80000000)
$vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 killed $vgpr0_vgpr1, 0, 0, implicit $exec
$vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
bb.1:
successors: %bb.1(0x40000000)
$vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_ENDPGM 0
...
---
# The loop contains a global store, and uses a (global) loaded value outside of the loop.
#
# bb.0 loads $vgpr0 with GLOBAL_LOAD_DWORD; the loop bb.1 reads $vgpr0 and then
# issues GLOBAL_STORE_DWORD. bb.1 has no unconditional branch after
# S_CBRANCH_SCC1 and falls through to bb.2, which branches to the exit bb.3.
# The expectations mirror waitcnt_vm_loop: wait in bb.0 for the gfx9 run, wait
# in bb.1 for the gfx10 run.
# GFX9-LABEL: waitcnt_vm_loop_global_mem
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop_global_mem
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop_global_mem
body: |
bb.0:
successors: %bb.1
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
GLOBAL_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
bb.2:
successors: %bb.3
S_BRANCH %bb.3
bb.3:
S_ENDPGM 0
...
---
# Same as above case, but use scratch memory instructions instead
#
# bb.0 loads $vgpr0 with SCRATCH_LOAD_DWORD; the loop bb.1 reads $vgpr0 and
# then issues SCRATCH_STORE_DWORD. Block layout and expectations are otherwise
# identical to waitcnt_vm_loop_global_mem.
# GFX9-LABEL: waitcnt_vm_loop_scratch_mem
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop_scratch_mem
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop_scratch_mem
body: |
bb.0:
successors: %bb.1
$vgpr0 = SCRATCH_LOAD_DWORD $vgpr1, 0, 0, implicit $exec, implicit $flat_scr
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
SCRATCH_STORE_DWORD $vgpr4, $vgpr6, 0, 0, implicit $exec, implicit $flat_scr
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
bb.2:
successors: %bb.3
S_BRANCH %bb.3
bb.3:
S_ENDPGM 0
...
---
# Same as above case, but use flat memory instructions instead
#
# bb.0 loads $vgpr0 with FLAT_LOAD_DWORD; the loop bb.1 reads $vgpr0 and then
# issues FLAT_STORE_DWORD. Block layout is identical to the global and scratch
# variants.
# NOTE(review): the gfx10 patterns here are "S_WAITCNT 11" rather than the
# "S_WAITCNT 16" used by the global/scratch variants, presumably because the
# wait emitted for a flat load encodes a different set of counters - confirm
# against llc output before changing either pattern.
# GFX9-LABEL: waitcnt_vm_loop_flat_mem
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop_flat_mem
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 11
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 11
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop_flat_mem
body: |
bb.0:
successors: %bb.1
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
FLAT_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec, implicit $flat_scr
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
bb.2:
successors: %bb.3
S_BRANCH %bb.3
bb.3:
S_ENDPGM 0
...
---
# The loop contains a store, a load, and uses values loaded both inside and
# outside the loop.
# We do not expect the waitcnt to be hoisted out of the loop.
#
# In bb.1 the V_ADD_U32_e32 reads $vgpr0 (loaded in bb.0) and $vgpr7 (loaded
# by the preceding instruction of the same loop iteration), so both runs
# expect the wait inside bb.1 and none in bb.0.
# NOTE(review): despite "flat" in the function name, the body uses
# GLOBAL_LOAD_DWORD and GLOBAL_STORE_DWORD rather than the FLAT_* opcodes used
# by waitcnt_vm_loop_flat_mem. Confirm this is intended; switching opcodes
# would likely require different S_WAITCNT patterns.
# GFX9-LABEL: waitcnt_vm_loop_flat_load
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX10-LABEL: waitcnt_vm_loop_flat_load
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name: waitcnt_vm_loop_flat_load
body: |
bb.0:
successors: %bb.1
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
GLOBAL_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec
$vgpr7 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...