Files
clang-p2996/llvm/test/CodeGen/AMDGPU/waitcnt.mir
Tony 1bc7bfffdb [AMDGPU] Optimize waitcnt insertion for flat memory operations
Change waitcnt insertion to check the memory operand tokens to see
whether a flat memory operation accesses VMEM, in the same way it already
checks whether the operation accesses LDS. This avoids adding waitcnt
for the counters of address spaces that are not accessed.

In addition, only generate the pessimistic waitcnt 0 if a flat memory
operation appears to access both VMEM and LDS.

This benefits flat memory operations that explicitly specify the
address space as GLOBAL or LOCAL.

Differential Revision: https://reviews.llvm.org/D89618
2020-10-20 22:55:12 +00:00

335 lines
9.5 KiB
YAML

# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefixes=CHECK,GFX89 %s
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefixes=CHECK,GFX89 %s
# Embedded LLVM IR module. It declares one stub kernel (the body is only
# `ret void`) for each MIR function in this file. Only flat_zero_waitcnt takes
# arguments: %global4 and %global16 are addrspace(1) pointers, while %flat4 and
# %flat16 carry no address-space qualifier. Those four values are the ones the
# MIR memory operands of flat_zero_waitcnt refer to as %ir.<name>.
--- |
define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
<4 x i32> addrspace(1)* %global16,
i32* %flat4,
<4 x i32>* %flat16) {
ret void
}
define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() {
ret void
}
define amdgpu_kernel void @single_branch_successor_not_next_block() {
ret void
}
define amdgpu_kernel void @preexisting_waitcnt() {
ret void
}
define amdgpu_kernel void @bundle_no_waitcnt() {
ret void
}
define amdgpu_kernel void @preexisting_waitcnt_in_bundle() {
ret void
}
define amdgpu_kernel void @insert_in_bundle() {
ret void
}
define amdgpu_kernel void @exit_bundle() {
ret void
}
define amdgpu_kernel void @cross_bundle() {
ret void
}
define amdgpu_kernel void @subregs16bit() {
ret void
}
...
---
# Covers wait insertion around FLAT loads whose memory operands differ from
# block to block:
#   bb.0 - both loads reference global (addrspace(1)) IR values
#   bb.1 - the first load has no memory operand, the second is global
#   bb.2 - both loads reference the unqualified (flat) IR values
#   bb.3 - a flat load followed by a global load
#   bb.4 - a single flat load
# The decoded form of every expected S_WAITCNT immediate is spelled out in the
# comment directly above the corresponding check line.
# CHECK-LABEL: name: flat_zero_waitcnt
# CHECK-LABEL: bb.0:
# CHECK: FLAT_LOAD_DWORD
# CHECK: FLAT_LOAD_DWORDX4
# Global loads return in order, so it is enough to wait until only one load is
# still outstanding (i.e. the first load has completed):
# s_waitcnt vmcnt(1)
# CHECK-NEXT: S_WAITCNT 3953
# CHECK-LABEL: bb.1:
# CHECK: FLAT_LOAD_DWORD
# s_waitcnt vmcnt(0)
# GFX89: S_WAITCNT 3952
# CHECK: FLAT_LOAD_DWORDX4
# CHECK-LABEL: bb.2:
# CHECK: FLAT_LOAD_DWORD
# s_waitcnt vmcnt(0)
# GFX89: S_WAITCNT 3952
# CHECK: FLAT_LOAD_DWORDX4
# CHECK-LABEL: bb.3:
# s_waitcnt vmcnt(0)
# GFX89: S_WAITCNT 3952
# CHECK: FLAT_LOAD_DWORD
# CHECK: FLAT_LOAD_DWORD
# s_waitcnt vmcnt(0) lgkmcnt(0)
# GFX89: S_WAITCNT 112
# CHECK-LABEL: bb.4:
# No wait is expected ahead of the load in bb.4, only after it.
# GFX89-NOT: S_WAITCNT
# CHECK: FLAT_LOAD_DWORD
# s_waitcnt vmcnt(0) lgkmcnt(0)
# GFX89: S_WAITCNT 112
name: flat_zero_waitcnt
body: |
bb.0:
successors: %bb.1
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4)
$vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
$vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.2
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
$vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
$vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
S_BRANCH %bb.2
bb.2:
successors: %bb.3
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
$vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.flat16)
$vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
S_BRANCH %bb.3
bb.3:
successors: %bb.4
$vgpr3 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
$vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4)
$vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec
S_BRANCH %bb.4
bb.4:
$vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
$vgpr0 = V_MOV_B32_e32 $vgpr5, implicit $exec
S_ENDPGM 0
...
---
# There is only a single fallthrough successor block, so there's no
# need to wait immediately.
# The wait is instead expected in bb.1, directly before the FLAT_STORE_DWORD
# that reads the loaded $vgpr0. Immediate 112 is the value annotated as
# "vmcnt(0) lgkmcnt(0)" in the flat_zero_waitcnt test in this file.
# CHECK-LABEL: name: single_fallthrough_successor_no_end_block_wait
# CHECK: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2
# CHECK-NOT: S_WAITCNT
# CHECK: bb.1:
# CHECK-NEXT: V_LSHLREV_B64
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: FLAT_STORE_DWORD
name: single_fallthrough_successor_no_end_block_wait
body: |
bb.0:
successors: %bb.1
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
bb.1:
$vgpr3_vgpr4 = V_LSHLREV_B64 4, $vgpr7_vgpr8, implicit $exec
FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
# The block has a single predecessor with a single successor, but it
# is not the next block so it's non-obvious that the wait is not needed.
# bb.0 branches over bb.1 straight to bb.2. bb.1 is not listed as a successor
# of bb.0, and its store is expected to directly follow the block label with
# no wait in between. The wait for the load is expected in bb.2, right before
# the store that reads $vgpr0.
# CHECK-LABEL: name: single_branch_successor_not_next_block
# CHECK: bb.1
# CHECK-NEXT: FLAT_STORE_DWORD
# CHECK-NEXT: S_ENDPGM 0
# CHECK: bb.2:
# CHECK-NEXT: V_LSHLREV_B64
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: FLAT_STORE_DWORD
name: single_branch_successor_not_next_block
body: |
bb.0:
successors: %bb.2
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
S_BRANCH %bb.2
bb.1:
FLAT_STORE_DWORD $vgpr8_vgpr9, $vgpr10, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
bb.2:
$vgpr3_vgpr4 = V_LSHLREV_B64 4, $vgpr7_vgpr8, implicit $exec
FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
# An S_WAITCNT 0 is already present between the load and the store that reads
# $vgpr0. The checks expect that wait to directly follow the load and no other
# S_WAITCNT to appear afterwards in this function.
# CHECK-LABEL: name: preexisting_waitcnt{{$}}
# CHECK: FLAT_LOAD_DWORD
# CHECK-NEXT: S_WAITCNT 0
# CHECK-NOT: S_WAITCNT
name: preexisting_waitcnt
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
liveins: $vgpr1_vgpr2
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
S_WAITCNT 0
FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
...
---
# A bundle holding only two S_NOPs sits between the load and the store that
# reads $vgpr0. The wait is expected after the bundle's closing brace, not
# inside the bundle.
# CHECK-LABEL: name: bundle_no_waitcnt{{$}}
# CHECK: FLAT_LOAD_DWORD
# CHECK-NEXT: BUNDLE
# CHECK-NEXT: S_NOP
# CHECK-NEXT: S_NOP
# CHECK-NEXT: }
# CHECK-NEXT: S_WAITCNT 112
name: bundle_no_waitcnt
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
liveins: $vgpr1_vgpr2
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
BUNDLE {
S_NOP 0
S_NOP 0
}
FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
...
---
# The pass should see the S_WAITCNT 0 that already exists inside the bundle
# and must not insert an extra one for the store that follows the bundle.
# CHECK-LABEL: name: preexisting_waitcnt_in_bundle{{$}}
# CHECK: FLAT_LOAD_DWORD
# CHECK: S_WAITCNT 0
# CHECK-NOT: S_WAITCNT
name: preexisting_waitcnt_in_bundle
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
liveins: $vgpr1_vgpr2
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
BUNDLE {
S_NOP 0
S_WAITCNT 0
}
FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
...
---
# Def and use inside bundle
# The load that defines $vgpr0 and the store that reads it (as an `internal`
# operand) are in the same bundle, so the wait is expected between the two
# instructions, inside the bundle.
# CHECK-LABEL: name: insert_in_bundle{{$}}
# CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
# CHECK-NEXT: }
name: insert_in_bundle
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
liveins: $vgpr1_vgpr2
BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
}
...
---
# Def is last instruction in bundle, use is outside bundle
# The wait is expected after the bundle's closing brace and directly before
# the unbundled store that reads $vgpr0.
# CHECK-LABEL: name: exit_bundle{{$}}
# CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
# CHECK-NEXT: }
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
name: exit_bundle
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
liveins: $vgpr1_vgpr2
BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
}
FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
...
---
# Def is in bundle, use is in another bundle
# The wait is expected between the two bundles: after the closing brace of the
# bundle containing the load and before the header of the bundle containing
# the store.
# CHECK-LABEL: name: cross_bundle{{$}}
# CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
# CHECK-NEXT: }
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 {
# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
# CHECK-NEXT: }
name: cross_bundle
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
liveins: $vgpr1_vgpr2
BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
}
BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 {
FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
}
...
---
# Two FLAT_LOAD_USHORT instructions define $vgpr0 and $vgpr1, and the V_NOP
# reads them only through their 16-bit sub-registers ($vgpr0_lo16 and
# $vgpr1_lo16). A wait is still expected immediately before the V_NOP.
# CHECK-LABEL: name: subregs16bit
# CHECK: S_WAITCNT 112
# CHECK-NEXT: V_NOP_e32
name: subregs16bit
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4
$vgpr0 = FLAT_LOAD_USHORT killed $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
$vgpr1 = FLAT_LOAD_USHORT killed $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
V_NOP_e32 implicit $exec, implicit $vgpr0_lo16, implicit $vgpr1_lo16
...