A preexisting waitcnt may fail to update the scoreboard if the instruction being examined needed to wait on fewer counters than were encoded in the old waitcnt instruction. Fixing this eliminates some redundant waitcnt instructions. These changes also enable combining consecutive waitcnt instructions into a single S_WAITCNT or S_WAITCNT_VSCNT instruction. Reviewed By: rampitec. Differential Revision: https://reviews.llvm.org/D100281
86 lines
3.3 KiB
LLVM
86 lines
3.3 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
|
|
|
; Test: a seq_cst `atomicrmw nand` with the constant 4 on an
; `i32 addrspace(3)*` pointer; the value yielded by the atomicrmw is returned.
;
; The assertions in the body expect a compare-and-swap retry loop:
;   * an initial ds_read_b32 of the current value before the loop;
;   * per iteration, v_not_b32 followed by v_or_b32 with -5, i.e.
;     ~old | ~4, which equals ~(old & 4) -- the nand of the old value with 4;
;   * a ds_cmpst_rtn_b32 that attempts the swap, then a v_cmp_eq_u32 of the
;     returned value against the expected one;
;   * the compare result is OR-ed into s[4:5], which is cleared out of exec,
;     and s_cbranch_execnz loops back while exec is still non-zero;
;   * after the loop, exec is restored by OR-ing s[4:5] back in.
;
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
; NOTE(review): the bare `|` lines inside this body are not LLVM IR syntax;
; they look like extraction residue -- confirm against the upstream test.
define i32 @atomic_nand_i32_lds(i32 addrspace(3)* %ptr) nounwind {
|
|
; GCN-LABEL: atomic_nand_i32_lds:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: ds_read_b32 v1, v0
|
|
; GCN-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN-NEXT: BB0_1: ; %atomicrmw.start
|
|
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v1
|
|
; GCN-NEXT: v_not_b32_e32 v1, v2
|
|
; GCN-NEXT: v_or_b32_e32 v1, -5, v1
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
|
|
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN-NEXT: s_cbranch_execnz BB0_1
|
|
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v1
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
; Atomically store ~(old & 4) to %ptr; %result is the value read before the update.
%result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
|
|
ret i32 %result
|
|
}
|
|
|
|
; Test: a seq_cst `atomicrmw nand` with the constant 4 on an
; `i32 addrspace(1)*` pointer; the value yielded by the atomicrmw is returned.
;
; The assertions in the body expect a compare-and-swap retry loop:
;   * an initial global_load_dword of the current value before the loop;
;   * per iteration, v_not_b32 followed by v_or_b32 with -5, i.e.
;     ~old | ~4, which equals ~(old & 4) -- the nand of the old value with 4;
;   * a global_atomic_cmpswap (with glc) preceded by a wait on both vmcnt and
;     lgkmcnt, and followed by a vmcnt wait and buffer_wbinvl1_vol;
;   * a v_cmp_eq_u32 of the returned value against the expected one, whose
;     result is OR-ed into s[4:5] and cleared out of exec; s_cbranch_execnz
;     loops back while exec is still non-zero;
;   * after the loop, exec is restored by OR-ing s[4:5] back in.
;
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
; NOTE(review): the bare `|` lines inside this body are not LLVM IR syntax;
; they look like extraction residue -- confirm against the upstream test.
define i32 @atomic_nand_i32_global(i32 addrspace(1)* %ptr) nounwind {
|
|
; GCN-LABEL: atomic_nand_i32_global:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: global_load_dword v2, v[0:1], off
|
|
; GCN-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN-NEXT: BB1_1: ; %atomicrmw.start
|
|
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v2
|
|
; GCN-NEXT: v_not_b32_e32 v2, v3
|
|
; GCN-NEXT: v_or_b32_e32 v2, -5, v2
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_wbinvl1_vol
|
|
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN-NEXT: s_cbranch_execnz BB1_1
|
|
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v2
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
; Atomically store ~(old & 4) to %ptr; %result is the value read before the update.
%result = atomicrmw nand i32 addrspace(1)* %ptr, i32 4 seq_cst
|
|
ret i32 %result
|
|
}
|
|
|
|
; Test: a seq_cst `atomicrmw nand` with the constant 4 on an `i32*` pointer
; (no explicit address space); the value yielded by the atomicrmw is returned.
;
; The assertions in the body expect a compare-and-swap retry loop:
;   * an initial flat_load_dword of the current value before the loop;
;   * per iteration, v_not_b32 followed by v_or_b32 with -5, i.e.
;     ~old | ~4, which equals ~(old & 4) -- the nand of the old value with 4;
;   * a flat_atomic_cmpswap (with glc) with a wait on both vmcnt and lgkmcnt
;     before and after it, followed by buffer_wbinvl1_vol;
;   * a v_cmp_eq_u32 of the returned value against the expected one, whose
;     result is OR-ed into s[4:5] and cleared out of exec; s_cbranch_execnz
;     loops back while exec is still non-zero;
;   * after the loop, exec is restored by OR-ing s[4:5] back in.
;
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
; NOTE(review): the bare `|` lines inside this body are not LLVM IR syntax;
; they look like extraction residue -- confirm against the upstream test.
define i32 @atomic_nand_i32_flat(i32* %ptr) nounwind {
|
|
; GCN-LABEL: atomic_nand_i32_flat:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN-NEXT: BB2_1: ; %atomicrmw.start
|
|
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v2
|
|
; GCN-NEXT: v_not_b32_e32 v2, v3
|
|
; GCN-NEXT: v_or_b32_e32 v2, -5, v2
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: buffer_wbinvl1_vol
|
|
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN-NEXT: s_cbranch_execnz BB2_1
|
|
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v2
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
; Atomically store ~(old & 4) to %ptr; %result is the value read before the update.
%result = atomicrmw nand i32* %ptr, i32 4 seq_cst
|
|
ret i32 %result
|
|
}
|