Files
clang-p2996/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
Austin Kerbow f5199d7ae0 [AMDGPU] Revise handling of preexisting waitcnt
Preexisting waitcnt may not update the scoreboard if the instruction
being examined needed to wait on fewer counters than what was encoded in
the old waitcnt instruction. Fixing this results in the elimination of
some redudnat waitcnt.

These changes also enable combining consecutive waitcnt into a single
S_WAITCNT or S_WAITCNT_VSCNT instruction.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D100281
2021-05-05 17:21:33 -07:00

86 lines
3.3 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
define i32 @atomic_nand_i32_lds(i32 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: atomic_nand_i32_lds:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: BB0_1: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, v1
; GCN-NEXT: v_not_b32_e32 v1, v2
; GCN-NEXT: v_or_b32_e32 v1, -5, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz BB0_1
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
ret i32 %result
}
define i32 @atomic_nand_i32_global(i32 addrspace(1)* %ptr) nounwind {
; GCN-LABEL: atomic_nand_i32_global:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: global_load_dword v2, v[0:1], off
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: BB1_1: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, v2
; GCN-NEXT: v_not_b32_e32 v2, v3
; GCN-NEXT: v_or_b32_e32 v2, -5, v2
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz BB1_1
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw nand i32 addrspace(1)* %ptr, i32 4 seq_cst
ret i32 %result
}
define i32 @atomic_nand_i32_flat(i32* %ptr) nounwind {
; GCN-LABEL: atomic_nand_i32_flat:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_dword v2, v[0:1]
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: BB2_1: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, v2
; GCN-NEXT: v_not_b32_e32 v2, v3
; GCN-NEXT: v_or_b32_e32 v2, -5, v2
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz BB2_1
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw nand i32* %ptr, i32 4 seq_cst
ret i32 %result
}