Call generateWaitcnt unconditionally at the end of SIInsertWaitcnts::insertWaitcntInBlock. Even if we don't need to generate a new waitcnt instruction, the call has the effect of combining or removing redundant waitcnts that were already present. Tests show various small improvements in waitcnt placement.
467 lines
20 KiB
LLVM
467 lines
20 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1200 %s
|
|
|
|
; syncscope_system: seq_cst `atomicrmw fadd` of a float through a default
; address space pointer, with no explicit syncscope, in a function that carries
; attribute group #0. The old value is returned.
; Per the recorded output below, gfx940 emits a single flat_atomic_add_f32,
; while gfx908, gfx90a, gfx1100 and gfx1200 emit a flat_atomic_cmpswap loop
; (%atomicrmw.start / %atomicrmw.end).
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing them by hand.
define float @syncscope_system(ptr %addr, float %val) #0 {
|
|
; GFX908-LABEL: syncscope_system:
|
|
; GFX908: ; %bb.0:
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX908-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX908-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
|
|
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX908-NEXT: v_mov_b32_e32 v4, v3
|
|
; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
|
|
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX908-NEXT: buffer_wbinvl1_vol
|
|
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
|
|
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX908-NEXT: s_cbranch_execnz .LBB0_1
|
|
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX908-NEXT: v_mov_b32_e32 v0, v3
|
|
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX90A-LABEL: syncscope_system:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
|
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX940-LABEL: syncscope_system:
|
|
; GFX940: ; %bb.0:
|
|
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX940-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
|
|
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX940-NEXT: buffer_inv sc0 sc1
|
|
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1100-LABEL: syncscope_system:
|
|
; GFX1100: ; %bb.0:
|
|
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX1100-NEXT: flat_load_b32 v3, v[0:1]
|
|
; GFX1100-NEXT: s_mov_b32 s0, 0
|
|
; GFX1100-NEXT: .LBB0_1: ; %atomicrmw.start
|
|
; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX1100-NEXT: v_mov_b32_e32 v4, v3
|
|
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX1100-NEXT: v_add_f32_e32 v3, v4, v2
|
|
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
|
|
; GFX1100-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
|
|
; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX1100-NEXT: buffer_gl1_inv
|
|
; GFX1100-NEXT: buffer_gl0_inv
|
|
; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
|
|
; GFX1100-NEXT: s_or_b32 s0, vcc_lo, s0
|
|
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX1100-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
|
|
; GFX1100-NEXT: s_cbranch_execnz .LBB0_1
|
|
; GFX1100-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX1100-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
|
; GFX1100-NEXT: v_mov_b32_e32 v0, v3
|
|
; GFX1100-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1200-LABEL: syncscope_system:
|
|
; GFX1200: ; %bb.0:
|
|
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1200-NEXT: s_wait_expcnt 0x0
|
|
; GFX1200-NEXT: s_wait_samplecnt 0x0
|
|
; GFX1200-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX1200-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1200-NEXT: flat_load_b32 v3, v[0:1]
|
|
; GFX1200-NEXT: s_mov_b32 s0, 0
|
|
; GFX1200-NEXT: .LBB0_1: ; %atomicrmw.start
|
|
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1200-NEXT: v_mov_b32_e32 v4, v3
|
|
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX1200-NEXT: v_add_f32_e32 v3, v4, v2
|
|
; GFX1200-NEXT: s_wait_storecnt 0x0
|
|
; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
|
|
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1200-NEXT: global_inv scope:SCOPE_SYS
|
|
; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
|
|
; GFX1200-NEXT: s_or_b32 s0, vcc_lo, s0
|
|
; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX1200-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
|
|
; GFX1200-NEXT: s_cbranch_execnz .LBB0_1
|
|
; GFX1200-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX1200-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
|
; GFX1200-NEXT: v_mov_b32_e32 v0, v3
|
|
; GFX1200-NEXT: s_setpc_b64 s[30:31]
|
|
%res = atomicrmw fadd ptr %addr, float %val seq_cst
|
|
ret float %res
|
|
}
|
|
|
|
; syncscope_workgroup_rtn: seq_cst `atomicrmw fadd` of a float with
; syncscope("workgroup"), in a function that carries attribute group #0.
; The old value is returned.
; Per the recorded output below, gfx940, gfx1100 and gfx1200 emit a returning
; flat_atomic_add_f32. gfx908 emits a flat_atomic_cmpswap loop. gfx90a compares
; the pointer against src_shared_base / src_private_base and branches to a
; global_atomic_add_f32, a buffer load/add/store sequence, or a ds_add_rtn_f32.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing them by hand.
define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
|
|
; GFX908-LABEL: syncscope_workgroup_rtn:
|
|
; GFX908: ; %bb.0:
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX908-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX908-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
|
|
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX908-NEXT: v_mov_b32_e32 v4, v3
|
|
; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
|
|
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
|
|
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX908-NEXT: s_cbranch_execnz .LBB1_1
|
|
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX908-NEXT: v_mov_b32_e32 v0, v3
|
|
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX90A-LABEL: syncscope_workgroup_rtn:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr3
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB1_6
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr3
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB1_3
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2
|
|
; GFX90A-NEXT: .LBB1_3: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB1_5
|
|
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB1_5: ; %Flow1
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2
|
|
; GFX90A-NEXT: .LBB1_6: ; %Flow2
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB1_8
|
|
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX940-LABEL: syncscope_workgroup_rtn:
|
|
; GFX940: ; %bb.0:
|
|
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
|
|
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1100-LABEL: syncscope_workgroup_rtn:
|
|
; GFX1100: ; %bb.0:
|
|
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
|
|
; GFX1100-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc
|
|
; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX1100-NEXT: buffer_gl0_inv
|
|
; GFX1100-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1200-LABEL: syncscope_workgroup_rtn:
|
|
; GFX1200: ; %bb.0:
|
|
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1200-NEXT: s_wait_expcnt 0x0
|
|
; GFX1200-NEXT: s_wait_samplecnt 0x0
|
|
; GFX1200-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX1200-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1200-NEXT: s_wait_storecnt 0x0
|
|
; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
|
|
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1200-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX1200-NEXT: s_setpc_b64 s[30:31]
|
|
%res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
|
|
ret float %res
|
|
}
|
|
|
|
; syncscope_workgroup_nortn: seq_cst `atomicrmw fadd` of a float with
; syncscope("workgroup"), in a function that carries attribute group #0.
; %res is not used and the function returns void.
; Per the recorded output below, gfx940, gfx1100 and gfx1200 emit a
; flat_atomic_add_f32 with no destination register. gfx908 and gfx90a compare
; the pointer against src_shared_base / src_private_base and branch to a
; global_atomic_add_f32, a buffer load/add/store sequence, or a ds_add_f32.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing them by hand.
define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
|
|
; GFX908-LABEL: syncscope_workgroup_nortn:
|
|
; GFX908: ; %bb.0: ; %atomicrmw.check.shared
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
|
|
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX908-NEXT: s_cbranch_execnz .LBB2_3
|
|
; GFX908-NEXT: ; %bb.1: ; %Flow2
|
|
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX908-NEXT: s_cbranch_execnz .LBB2_8
|
|
; GFX908-NEXT: .LBB2_2: ; %atomicrmw.phi
|
|
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
|
; GFX908-NEXT: .LBB2_3: ; %atomicrmw.check.private
|
|
; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
|
|
; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
|
|
; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
|
|
; GFX908-NEXT: s_cbranch_execz .LBB2_5
|
|
; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
|
|
; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
|
|
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX908-NEXT: ; implicit-def: $vgpr2
|
|
; GFX908-NEXT: .LBB2_5: ; %Flow
|
|
; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
|
|
; GFX908-NEXT: s_cbranch_execz .LBB2_7
|
|
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
|
|
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
|
|
; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
|
; GFX908-NEXT: .LBB2_7: ; %Flow1
|
|
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX908-NEXT: ; implicit-def: $vgpr2
|
|
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX908-NEXT: s_cbranch_execz .LBB2_2
|
|
; GFX908-NEXT: .LBB2_8: ; %atomicrmw.shared
|
|
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX908-NEXT: ds_add_f32 v0, v2
|
|
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX90A-LABEL: syncscope_workgroup_nortn:
|
|
; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB2_3
|
|
; GFX90A-NEXT: ; %bb.1: ; %Flow2
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB2_8
|
|
; GFX90A-NEXT: .LBB2_2: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.check.private
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB2_5
|
|
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
|
|
; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2
|
|
; GFX90A-NEXT: .LBB2_5: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB2_7
|
|
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB2_7: ; %Flow1
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB2_2
|
|
; GFX90A-NEXT: .LBB2_8: ; %atomicrmw.shared
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: ds_add_f32 v0, v2
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX940-LABEL: syncscope_workgroup_nortn:
|
|
; GFX940: ; %bb.0:
|
|
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
|
|
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1100-LABEL: syncscope_workgroup_nortn:
|
|
; GFX1100: ; %bb.0:
|
|
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
|
|
; GFX1100-NEXT: flat_atomic_add_f32 v[0:1], v2
|
|
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
|
|
; GFX1100-NEXT: buffer_gl0_inv
|
|
; GFX1100-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1200-LABEL: syncscope_workgroup_nortn:
|
|
; GFX1200: ; %bb.0:
|
|
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1200-NEXT: s_wait_expcnt 0x0
|
|
; GFX1200-NEXT: s_wait_samplecnt 0x0
|
|
; GFX1200-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX1200-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1200-NEXT: s_wait_storecnt 0x0
|
|
; GFX1200-NEXT: flat_atomic_add_f32 v[0:1], v2
|
|
; GFX1200-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX1200-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX1200-NEXT: s_setpc_b64 s[30:31]
|
|
%res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
|
|
ret void
|
|
}
|
|
|
|
; no_unsafe: seq_cst `atomicrmw fadd` of a float with syncscope("workgroup"),
; in a function that does NOT reference attribute group #0. The old value is
; returned.
; Per the recorded output below, gfx940 still emits a returning
; flat_atomic_add_f32, while gfx908, gfx90a, gfx1100 and gfx1200 emit a
; flat_atomic_cmpswap loop (%atomicrmw.start / %atomicrmw.end).
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing them by hand.
define float @no_unsafe(ptr %addr, float %val) {
|
|
; GFX908-LABEL: no_unsafe:
|
|
; GFX908: ; %bb.0:
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX908-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX908-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
|
|
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX908-NEXT: v_mov_b32_e32 v4, v3
|
|
; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
|
|
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
|
|
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
|
|
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX908-NEXT: s_cbranch_execnz .LBB3_1
|
|
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX908-NEXT: v_mov_b32_e32 v0, v3
|
|
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX90A-LABEL: no_unsafe:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
|
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX940-LABEL: no_unsafe:
|
|
; GFX940: ; %bb.0:
|
|
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
|
|
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1100-LABEL: no_unsafe:
|
|
; GFX1100: ; %bb.0:
|
|
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX1100-NEXT: flat_load_b32 v3, v[0:1]
|
|
; GFX1100-NEXT: s_mov_b32 s0, 0
|
|
; GFX1100-NEXT: .LBB3_1: ; %atomicrmw.start
|
|
; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX1100-NEXT: v_mov_b32_e32 v4, v3
|
|
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX1100-NEXT: v_add_f32_e32 v3, v4, v2
|
|
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
|
|
; GFX1100-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
|
|
; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX1100-NEXT: buffer_gl0_inv
|
|
; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
|
|
; GFX1100-NEXT: s_or_b32 s0, vcc_lo, s0
|
|
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX1100-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
|
|
; GFX1100-NEXT: s_cbranch_execnz .LBB3_1
|
|
; GFX1100-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX1100-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
|
; GFX1100-NEXT: v_mov_b32_e32 v0, v3
|
|
; GFX1100-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1200-LABEL: no_unsafe:
|
|
; GFX1200: ; %bb.0:
|
|
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1200-NEXT: s_wait_expcnt 0x0
|
|
; GFX1200-NEXT: s_wait_samplecnt 0x0
|
|
; GFX1200-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX1200-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1200-NEXT: flat_load_b32 v3, v[0:1]
|
|
; GFX1200-NEXT: s_mov_b32 s0, 0
|
|
; GFX1200-NEXT: .LBB3_1: ; %atomicrmw.start
|
|
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1200-NEXT: v_mov_b32_e32 v4, v3
|
|
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX1200-NEXT: v_add_f32_e32 v3, v4, v2
|
|
; GFX1200-NEXT: s_wait_storecnt 0x0
|
|
; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
|
|
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1200-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
|
|
; GFX1200-NEXT: s_or_b32 s0, vcc_lo, s0
|
|
; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX1200-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
|
|
; GFX1200-NEXT: s_cbranch_execnz .LBB3_1
|
|
; GFX1200-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX1200-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
|
; GFX1200-NEXT: v_mov_b32_e32 v0, v3
|
|
; GFX1200-NEXT: s_setpc_b64 s[30:31]
|
|
%res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
|
|
ret float %res
|
|
}
|
|
|
|
; Attribute group #0 sets the string function attribute
; "amdgpu-unsafe-fp-atomics" to "true". In the visible part of this file it is
; referenced by the three syncscope_* functions; no_unsafe is defined without it.
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
|