; If the runtime flat address resolves to a scratch address, 64-bit atomics do
; not work correctly. Insert a runtime address space check (which is quite
; likely to be uniform) and select between the non-atomic and real atomic
; cases. Consider noalias.addrspace metadata and avoid this expansion when
; possible (we also need to consider it to avoid infinitely expanding after
; adding the predication code).
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN1 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN2 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
; Flat atomicrmw add i64 at a constant offset, no return value used. The CHECK
; lines show the expansion: a uniform compare of the high pointer bits against
; the private-aperture base selects between a real flat atomic
; (%atomicrmw.global) and a non-atomic scratch load/add/store (%atomicrmw.private).
define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_add_i64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 32
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB0_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB0_4
; GCN1-NEXT: .LBB0_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB0_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB0_2
; GCN1-NEXT: .LBB0_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 32
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB0_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB0_4
; GCN2-NEXT: .LBB0_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB0_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB0_2
; GCN2-NEXT: .LBB0_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB0_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB0_4
; GFX12-NEXT: .LBB0_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB0_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB0_2
; GFX12-NEXT: .LBB0_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
; Same expansion as @atomic_add_i64_offset, but the atomic's old value is used
; (stored to %out2), so the private path must produce the loaded value and the
; global path uses the returning (glc / TH_ATOMIC_RETURN) atomic form.
define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_add_i64_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s2, s4, 32
; GCN1-NEXT: s_addc_u32 s3, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s3, s8
; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN1-NEXT: s_cbranch_vccz .LBB1_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB1_3
; GCN1-NEXT: s_branch .LBB1_4
; GCN1-NEXT: .LBB1_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB1_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s1
; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GCN1-NEXT: s_cselect_b32 s2, s2, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v5, vcc, s0, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB1_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s2, s4, 32
; GCN2-NEXT: s_addc_u32 s3, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s3, s8
; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN2-NEXT: s_cbranch_vccz .LBB1_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB1_3
; GCN2-NEXT: s_branch .LBB1_4
; GCN2-NEXT: .LBB1_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB1_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
; GCN2-NEXT: s_cselect_b32 s2, s2, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s1
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v5, vcc, s0, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB1_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s3, s9
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB1_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB1_3
; GFX12-NEXT: s_branch .LBB1_4
; GFX12-NEXT: .LBB1_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB1_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12-NEXT: s_cselect_b32 s2, s2, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB1_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
; Flat atomicrmw add i64 with a dynamic (addr64) index plus a constant offset;
; the address-space predication is applied after the full address computation.
define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_add_i64_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB2_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB2_4
; GCN1-NEXT: .LBB2_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB2_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB2_2
; GCN1-NEXT: .LBB2_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB2_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB2_4
; GCN2-NEXT: .LBB2_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB2_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB2_2
; GCN2-NEXT: .LBB2_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB2_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB2_4
; GFX12-NEXT: .LBB2_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB2_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB2_2
; GFX12-NEXT: .LBB2_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
; Dynamic-index variant whose result is stored to %out2: combines the addr64
; address computation with the returning atomic / private scratch expansion.
define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_add_i64_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB3_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB3_3
; GCN1-NEXT: s_branch .LBB3_4
; GCN1-NEXT: .LBB3_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB3_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s9
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v5, vcc, s8, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB3_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB3_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB3_3
; GCN2-NEXT: s_branch .LBB3_4
; GCN2-NEXT: .LBB3_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB3_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s9
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v5, vcc, s8, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB3_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB3_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB3_3
; GFX12-NEXT: s_branch .LBB3_4
; GFX12-NEXT: .LBB3_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB3_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB3_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
; Baseline flat atomicrmw add i64 with no offset and unused result; same
; global-vs-private predicated expansion as the offset variants above.
define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_add_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s0
; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_mov_b64 s[0:1], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB4_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_cbranch_vccz .LBB4_4
; GCN1-NEXT: .LBB4_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB4_2
; GCN1-NEXT: .LBB4_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GCN1-NEXT: s_cselect_b32 s0, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s0
; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB4_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_cbranch_vccz .LBB4_4
; GCN2-NEXT: .LBB4_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB4_2
; GCN2-NEXT: .LBB4_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s0, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB4_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB4_4
; GFX12-NEXT: .LBB4_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB4_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB4_2
; GFX12-NEXT: .LBB4_4: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst
  ret void
}
; Flat 64-bit atomicrmw add that uses the returned (old) value: codegen must
; predicate on a runtime private-aperture check (s_cmp_eq_u32 of the pointer's
; high dword vs src_private_base on GFX12, or vs a value loaded through s[2:3]
; on GCN1/GCN2 — presumably an implicit kernarg; TODO confirm) and take a
; non-atomic scratch/buffer load-add-store path in %atomicrmw.private, with
; both paths merging the loaded value before the flat store to %out2.
define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_add_i64_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s8
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB5_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB5_3
; GCN1-NEXT: s_branch .LBB5_4
; GCN1-NEXT: .LBB5_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB5_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s1
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s2, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v5, vcc, s0, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB5_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s8
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB5_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB5_3
; GCN2-NEXT: s_branch .LBB5_4
; GCN2-NEXT: .LBB5_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB5_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s2, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s1
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v5, vcc, s0, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB5_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s5, s3
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX12-NEXT: s_cbranch_vccz .LBB5_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB5_3
; GFX12-NEXT: s_branch .LBB5_4
; GFX12-NEXT: .LBB5_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB5_3: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX12-NEXT: s_cselect_b32 s2, s4, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB5_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; As atomic_add_i64, but the flat pointer is first indexed (%out + %index*8 via
; s_lshl_b64/s_add_u32..s_addc_u32). The predicated expansion then compares the
; high dword of the *computed* address against the private aperture and routes
; to either the real flat_atomic_add_x2 or the non-atomic scratch path.
define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_add_i64_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB6_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB6_4
; GCN1-NEXT: .LBB6_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB6_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB6_2
; GCN1-NEXT: .LBB6_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB6_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB6_4
; GCN2-NEXT: .LBB6_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB6_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB6_2
; GCN2-NEXT: .LBB6_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB6_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB6_4
; GFX12-NEXT: .LBB6_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB6_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB6_2
; GFX12-NEXT: .LBB6_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Indexed flat 64-bit atomicrmw add whose result is used: combines the
; addr64-style pointer arithmetic with the returning expansion. Both the
; flat_atomic path and the non-atomic %atomicrmw.private path must produce the
; old value in v[0:1] for the %atomicrmw.end store to %out2.
define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_add_i64_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB7_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB7_3
; GCN1-NEXT: s_branch .LBB7_4
; GCN1-NEXT: .LBB7_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB7_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s9
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v5, vcc, s8, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB7_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB7_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB7_3
; GCN2-NEXT: s_branch .LBB7_4
; GCN2-NEXT: .LBB7_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB7_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s9
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v5, vcc, s8, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB7_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_cmp_eq_u32 s1, s9
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB7_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB7_3
; GFX12-NEXT: s_branch .LBB7_4
; GFX12-NEXT: .LBB7_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB7_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB7_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Flat 64-bit atomicrmw and at a constant byte offset (gep by 4 x i64 = +32,
; visible as s_add_u32 s0, s4, 32 / s_add_nc_u64 ..., 32). The offset address
; is checked against the private aperture at runtime; the private path does a
; plain scratch/buffer load, v_and of both dwords, and store.
define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_and_i64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 32
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB8_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB8_4
; GCN1-NEXT: .LBB8_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB8_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB8_2
; GCN1-NEXT: .LBB8_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_and_b32_e32 v2, s6, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v3, s7, v3
; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 32
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB8_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB8_4
; GCN2-NEXT: .LBB8_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB8_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB8_2
; GCN2-NEXT: .LBB8_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_and_b32_e32 v2, s6, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v3, s7, v3
; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB8_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB8_4
; GFX12-NEXT: .LBB8_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB8_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB8_2
; GFX12-NEXT: .LBB8_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v1, s3, v1
; GFX12-NEXT: v_and_b32_e32 v0, s2, v0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Returning variant of atomic_and_i64_offset: the old value loaded on either
; path (flat_atomic_and with glc/TH_ATOMIC_RETURN, or the non-atomic scratch
; load on the %atomicrmw.private path) flows through %atomicrmw.end into the
; flat store to %out2.
define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_and_i64_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s2, s4, 32
; GCN1-NEXT: s_addc_u32 s3, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s3, s8
; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN1-NEXT: s_cbranch_vccz .LBB9_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB9_3
; GCN1-NEXT: s_branch .LBB9_4
; GCN1-NEXT: .LBB9_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB9_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GCN1-NEXT: s_cselect_b32 s2, s2, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_and_b32_e32 v4, s0, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v5, s1, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB9_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s2, s4, 32
; GCN2-NEXT: s_addc_u32 s3, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s3, s8
; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN2-NEXT: s_cbranch_vccz .LBB9_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB9_3
; GCN2-NEXT: s_branch .LBB9_4
; GCN2-NEXT: .LBB9_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB9_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
; GCN2-NEXT: s_cselect_b32 s2, s2, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_and_b32_e32 v4, s0, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v5, s1, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB9_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s3, s9
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB9_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB9_3
; GFX12-NEXT: s_branch .LBB9_4
; GFX12-NEXT: .LBB9_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB9_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12-NEXT: s_cselect_b32 s2, s2, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v3, s1, v1
; GFX12-NEXT: v_and_b32_e32 v2, s0, v0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB9_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; 64-bit flat atomicrmw 'and' on %out[%index] plus a 32-byte constant offset,
; no return value used. The expansion compares the address high bits against
; the private-aperture base (src_private_base on GFX12) and branches to either
; the real global atomic (%atomicrmw.global) or a non-atomic scratch
; load/and/store sequence (%atomicrmw.private).
define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_and_i64_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB10_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB10_4
; GCN1-NEXT: .LBB10_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB10_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB10_2
; GCN1-NEXT: .LBB10_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_and_b32_e32 v2, s6, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v3, s7, v3
; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB10_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB10_4
; GCN2-NEXT: .LBB10_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB10_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB10_2
; GCN2-NEXT: .LBB10_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_and_b32_e32 v2, s6, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v3, s7, v3
; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB10_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB10_4
; GFX12-NEXT: .LBB10_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB10_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB10_2
; GFX12-NEXT: .LBB10_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v1, s3, v1
; GFX12-NEXT: v_and_b32_e32 v0, s2, v0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
; Same as atomic_and_i64_addr64_offset but the atomic result is used
; (stored to %out2), so the expansion's private path must produce the loaded
; value and both paths join at %atomicrmw.end before the flat store.
define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_and_i64_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB11_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB11_3
; GCN1-NEXT: s_branch .LBB11_4
; GCN1-NEXT: .LBB11_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB11_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_and_b32_e32 v4, s8, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v5, s9, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB11_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB11_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB11_3
; GCN2-NEXT: s_branch .LBB11_4
; GCN2-NEXT: .LBB11_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB11_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_and_b32_e32 v4, s8, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v5, s9, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB11_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB11_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB11_3
; GFX12-NEXT: s_branch .LBB11_4
; GFX12-NEXT: .LBB11_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB11_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v3, s5, v1
; GFX12-NEXT: v_and_b32_e32 v2, s4, v0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB11_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
; 64-bit flat atomicrmw 'and' directly on %out, result unused. Checks the
; aperture compare + global/private selection without any address arithmetic.
define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_and_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s0
; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_mov_b64 s[0:1], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB12_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_cbranch_vccz .LBB12_4
; GCN1-NEXT: .LBB12_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB12_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB12_2
; GCN1-NEXT: .LBB12_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GCN1-NEXT: s_cselect_b32 s0, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_and_b32_e32 v2, s6, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v3, s7, v3
; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s0
; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB12_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_cbranch_vccz .LBB12_4
; GCN2-NEXT: .LBB12_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB12_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB12_2
; GCN2-NEXT: .LBB12_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s0, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_and_b32_e32 v2, s6, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v3, s7, v3
; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB12_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB12_4
; GFX12-NEXT: .LBB12_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB12_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB12_2
; GFX12-NEXT: .LBB12_4: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v1, s3, v1
; GFX12-NEXT: v_and_b32_e32 v0, s2, v0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst
  ret void
}
; 64-bit flat atomicrmw 'and' on %out with the old value stored to %out2;
; both the global-atomic and scratch read-modify-write paths feed the final
; flat store at %atomicrmw.end.
define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_and_i64_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s8
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB13_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB13_3
; GCN1-NEXT: s_branch .LBB13_4
; GCN1-NEXT: .LBB13_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB13_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s2, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_and_b32_e32 v4, s0, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v5, s1, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB13_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s8
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB13_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB13_3
; GCN2-NEXT: s_branch .LBB13_4
; GCN2-NEXT: .LBB13_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB13_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s2, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_and_b32_e32 v4, s0, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v5, s1, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB13_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s5, s3
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX12-NEXT: s_cbranch_vccz .LBB13_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB13_3
; GFX12-NEXT: s_branch .LBB13_4
; GFX12-NEXT: .LBB13_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB13_3: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX12-NEXT: s_cselect_b32 s2, s4, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v3, s1, v1
; GFX12-NEXT: v_and_b32_e32 v2, s0, v0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB13_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
; 64-bit flat atomicrmw 'and' on %out[%index] (no extra constant offset),
; result unused; same global/private predication as the offset variants.
define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_and_i64_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB14_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB14_4
; GCN1-NEXT: .LBB14_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB14_2
; GCN1-NEXT: .LBB14_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_and_b32_e32 v2, s6, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v3, s7, v3
; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB14_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB14_4
; GCN2-NEXT: .LBB14_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB14_2
; GCN2-NEXT: .LBB14_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_and_b32_e32 v2, s6, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v3, s7, v3
; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB14_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB14_4
; GFX12-NEXT: .LBB14_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB14_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB14_2
; GFX12-NEXT: .LBB14_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v1, s3, v1
; GFX12-NEXT: v_and_b32_e32 v0, s2, v0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst
  ret void
}
; 64-bit flat atomicrmw 'and' with used result, pointer indexed by %index.
; The expansion compares the address's high dword against the private-aperture
; base at runtime and branches: %atomicrmw.global runs the real hardware flat
; atomic; %atomicrmw.private does a non-atomic scratch load/and/store (64-bit
; atomics do not work on scratch). CHECK lines are autogenerated by
; update_llc_test_checks.py — regenerate rather than hand-edit.
define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_and_i64_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB15_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB15_3
; GCN1-NEXT: s_branch .LBB15_4
; GCN1-NEXT: .LBB15_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB15_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_and_b32_e32 v4, s8, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v5, s9, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB15_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB15_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB15_3
; GCN2-NEXT: s_branch .LBB15_4
; GCN2-NEXT: .LBB15_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB15_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_and_b32_e32 v4, s8, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v5, s9, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB15_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_cmp_eq_u32 s1, s9
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB15_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB15_3
; GFX12-NEXT: s_branch .LBB15_4
; GFX12-NEXT: .LBB15_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB15_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v3, s5, v1
; GFX12-NEXT: v_and_b32_e32 v2, s4, v0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB15_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; 64-bit flat atomicrmw 'sub', result unused, constant 32-byte offset.
; With no used result the expansion uses a Flow/phi diamond: the runtime
; aperture check picks %atomicrmw.global (hardware flat atomic) or
; %atomicrmw.private (non-atomic scratch load/sub/store). CHECK lines are
; autogenerated by update_llc_test_checks.py — regenerate, don't hand-edit.
define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_sub_i64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 32
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB16_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB16_4
; GCN1-NEXT: .LBB16_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB16_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB16_2
; GCN1-NEXT: .LBB16_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 32
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB16_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB16_4
; GCN2-NEXT: .LBB16_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB16_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB16_2
; GCN2-NEXT: .LBB16_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB16_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB16_4
; GFX12-NEXT: .LBB16_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB16_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB16_2
; GFX12-NEXT: .LBB16_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; 64-bit flat atomicrmw 'sub' with used result (stored to %out2), constant
; 32-byte offset. The runtime aperture check selects between the hardware
; flat atomic (%atomicrmw.global) and the non-atomic scratch fallback
; (%atomicrmw.private); both paths merge at %atomicrmw.end to store the old
; value. CHECK lines are autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_sub_i64_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s2, s4, 32
; GCN1-NEXT: s_addc_u32 s3, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s3, s8
; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN1-NEXT: s_cbranch_vccz .LBB17_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB17_3
; GCN1-NEXT: s_branch .LBB17_4
; GCN1-NEXT: .LBB17_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB17_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s1
; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GCN1-NEXT: s_cselect_b32 s2, s2, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s0, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB17_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s2, s4, 32
; GCN2-NEXT: s_addc_u32 s3, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s3, s8
; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN2-NEXT: s_cbranch_vccz .LBB17_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB17_3
; GCN2-NEXT: s_branch .LBB17_4
; GCN2-NEXT: .LBB17_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB17_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
; GCN2-NEXT: s_cselect_b32 s2, s2, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s1
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s0, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB17_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s3, s9
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB17_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB17_3
; GFX12-NEXT: s_branch .LBB17_4
; GFX12-NEXT: .LBB17_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB17_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12-NEXT: s_cselect_b32 s2, s2, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB17_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; 64-bit flat atomicrmw 'sub', result unused, dynamically indexed pointer plus
; a constant 32-byte offset. Unused-result form uses the Flow/phi diamond:
; runtime aperture check -> hardware flat atomic (%atomicrmw.global) or
; non-atomic scratch load/sub/store (%atomicrmw.private). CHECK lines are
; autogenerated by update_llc_test_checks.py — regenerate, don't hand-edit.
define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_sub_i64_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB18_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB18_4
; GCN1-NEXT: .LBB18_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB18_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB18_2
; GCN1-NEXT: .LBB18_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB18_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB18_4
; GCN2-NEXT: .LBB18_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB18_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB18_2
; GCN2-NEXT: .LBB18_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB18_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB18_4
; GFX12-NEXT: .LBB18_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB18_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB18_2
; GFX12-NEXT: .LBB18_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; 64-bit flat atomicrmw 'sub' with used result, dynamically indexed pointer
; plus a constant 32-byte offset. Runtime aperture check selects hardware
; flat atomic (%atomicrmw.global) vs. non-atomic scratch load/sub/store
; (%atomicrmw.private); the merged old value is stored to %out2 at
; %atomicrmw.end. CHECK lines are autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_sub_i64_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB19_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB19_3
; GCN1-NEXT: s_branch .LBB19_4
; GCN1-NEXT: .LBB19_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB19_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s9
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s8, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB19_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB19_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB19_3
; GCN2-NEXT: s_branch .LBB19_4
; GCN2-NEXT: .LBB19_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB19_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s9
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB19_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB19_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB19_3
; GFX12-NEXT: s_branch .LBB19_4
; GFX12-NEXT: .LBB19_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB19_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB19_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; 64-bit flat atomicrmw sub (no offset): expansion checks the pointer's high
; dword against src_private_base / the scratch aperture and selects between the
; real flat atomic (%atomicrmw.global) and a non-atomic scratch load/sub/store
; (%atomicrmw.private), since flat atomics are broken on scratch addresses.
define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_sub_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s0
; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_mov_b64 s[0:1], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB20_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_cbranch_vccz .LBB20_4
; GCN1-NEXT: .LBB20_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB20_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB20_2
; GCN1-NEXT: .LBB20_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GCN1-NEXT: s_cselect_b32 s0, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s0
; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB20_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_cbranch_vccz .LBB20_4
; GCN2-NEXT: .LBB20_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB20_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB20_2
; GCN2-NEXT: .LBB20_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s0, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB20_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB20_4
; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB20_2
; GFX12-NEXT: .LBB20_4: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst
  ret void
}
; Same sub expansion as @atomic_sub_i64 but with the returned old value stored
; to %out2, so the private path must produce the loaded value in v[0:1] and
; the global path uses the returning (glc / TH_ATOMIC_RETURN) atomic form.
define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_sub_i64_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s8
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB21_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB21_3
; GCN1-NEXT: s_branch .LBB21_4
; GCN1-NEXT: .LBB21_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB21_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s1
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s2, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s0, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB21_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s8
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB21_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB21_3
; GCN2-NEXT: s_branch .LBB21_4
; GCN2-NEXT: .LBB21_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB21_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s2, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s1
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s0, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB21_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s5, s3
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX12-NEXT: s_cbranch_vccz .LBB21_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB21_3
; GFX12-NEXT: s_branch .LBB21_4
; GFX12-NEXT: .LBB21_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB21_3: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX12-NEXT: s_cselect_b32 s2, s4, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB21_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
; Sub expansion with a register-indexed address: the GEP offset is materialized
; (s_lshl_b64 + add) before the aperture compare, so the address-space check is
; applied to the final flat address, not the base pointer.
define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_sub_i64_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB22_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB22_4
; GCN1-NEXT: .LBB22_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB22_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB22_2
; GCN1-NEXT: .LBB22_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB22_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB22_4
; GCN2-NEXT: .LBB22_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB22_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB22_2
; GCN2-NEXT: .LBB22_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB22_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB22_4
; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB22_2
; GFX12-NEXT: .LBB22_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst
  ret void
}
; Register-indexed sub with returned old value: combines the addr64 address
; computation with the returning-atomic / private-load-store diamond and the
; final flat store of the old value to %out2.
define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_sub_i64_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB23_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB23_3
; GCN1-NEXT: s_branch .LBB23_4
; GCN1-NEXT: .LBB23_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB23_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s9
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s8, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB23_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB23_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB23_3
; GCN2-NEXT: s_branch .LBB23_4
; GCN2-NEXT: .LBB23_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB23_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s9
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB23_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_cmp_eq_u32 s1, s9
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB23_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB23_3
; GFX12-NEXT: s_branch .LBB23_4
; GFX12-NEXT: .LBB23_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB23_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB23_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
; Signed-max variant at workgroup scope with a constant +32 byte offset: the
; private path open-codes max as v_cmp_lt_i64 + cndmask, and the global path
; uses flat_atomic_smax_x2 / flat_atomic_max_i64 (SCOPE_SE, no buffer
; invalidate on GCN1/2 because the scope is only the workgroup).
define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_max_i64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 32
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB24_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB24_4
; GCN1-NEXT: .LBB24_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execnz .LBB24_2
; GCN1-NEXT: .LBB24_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 32
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB24_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB24_4
; GCN2-NEXT: .LBB24_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execnz .LBB24_2
; GCN2-NEXT: .LBB24_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB24_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB24_4
; GFX12-NEXT: .LBB24_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB24_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execnz .LBB24_2
; GFX12-NEXT: .LBB24_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
  ret void
}
define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
|
|
; GCN1-LABEL: atomic_max_i64_ret_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s2, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s3, s5, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s3, s8
|
|
; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB25_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cbranch_execz .LBB25_3
|
|
; GCN1-NEXT: s_branch .LBB25_4
|
|
; GCN1-NEXT: .LBB25_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB25_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s0
|
|
; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
|
|
; GCN1-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB25_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_max_i64_ret_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s2, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s3, s5, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s3, s8
|
|
; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB25_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cbranch_execz .LBB25_3
|
|
; GCN2-NEXT: s_branch .LBB25_4
|
|
; GCN2-NEXT: .LBB25_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB25_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
|
|
; GCN2-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB25_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_max_i64_ret_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
|
|
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_cmp_eq_u32 s3, s9
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB25_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
|
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
|
|
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT: s_cbranch_execz .LBB25_3
|
|
; GFX12-NEXT: s_branch .LBB25_4
|
|
; GFX12-NEXT: .LBB25_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB25_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
|
|
; GFX12-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1]
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
|
|
; GFX12-NEXT: .LBB25_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; Checks autogenerated by update_llc_test_checks.py -- do not hand-edit the CHECK lines.
; Tests the runtime address-space predication for a flat 64-bit atomicrmw max (no
; noalias.addrspace info): the pointer's high half is compared against the private
; aperture (src_private_base / queue-scratch base), branching to a real
; flat_atomic_smax_x2 for the global case and a non-atomic scratch
; load / compare-select / store sequence for the private case.
; GCN1-LABEL: atomic_max_i64_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB26_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB26_4
; GCN1-NEXT: .LBB26_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB26_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execnz .LBB26_2
; GCN1-NEXT: .LBB26_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB26_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB26_4
; GCN2-NEXT: .LBB26_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB26_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execnz .LBB26_2
; GCN2-NEXT: .LBB26_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB26_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB26_4
; GFX12-NEXT: .LBB26_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB26_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execnz .LBB26_2
; GFX12-NEXT: .LBB26_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; Checks autogenerated by update_llc_test_checks.py -- do not hand-edit the CHECK lines.
; Value-returning variant: the same runtime private-aperture check predicates the
; expansion; the result value is merged at %atomicrmw.end (note the implicit-def of
; v[0:1] on the non-global path) and stored through %out2 with flat_store.
; GCN1-LABEL: atomic_max_i64_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB27_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execz .LBB27_3
; GCN1-NEXT: s_branch .LBB27_4
; GCN1-NEXT: .LBB27_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB27_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s8
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s9
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB27_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB27_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execz .LBB27_3
; GCN2-NEXT: s_branch .LBB27_4
; GCN2-NEXT: .LBB27_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB27_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: v_mov_b32_e32 v4, s9
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB27_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB27_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execz .LBB27_3
; GFX12-NEXT: s_branch .LBB27_4
; GFX12-NEXT: .LBB27_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB27_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB27_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
store i64 %tmp0, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
; Checks autogenerated by update_llc_test_checks.py -- do not hand-edit the CHECK lines.
; Base (no offset/index) case: the flat pointer's high dword is compared with the
; private-aperture base; the global path issues flat_atomic_smax_x2 while the
; private path emulates the 64-bit max non-atomically through scratch.
; GCN1-LABEL: atomic_max_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s0
; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_mov_b64 s[0:1], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB28_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_cbranch_vccz .LBB28_4
; GCN1-NEXT: .LBB28_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB28_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execnz .LBB28_2
; GCN1-NEXT: .LBB28_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GCN1-NEXT: s_cselect_b32 s0, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s0
; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB28_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_cbranch_vccz .LBB28_4
; GCN2-NEXT: .LBB28_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB28_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execnz .LBB28_2
; GCN2-NEXT: .LBB28_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s0, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB28_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB28_4
; GFX12-NEXT: .LBB28_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB28_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execnz .LBB28_2
; GFX12-NEXT: .LBB28_4: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; Checks autogenerated by update_llc_test_checks.py -- do not hand-edit the CHECK lines.
; Value-returning base case: private-aperture compare predicates a real
; flat_atomic_smax_x2 (glc / TH_ATOMIC_RETURN) against a non-atomic scratch
; load / select / store; both paths join at %atomicrmw.end, which stores
; the loaded old value to %out2.
; GCN1-LABEL: atomic_max_i64_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s8
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB29_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execz .LBB29_3
; GCN1-NEXT: s_branch .LBB29_4
; GCN1-NEXT: .LBB29_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB29_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s2, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s1
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB29_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s8
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB29_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execz .LBB29_3
; GCN2-NEXT: s_branch .LBB29_4
; GCN2-NEXT: .LBB29_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB29_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s2, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s0
; GCN2-NEXT: v_mov_b32_e32 v4, s1
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB29_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s5, s3
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX12-NEXT: s_cbranch_vccz .LBB29_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execz .LBB29_3
; GFX12-NEXT: s_branch .LBB29_4
; GFX12-NEXT: .LBB29_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB29_3: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX12-NEXT: s_cselect_b32 s2, s4, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB29_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst
store i64 %tmp0, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_max_i64_addr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN1-NEXT: s_cbranch_vccnz .LBB30_3
|
|
; GCN1-NEXT: ; %bb.1: ; %Flow
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB30_4
|
|
; GCN1-NEXT: .LBB30_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT: s_endpgm
|
|
; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB30_2
|
|
; GCN1-NEXT: .LBB30_4: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_max_i64_addr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN2-NEXT: s_cbranch_vccnz .LBB30_3
|
|
; GCN2-NEXT: ; %bb.1: ; %Flow
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB30_4
|
|
; GCN2-NEXT: .LBB30_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT: s_endpgm
|
|
; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB30_2
|
|
; GCN2-NEXT: .LBB30_4: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_max_i64_addr64:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_mov_b32 s4, -1
|
|
; GFX12-NEXT: s_cbranch_vccnz .LBB30_3
|
|
; GFX12-NEXT: ; %bb.1: ; %Flow
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB30_4
|
|
; GFX12-NEXT: .LBB30_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT: s_endpgm
|
|
; GFX12-NEXT: .LBB30_3: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE
|
|
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT: s_cbranch_execnz .LBB30_2
|
|
; GFX12-NEXT: .LBB30_4: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
|
|
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; Checks the expansion of a value-returning flat atomicrmw max i64 at
; %out + %index. Because 64-bit atomics do not work on scratch addresses,
; codegen inserts a runtime address-space check: the high pointer dword is
; compared against the private-aperture base (src_private_base on GFX12,
; a loaded aperture value on GCN1/GCN2) and control branches to either the
; real flat atomic (%atomicrmw.global, flat_atomic_smax_x2 ... glc /
; flat_atomic_max_i64 th:TH_ATOMIC_RETURN) or a non-atomic
; load/compare/select/store sequence on the scratch address
; (%atomicrmw.private, via buffer_* on GCN1/GCN2 and scratch_* on GFX12).
; The loaded old value is then stored to %out2 in %atomicrmw.end.
|
|
; GCN1-LABEL: atomic_max_i64_ret_addr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB31_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cbranch_execz .LBB31_3
|
|
; GCN1-NEXT: s_branch .LBB31_4
|
|
; GCN1-NEXT: .LBB31_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB31_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB31_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_max_i64_ret_addr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB31_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cbranch_execz .LBB31_3
|
|
; GCN2-NEXT: s_branch .LBB31_4
|
|
; GCN2-NEXT: .LBB31_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB31_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB31_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_max_i64_ret_addr64:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s9
|
|
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB31_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
|
|
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT: s_cbranch_execz .LBB31_3
|
|
; GFX12-NEXT: s_branch .LBB31_4
|
|
; GFX12-NEXT: .LBB31_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB31_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1]
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
|
|
; GFX12-NEXT: .LBB31_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
; Checks the expansion of a no-return flat atomicrmw umax i64 at a constant
; 32-byte offset from %out. Since 64-bit atomics do not work on scratch
; addresses, the high pointer dword is compared at runtime against the
; private-aperture base; the no-return form produces a Flow/atomicrmw.phi
; diamond that selects between the real flat atomic (%atomicrmw.global,
; flat_atomic_umax_x2 / flat_atomic_max_u64) and a non-atomic unsigned-max
; load/compare/select/store on the scratch address (%atomicrmw.private).
|
|
; GCN1-LABEL: atomic_umax_i64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s0, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN1-NEXT: s_cbranch_vccnz .LBB32_3
|
|
; GCN1-NEXT: ; %bb.1: ; %Flow
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB32_4
|
|
; GCN1-NEXT: .LBB32_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT: s_endpgm
|
|
; GCN1-NEXT: .LBB32_3: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB32_2
|
|
; GCN1-NEXT: .LBB32_4: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umax_i64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s0, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN2-NEXT: s_cbranch_vccnz .LBB32_3
|
|
; GCN2-NEXT: ; %bb.1: ; %Flow
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB32_4
|
|
; GCN2-NEXT: .LBB32_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT: s_endpgm
|
|
; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB32_2
|
|
; GCN2-NEXT: .LBB32_4: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_umax_i64_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_mov_b32 s4, -1
|
|
; GFX12-NEXT: s_cbranch_vccnz .LBB32_3
|
|
; GFX12-NEXT: ; %bb.1: ; %Flow
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB32_4
|
|
; GFX12-NEXT: .LBB32_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT: s_endpgm
|
|
; GFX12-NEXT: .LBB32_3: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE
|
|
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT: s_cbranch_execnz .LBB32_2
|
|
; GFX12-NEXT: .LBB32_4: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
|
|
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; Checks the expansion of a value-returning flat atomicrmw umax i64 at a
; constant 32-byte offset. The runtime address-space check (pointer high
; dword vs. the private-aperture base) selects between the real flat atomic
; with returned result (%atomicrmw.global, ... glc / th:TH_ATOMIC_RETURN)
; and the non-atomic unsigned-max load/compare/select/store on the scratch
; address (%atomicrmw.private); the old value reaches %atomicrmw.end in
; v[0:1] either way and is stored to %out2.
|
|
; GCN1-LABEL: atomic_umax_i64_ret_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s2, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s3, s5, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s3, s8
|
|
; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB33_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cbranch_execz .LBB33_3
|
|
; GCN1-NEXT: s_branch .LBB33_4
|
|
; GCN1-NEXT: .LBB33_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB33_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s0
|
|
; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
|
|
; GCN1-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB33_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umax_i64_ret_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s2, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s3, s5, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s3, s8
|
|
; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB33_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cbranch_execz .LBB33_3
|
|
; GCN2-NEXT: s_branch .LBB33_4
|
|
; GCN2-NEXT: .LBB33_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB33_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
|
|
; GCN2-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB33_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_umax_i64_ret_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
|
|
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_cmp_eq_u32 s3, s9
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB33_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
|
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
|
|
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT: s_cbranch_execz .LBB33_3
|
|
; GFX12-NEXT: s_branch .LBB33_4
|
|
; GFX12-NEXT: .LBB33_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB33_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
|
|
; GFX12-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
|
|
; GFX12-NEXT: .LBB33_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; Checks the expansion of a no-return flat atomicrmw umax i64 at
; %out + %index plus a constant 32-byte offset. As in the other tests,
; a runtime compare of the pointer high dword against the private-aperture
; base guards the predication diamond (Flow / atomicrmw.phi): the real flat
; atomic runs in %atomicrmw.global (flat_atomic_umax_x2 /
; flat_atomic_max_u64), while %atomicrmw.private performs the equivalent
; non-atomic unsigned-max read-modify-write on the scratch address
; (buffer_* on GCN1/GCN2, scratch_* on GFX12), since 64-bit atomics do not
; work on scratch.
|
|
; GCN1-LABEL: atomic_umax_i64_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN1-NEXT: s_cbranch_vccnz .LBB34_3
|
|
; GCN1-NEXT: ; %bb.1: ; %Flow
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB34_4
|
|
; GCN1-NEXT: .LBB34_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT: s_endpgm
|
|
; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB34_2
|
|
; GCN1-NEXT: .LBB34_4: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umax_i64_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN2-NEXT: s_cbranch_vccnz .LBB34_3
|
|
; GCN2-NEXT: ; %bb.1: ; %Flow
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB34_4
|
|
; GCN2-NEXT: .LBB34_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT: s_endpgm
|
|
; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB34_2
|
|
; GCN2-NEXT: .LBB34_4: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_umax_i64_addr64_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
|
|
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_mov_b32 s4, -1
|
|
; GFX12-NEXT: s_cbranch_vccnz .LBB34_3
|
|
; GFX12-NEXT: ; %bb.1: ; %Flow
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB34_4
|
|
; GFX12-NEXT: .LBB34_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT: s_endpgm
|
|
; GFX12-NEXT: .LBB34_3: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE
|
|
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT: s_cbranch_execnz .LBB34_2
|
|
; GFX12-NEXT: .LBB34_4: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
|
|
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB35_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cbranch_execz .LBB35_3
|
|
; GCN1-NEXT: s_branch .LBB35_4
|
|
; GCN1-NEXT: .LBB35_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB35_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB35_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB35_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cbranch_execz .LBB35_3
|
|
; GCN2-NEXT: s_branch .LBB35_4
|
|
; GCN2-NEXT: .LBB35_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB35_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB35_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
|
|
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
|
|
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB35_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
|
|
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT: s_cbranch_execz .LBB35_3
|
|
; GFX12-NEXT: s_branch .LBB35_4
|
|
; GFX12-NEXT: .LBB35_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB35_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
|
|
; GFX12-NEXT: .LBB35_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
; Flat-pointer 64-bit atomic umax, no return value used. The checks verify the
; runtime address-space predication: the pointer's high dword is compared
; against the private aperture base (GFX12 reads src_private_base; GCN1/GCN2
; load a value from the implicit kernargs — presumably the same aperture base,
; TODO confirm). Non-scratch pointers take %atomicrmw.global and use the real
; flat_atomic_umax_x2 / flat_atomic_max_u64; scratch pointers take
; %atomicrmw.private, which emulates the umax non-atomically with a scratch
; load, v_cmp_lt_u64 + v_cndmask select, and a scratch store, because 64-bit
; atomics do not work on scratch addresses.
define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_umax_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s0
; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_mov_b64 s[0:1], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB36_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_cbranch_vccz .LBB36_4
; GCN1-NEXT: .LBB36_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB36_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execnz .LBB36_2
; GCN1-NEXT: .LBB36_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GCN1-NEXT: s_cselect_b32 s0, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s0
; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB36_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_cbranch_vccz .LBB36_4
; GCN2-NEXT: .LBB36_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB36_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execnz .LBB36_2
; GCN2-NEXT: .LBB36_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s0, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB36_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB36_4
; GFX12-NEXT: .LBB36_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB36_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execnz .LBB36_2
; GFX12-NEXT: .LBB36_4: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst
ret void
}
|
|
|
|
; Flat-pointer 64-bit atomic umax whose result is stored to %out2. Both sides
; of the runtime scratch-aperture check must produce the old value in v[0:1]:
; the global path uses the returning form (glc / TH_ATOMIC_RETURN) of the flat
; atomic, the private path reads the old value with a scratch load before the
; non-atomic v_cmp_lt_u64 + v_cndmask umax emulation, and the paths merge at
; %atomicrmw.end where the value is flat-stored to %out2 (implicit-def covers
; the fall-through where the global path was skipped).
define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_umax_i64_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s8
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB37_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execz .LBB37_3
; GCN1-NEXT: s_branch .LBB37_4
; GCN1-NEXT: .LBB37_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB37_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s2, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s1
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB37_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s8
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB37_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execz .LBB37_3
; GCN2-NEXT: s_branch .LBB37_4
; GCN2-NEXT: .LBB37_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB37_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s2, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s0
; GCN2-NEXT: v_mov_b32_e32 v4, s1
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB37_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s5, s3
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX12-NEXT: s_cbranch_vccz .LBB37_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execz .LBB37_3
; GFX12-NEXT: s_branch .LBB37_4
; GFX12-NEXT: .LBB37_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB37_3: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX12-NEXT: s_cselect_b32 s2, s4, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB37_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst
store i64 %tmp0, ptr %out2
ret void
}
|
|
|
|
; Flat-pointer 64-bit atomic umax on a GEP-indexed address (%out + %index*8),
; result unused. The pointer arithmetic (s_lshl_b64 by 3 + 64-bit add) happens
; before the runtime scratch-aperture check, which compares the computed high
; dword against src_private_base on GFX12 (GCN1/GCN2 load the compare value
; from the implicit kernargs — presumably the private aperture base, TODO
; confirm). Global path: flat_atomic_umax_x2 / flat_atomic_max_u64; private
; path: non-atomic scratch load + v_cmp_lt_u64/v_cndmask umax + store, since
; 64-bit atomics do not work on scratch addresses.
define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i64_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB38_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB38_4
; GCN1-NEXT: .LBB38_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB38_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execnz .LBB38_2
; GCN1-NEXT: .LBB38_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB38_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB38_4
; GCN2-NEXT: .LBB38_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB38_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execnz .LBB38_2
; GCN2-NEXT: .LBB38_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB38_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB38_4
; GFX12-NEXT: .LBB38_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB38_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execnz .LBB38_2
; GFX12-NEXT: .LBB38_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst
ret void
}
|
|
|
|
; Flat-pointer 64-bit atomic umax on a GEP-indexed address with the old value
; stored to %out2 — combines the addr64 pointer arithmetic (s_lshl_b64 by 3 +
; 64-bit add) with the returning-atomic diamond: after the runtime
; scratch-aperture compare, the global path uses the returning flat atomic
; (glc / TH_ATOMIC_RETURN) and the private path reloads the old value from
; scratch before the non-atomic v_cmp_lt_u64 + v_cndmask umax emulation; both
; paths merge at %atomicrmw.end and flat-store v[0:1] to %out2.
define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i64_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB39_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execz .LBB39_3
; GCN1-NEXT: s_branch .LBB39_4
; GCN1-NEXT: .LBB39_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB39_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s8
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s9
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB39_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB39_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execz .LBB39_3
; GCN2-NEXT: s_branch .LBB39_4
; GCN2-NEXT: .LBB39_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB39_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: v_mov_b32_e32 v4, s9
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB39_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_cmp_eq_u32 s1, s9
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB39_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execz .LBB39_3
; GFX12-NEXT: s_branch .LBB39_4
; GFX12-NEXT: .LBB39_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB39_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB39_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst
store i64 %tmp0, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: atomic_min_i64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s0, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN1-NEXT: s_cbranch_vccnz .LBB40_3
|
|
; GCN1-NEXT: ; %bb.1: ; %Flow
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB40_4
|
|
; GCN1-NEXT: .LBB40_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT: s_endpgm
|
|
; GCN1-NEXT: .LBB40_3: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB40_2
|
|
; GCN1-NEXT: .LBB40_4: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_min_i64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s0, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN2-NEXT: s_cbranch_vccnz .LBB40_3
|
|
; GCN2-NEXT: ; %bb.1: ; %Flow
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB40_4
|
|
; GCN2-NEXT: .LBB40_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT: s_endpgm
|
|
; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB40_2
|
|
; GCN2-NEXT: .LBB40_4: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_min_i64_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_mov_b32 s4, -1
|
|
; GFX12-NEXT: s_cbranch_vccnz .LBB40_3
|
|
; GFX12-NEXT: ; %bb.1: ; %Flow
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB40_4
|
|
; GFX12-NEXT: .LBB40_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT: s_endpgm
|
|
; GFX12-NEXT: .LBB40_3: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE
|
|
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT: s_cbranch_execnz .LBB40_2
|
|
; GFX12-NEXT: .LBB40_4: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1]
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
|
|
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Returning i64 atomicrmw min on an offset flat pointer. Because the flat
; address may resolve to scratch (where 64-bit atomics do not work), codegen
; emits a runtime src_private_base check and selects between the real
; flat_atomic_smin_x2 and a non-atomic scratch load/compare/store sequence;
; the result is merged in %atomicrmw.end and stored to %out2.
define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_min_i64_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s2, s4, 32
; GCN1-NEXT: s_addc_u32 s3, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s3, s8
; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN1-NEXT: s_cbranch_vccz .LBB41_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execz .LBB41_3
; GCN1-NEXT: s_branch .LBB41_4
; GCN1-NEXT: .LBB41_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB41_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s0
; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GCN1-NEXT: s_cselect_b32 s2, s2, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s1
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB41_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s2, s4, 32
; GCN2-NEXT: s_addc_u32 s3, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s3, s8
; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN2-NEXT: s_cbranch_vccz .LBB41_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execz .LBB41_3
; GCN2-NEXT: s_branch .LBB41_4
; GCN2-NEXT: .LBB41_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB41_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
; GCN2-NEXT: s_cselect_b32 s2, s2, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s0
; GCN2-NEXT: v_mov_b32_e32 v4, s1
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB41_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s3, s9
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB41_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execz .LBB41_3
; GFX12-NEXT: s_branch .LBB41_4
; GFX12-NEXT: .LBB41_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB41_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12-NEXT: s_cselect_b32 s2, s2, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[0:1], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB41_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Non-returning i64 atomicrmw min on an indexed+offset flat pointer. Codegen
; compares the high address bits against src_private_base at runtime and
; branches: %atomicrmw.global uses flat_atomic_smin_x2, while
; %atomicrmw.private does a non-atomic scratch load/compare/store.
define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i64_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB42_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB42_4
; GCN1-NEXT: .LBB42_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execnz .LBB42_2
; GCN1-NEXT: .LBB42_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB42_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB42_4
; GCN2-NEXT: .LBB42_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execnz .LBB42_2
; GCN2-NEXT: .LBB42_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB42_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB42_4
; GFX12-NEXT: .LBB42_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB42_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execnz .LBB42_2
; GFX12-NEXT: .LBB42_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
  ret void
}
|
|
|
|
; Returning i64 atomicrmw min on an indexed+offset flat pointer. The runtime
; src_private_base check predicates the real atomic (flat_atomic_smin_x2 /
; flat_atomic_min_i64) against the non-atomic scratch path; the returned old
; value is stored to %out2 after the %atomicrmw.end merge.
define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB43_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execz .LBB43_3
; GCN1-NEXT: s_branch .LBB43_4
; GCN1-NEXT: .LBB43_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB43_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s8
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s9
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB43_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB43_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execz .LBB43_3
; GCN2-NEXT: s_branch .LBB43_4
; GCN2-NEXT: .LBB43_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB43_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: v_mov_b32_e32 v4, s9
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB43_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB43_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execz .LBB43_3
; GFX12-NEXT: s_branch .LBB43_4
; GFX12-NEXT: .LBB43_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB43_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB43_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Non-returning i64 atomicrmw min directly on the flat pointer (no offset).
; Same predication structure: runtime src_private_base compare, then either
; the hardware 64-bit atomic in %atomicrmw.global or the non-atomic scratch
; load/compare/store sequence in %atomicrmw.private.
define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_min_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s0
; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_mov_b64 s[0:1], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB44_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_cbranch_vccz .LBB44_4
; GCN1-NEXT: .LBB44_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execnz .LBB44_2
; GCN1-NEXT: .LBB44_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GCN1-NEXT: s_cselect_b32 s0, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s0
; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB44_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_cbranch_vccz .LBB44_4
; GCN2-NEXT: .LBB44_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execnz .LBB44_2
; GCN2-NEXT: .LBB44_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s0, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB44_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB44_4
; GFX12-NEXT: .LBB44_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB44_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execnz .LBB44_2
; GFX12-NEXT: .LBB44_4: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
|
|
; GCN1-LABEL: atomic_min_i64_ret:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cmp_eq_u32 s5, s8
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB45_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cbranch_execz .LBB45_3
|
|
; GCN1-NEXT: s_branch .LBB45_4
|
|
; GCN1-NEXT: .LBB45_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB45_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s0
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s2, s4, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB45_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_min_i64_ret:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cmp_eq_u32 s5, s8
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB45_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cbranch_execz .LBB45_3
|
|
; GCN2-NEXT: s_branch .LBB45_4
|
|
; GCN2-NEXT: .LBB45_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB45_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GCN2-NEXT: s_cselect_b32 s2, s4, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB45_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_min_i64_ret:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
|
|
; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s5, s3
|
|
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB45_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
|
|
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
|
|
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT: s_cbranch_execz .LBB45_3
|
|
; GFX12-NEXT: s_branch .LBB45_4
|
|
; GFX12-NEXT: .LBB45_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB45_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX12-NEXT: s_cselect_b32 s2, s4, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[0:1], v[0:1]
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
|
|
; GFX12-NEXT: .LBB45_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) {
; Flat (generic) 64-bit signed-min atomicrmw on an indexed pointer; result unused.
; Because a flat address may resolve to scratch, where 64-bit atomics do not
; work, codegen expands this with a runtime address-space check: the high half
; of the pointer is compared against the private aperture base (GCN1/GCN2 read
; it from an offset in the implicit kernarg segment; GFX12 uses
; src_private_base), then branches to %atomicrmw.global (real
; flat_atomic_smin_x2 / flat_atomic_min_i64) or to %atomicrmw.private
; (non-atomic buffer/scratch load, v_cmp_ge_i64 + cndmask select, store).
; The null private pointer is remapped to -1 via s_cselect before the scratch
; access. No-return variant, so no value is threaded out of the branches.
; GCN1-LABEL: atomic_min_i64_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB46_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB46_4
; GCN1-NEXT: .LBB46_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB46_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execnz .LBB46_2
; GCN1-NEXT: .LBB46_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB46_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB46_4
; GCN2-NEXT: .LBB46_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB46_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execnz .LBB46_2
; GCN2-NEXT: .LBB46_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB46_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB46_4
; GFX12-NEXT: .LBB46_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB46_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execnz .LBB46_2
; GFX12-NEXT: .LBB46_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  ; Scale the index by the i64 element size into the flat pointer.
  %ptr = getelementptr i64, ptr %out, i64 %index
  ; No noalias.addrspace metadata, so the scratch-vs-global predication above
  ; must be emitted; workgroup scope keeps the cache ops SE-scoped on GFX12.
  %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; Same flat 64-bit signed-min expansion as atomic_min_i64_addr64, but the
; atomic's old value is used (stored to %out2), so the two predicated paths
; must merge a result: %atomicrmw.global uses the returning form
; (glc / th:TH_ATOMIC_RETURN) into v[0:1], %atomicrmw.private loads the old
; value non-atomically and stores min-selected data back, and the implicit-def
; at .LBB47_2 feeds the phi before the %atomicrmw.end flat_store of v[0:1].
; GCN1-LABEL: atomic_min_i64_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB47_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execz .LBB47_3
; GCN1-NEXT: s_branch .LBB47_4
; GCN1-NEXT: .LBB47_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB47_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s8
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s9
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB47_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB47_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execz .LBB47_3
; GCN2-NEXT: s_branch .LBB47_4
; GCN2-NEXT: .LBB47_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB47_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: v_mov_b32_e32 v4, s9
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB47_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_cmp_eq_u32 s1, s9
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB47_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execz .LBB47_3
; GFX12-NEXT: s_branch .LBB47_4
; GFX12-NEXT: .LBB47_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB47_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB47_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  ; Index into %out by i64 elements; the flat result pointer feeds the atomic.
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst
  ; Using the old value forces the returning (glc/TH_ATOMIC_RETURN) form above.
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
; Flat 64-bit unsigned-min atomicrmw at a constant +32-byte offset; result
; unused. As with the signed variants, codegen predicates on whether the flat
; address is scratch: the pointer's high dword is compared with the private
; aperture base and the code branches to %atomicrmw.global
; (flat_atomic_umin_x2 / flat_atomic_min_u64, non-returning) or to
; %atomicrmw.private, which performs the umin with a plain load,
; v_cmp_ge_u64 + cndmask, and store. Note the unsigned compare (v_cmp_ge_u64)
; distinguishes this from the smin tests.
; GCN1-LABEL: atomic_umin_i64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 32
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB48_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB48_4
; GCN1-NEXT: .LBB48_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB48_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execnz .LBB48_2
; GCN1-NEXT: .LBB48_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 32
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB48_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB48_4
; GCN2-NEXT: .LBB48_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB48_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execnz .LBB48_2
; GCN2-NEXT: .LBB48_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB48_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB48_4
; GFX12-NEXT: .LBB48_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB48_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execnz .LBB48_2
; GFX12-NEXT: .LBB48_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  ; Constant element offset: 4 x i64 = +32 bytes, folded into the s_add above.
  %gep = getelementptr i64, ptr %out, i64 4
  ; Result unused, so the non-returning atomic form is selected on the
  ; global path.
  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; Returning variant of atomic_umin_i64_offset: the old value of the flat
; 64-bit unsigned-min at %out+32 is stored to %out2. The scratch-address
; predication therefore merges a result in v[0:1]: the global path uses the
; returning atomic (glc / th:TH_ATOMIC_RETURN), the private path loads the
; old value non-atomically before writing back the unsigned min, and
; .LBB49_2's implicit-def covers the phi input for the %atomicrmw.end
; flat_store.
; GCN1-LABEL: atomic_umin_i64_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s2, s4, 32
; GCN1-NEXT: s_addc_u32 s3, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s3, s8
; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN1-NEXT: s_cbranch_vccz .LBB49_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cbranch_execz .LBB49_3
; GCN1-NEXT: s_branch .LBB49_4
; GCN1-NEXT: .LBB49_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB49_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s0
; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GCN1-NEXT: s_cselect_b32 s2, s2, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s1
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB49_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s2, s4, 32
; GCN2-NEXT: s_addc_u32 s3, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s3, s8
; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN2-NEXT: s_cbranch_vccz .LBB49_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cbranch_execz .LBB49_3
; GCN2-NEXT: s_branch .LBB49_4
; GCN2-NEXT: .LBB49_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB49_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
; GCN2-NEXT: s_cselect_b32 s2, s2, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s0
; GCN2-NEXT: v_mov_b32_e32 v4, s1
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB49_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s3, s9
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB49_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_cbranch_execz .LBB49_3
; GFX12-NEXT: s_branch .LBB49_4
; GFX12-NEXT: .LBB49_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB49_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12-NEXT: s_cselect_b32 s2, s2, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[0:1], v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB49_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  ; Constant element offset: 4 x i64 = +32 bytes.
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
  ; Storing the old value makes both predicated paths produce a result.
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_umin_i64_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN1-NEXT: s_cbranch_vccnz .LBB50_3
|
|
; GCN1-NEXT: ; %bb.1: ; %Flow
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB50_4
|
|
; GCN1-NEXT: .LBB50_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT: s_endpgm
|
|
; GCN1-NEXT: .LBB50_3: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB50_2
|
|
; GCN1-NEXT: .LBB50_4: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i64_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN2-NEXT: s_cbranch_vccnz .LBB50_3
|
|
; GCN2-NEXT: ; %bb.1: ; %Flow
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB50_4
|
|
; GCN2-NEXT: .LBB50_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT: s_endpgm
|
|
; GCN2-NEXT: .LBB50_3: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB50_2
|
|
; GCN2-NEXT: .LBB50_4: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_umin_i64_addr64_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
|
|
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_mov_b32 s4, -1
|
|
; GFX12-NEXT: s_cbranch_vccnz .LBB50_3
|
|
; GFX12-NEXT: ; %bb.1: ; %Flow
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB50_4
|
|
; GFX12-NEXT: .LBB50_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT: s_endpgm
|
|
; GFX12-NEXT: .LBB50_3: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
|
|
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT: s_cbranch_execnz .LBB50_2
|
|
; GFX12-NEXT: .LBB50_4: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1]
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
|
|
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_umin_i64_ret_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB51_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cbranch_execz .LBB51_3
|
|
; GCN1-NEXT: s_branch .LBB51_4
|
|
; GCN1-NEXT: .LBB51_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB51_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB51_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i64_ret_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB51_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cbranch_execz .LBB51_3
|
|
; GCN2-NEXT: s_branch .LBB51_4
|
|
; GCN2-NEXT: .LBB51_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB51_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB51_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
|
|
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
|
|
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB51_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
|
|
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT: s_cbranch_execz .LBB51_3
|
|
; GFX12-NEXT: s_branch .LBB51_4
|
|
; GFX12-NEXT: .LBB51_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB51_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[0:1]
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
|
|
; GFX12-NEXT: .LBB51_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
; Flat 64-bit atomicrmw umin with no returned value. Since the result is unused,
; the expansion uses a Flow block: the pointer's high dword is compared against
; the private aperture (src_private_base on GFX12); %atomicrmw.global issues the
; hardware flat_atomic_umin_x2, %atomicrmw.private emulates the umin
; non-atomically on scratch via load + v_cmp_ge_u64/v_cndmask + store.
define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: atomic_umin_i64:
|
|
; GCN1:       ; %bb.0: ; %entry
|
|
; GCN1-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT:    s_load_dword s0, s[2:3], 0x3d
|
|
; GCN1-NEXT:    s_mov_b32 s14, -1
|
|
; GCN1-NEXT:    s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT:    s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT:    s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
|
|
; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
|
|
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
|
|
; GCN1-NEXT:    s_mov_b64 s[0:1], -1
|
|
; GCN1-NEXT:    s_cbranch_vccnz .LBB52_3
|
|
; GCN1-NEXT:  ; %bb.1: ; %Flow
|
|
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
|
|
; GCN1-NEXT:    s_cbranch_vccz .LBB52_4
|
|
; GCN1-NEXT:  .LBB52_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT:    s_endpgm
|
|
; GCN1-NEXT:  .LBB52_3: ; %atomicrmw.global
|
|
; GCN1-NEXT:    v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT:    v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT:    v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT:    v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT:    s_cbranch_execnz .LBB52_2
|
|
; GCN1-NEXT:  .LBB52_4: ; %atomicrmw.private
|
|
; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
|
|
; GCN1-NEXT:    v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
|
|
; GCN1-NEXT:    s_cselect_b32 s0, s4, -1
|
|
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT:    s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT:    v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT:    buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT:    buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT:    v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT:    s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT:    v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN1-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT:    s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i64:
|
|
; GCN2:       ; %bb.0: ; %entry
|
|
; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT:    s_load_dword s0, s[2:3], 0xf4
|
|
; GCN2-NEXT:    s_mov_b32 s90, -1
|
|
; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT:    s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT:    s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
|
|
; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
|
|
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
|
|
; GCN2-NEXT:    s_mov_b64 s[0:1], -1
|
|
; GCN2-NEXT:    s_cbranch_vccnz .LBB52_3
|
|
; GCN2-NEXT:  ; %bb.1: ; %Flow
|
|
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
|
|
; GCN2-NEXT:    s_cbranch_vccz .LBB52_4
|
|
; GCN2-NEXT:  .LBB52_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT:    s_endpgm
|
|
; GCN2-NEXT:  .LBB52_3: ; %atomicrmw.global
|
|
; GCN2-NEXT:    v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT:    v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT:    v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT:    v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT:    s_cbranch_execnz .LBB52_2
|
|
; GCN2-NEXT:  .LBB52_4: ; %atomicrmw.private
|
|
; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
|
|
; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
|
|
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT:    s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT:    v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT:    buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT:    buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT:    v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT:    v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT:    s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT:    v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN2-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT:    s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_umin_i64:
|
|
; GFX12:       ; %bb.0: ; %entry
|
|
; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
|
|
; GFX12-NEXT:    s_wait_kmcnt 0x0
|
|
; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
|
|
; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT:    s_mov_b32 s4, -1
|
|
; GFX12-NEXT:    s_cbranch_vccnz .LBB52_3
|
|
; GFX12-NEXT:  ; %bb.1: ; %Flow
|
|
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT:    s_cbranch_vccz .LBB52_4
|
|
; GFX12-NEXT:  .LBB52_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT:    s_endpgm
|
|
; GFX12-NEXT:  .LBB52_3: ; %atomicrmw.global
|
|
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
|
|
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT:    global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT:    s_cbranch_execnz .LBB52_2
|
|
; GFX12-NEXT:  .LBB52_4: ; %atomicrmw.private
|
|
; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT:    s_wait_loadcnt 0x0
|
|
; GFX12-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1]
|
|
; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
|
|
; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
|
|
; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
|
|
; GFX12-NEXT:    s_endpgm
|
|
entry:
|
|
  %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst
|
|
  ret void
|
|
}
|
|
|
|
; Flat 64-bit atomicrmw umin with returned value on a plain pointer (no index,
; no offset). The expansion branches on the runtime private-aperture check:
; %atomicrmw.global executes flat_atomic_umin_x2 ... glc, %atomicrmw.private
; does a non-atomic scratch load/umin-select/store; the value from either path
; is written to %out2 at %atomicrmw.end.
define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
|
|
; GCN1-LABEL: atomic_umin_i64_ret:
|
|
; GCN1:       ; %bb.0: ; %entry
|
|
; GCN1-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT:    s_mov_b32 s14, -1
|
|
; GCN1-NEXT:    s_load_dword s8, s[2:3], 0x3f
|
|
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT:    s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT:    s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT:    s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
|
|
; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT:    s_cbranch_vccz .LBB53_2
|
|
; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT:    v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT:    v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT:    flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT:    s_cbranch_execz .LBB53_3
|
|
; GCN1-NEXT:    s_branch .LBB53_4
|
|
; GCN1-NEXT:  .LBB53_2:
|
|
; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT:  .LBB53_3: ; %atomicrmw.private
|
|
; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
|
|
; GCN1-NEXT:    v_mov_b32_e32 v5, s0
|
|
; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT:    s_cselect_b32 s2, s4, -1
|
|
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT:    s_add_i32 s2, s2, 4
|
|
; GCN1-NEXT:    v_mov_b32_e32 v3, s2
|
|
; GCN1-NEXT:    buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT:    buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT:    v_mov_b32_e32 v4, s1
|
|
; GCN1-NEXT:    s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT:    v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1]
|
|
; GCN1-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT:  .LBB53_4: ; %atomicrmw.end
|
|
; GCN1-NEXT:    v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT:    v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT:    s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT:    s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i64_ret:
|
|
; GCN2:       ; %bb.0: ; %entry
|
|
; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT:    s_mov_b32 s90, -1
|
|
; GCN2-NEXT:    s_load_dword s8, s[2:3], 0xfc
|
|
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT:    s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT:    s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
|
|
; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT:    s_cbranch_vccz .LBB53_2
|
|
; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT:    v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT:    v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT:    flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT:    s_cbranch_execz .LBB53_3
|
|
; GCN2-NEXT:    s_branch .LBB53_4
|
|
; GCN2-NEXT:  .LBB53_2:
|
|
; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT:  .LBB53_3: ; %atomicrmw.private
|
|
; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
|
|
; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
|
|
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT:    s_add_i32 s2, s2, 4
|
|
; GCN2-NEXT:    v_mov_b32_e32 v3, s2
|
|
; GCN2-NEXT:    buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT:    buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT:    v_mov_b32_e32 v5, s0
|
|
; GCN2-NEXT:    v_mov_b32_e32 v4, s1
|
|
; GCN2-NEXT:    s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT:    v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1]
|
|
; GCN2-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT:  .LBB53_4: ; %atomicrmw.end
|
|
; GCN2-NEXT:    v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT:    v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT:    s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT:    s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_umin_i64_ret:
|
|
; GFX12:       ; %bb.0: ; %entry
|
|
; GFX12-NEXT:    s_clause 0x1
|
|
; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
|
|
; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
|
|
; GFX12-NEXT:    s_mov_b64 s[2:3], src_private_base
|
|
; GFX12-NEXT:    s_wait_kmcnt 0x0
|
|
; GFX12-NEXT:    s_cmp_eq_u32 s5, s3
|
|
; GFX12-NEXT:    s_cselect_b32 s2, -1, 0
|
|
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
|
|
; GFX12-NEXT:    s_cbranch_vccz .LBB53_2
|
|
; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
|
|
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
|
|
; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
|
|
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT:    global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT:    s_cbranch_execz .LBB53_3
|
|
; GFX12-NEXT:    s_branch .LBB53_4
|
|
; GFX12-NEXT:  .LBB53_2:
|
|
; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT:  .LBB53_3: ; %atomicrmw.private
|
|
; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
|
|
; GFX12-NEXT:    s_cselect_b32 s2, s4, -1
|
|
; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
|
|
; GFX12-NEXT:    s_wait_loadcnt 0x0
|
|
; GFX12-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[0:1], v[0:1]
|
|
; GFX12-NEXT:    v_cndmask_b32_e32 v3, s1, v1, vcc_lo
|
|
; GFX12-NEXT:    v_cndmask_b32_e32 v2, s0, v0, vcc_lo
|
|
; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
|
|
; GFX12-NEXT:  .LBB53_4: ; %atomicrmw.end
|
|
; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
|
|
; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT:    s_endpgm
|
|
entry:
|
|
  %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst
|
|
  store i64 %tmp0, ptr %out2
|
|
  ret void
|
|
}
|
|
|
|
; Flat 64-bit atomicrmw umin, no returned value, on a pointer indexed by
; %index (s_lshl_b64 by 3 + 64-bit add). As in the other no-return variants the
; expansion goes through a Flow block: the private-aperture compare selects
; between the hardware flat_atomic_umin_x2 (%atomicrmw.global) and the
; non-atomic scratch load/select/store sequence (%atomicrmw.private).
define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_umin_i64_addr64:
|
|
; GCN1:       ; %bb.0: ; %entry
|
|
; GCN1-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT:    s_mov_b32 s14, -1
|
|
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x3f
|
|
; GCN1-NEXT:    s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT:    s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT:    s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN1-NEXT:    s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT:    s_mov_b64 s[2:3], -1
|
|
; GCN1-NEXT:    s_cbranch_vccnz .LBB54_3
|
|
; GCN1-NEXT:  ; %bb.1: ; %Flow
|
|
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT:    s_cbranch_vccz .LBB54_4
|
|
; GCN1-NEXT:  .LBB54_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT:    s_endpgm
|
|
; GCN1-NEXT:  .LBB54_3: ; %atomicrmw.global
|
|
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT:    v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT:    v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT:    s_cbranch_execnz .LBB54_2
|
|
; GCN1-NEXT:  .LBB54_4: ; %atomicrmw.private
|
|
; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT:    v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT:    s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT:    v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT:    buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT:    buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT:    v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT:    s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT:    v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN1-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT:    s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i64_addr64:
|
|
; GCN2:       ; %bb.0: ; %entry
|
|
; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT:    s_mov_b32 s90, -1
|
|
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
|
|
; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT:    s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT:    s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN2-NEXT:    s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT:    s_mov_b64 s[2:3], -1
|
|
; GCN2-NEXT:    s_cbranch_vccnz .LBB54_3
|
|
; GCN2-NEXT:  ; %bb.1: ; %Flow
|
|
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT:    s_cbranch_vccz .LBB54_4
|
|
; GCN2-NEXT:  .LBB54_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT:    s_endpgm
|
|
; GCN2-NEXT:  .LBB54_3: ; %atomicrmw.global
|
|
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT:    v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT:    v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT:    s_cbranch_execnz .LBB54_2
|
|
; GCN2-NEXT:  .LBB54_4: ; %atomicrmw.private
|
|
; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT:    s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT:    v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT:    buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT:    buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT:    v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT:    v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT:    s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT:    v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN2-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT:    s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_umin_i64_addr64:
|
|
; GFX12:       ; %bb.0: ; %entry
|
|
; GFX12-NEXT:    s_clause 0x1
|
|
; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
|
|
; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
|
|
; GFX12-NEXT:    s_wait_kmcnt 0x0
|
|
; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
|
|
; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
|
|
; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT:    s_mov_b32 s4, -1
|
|
; GFX12-NEXT:    s_cbranch_vccnz .LBB54_3
|
|
; GFX12-NEXT:  ; %bb.1: ; %Flow
|
|
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT:    s_cbranch_vccz .LBB54_4
|
|
; GFX12-NEXT:  .LBB54_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT:    s_endpgm
|
|
; GFX12-NEXT:  .LBB54_3: ; %atomicrmw.global
|
|
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
|
|
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT:    global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT:    s_cbranch_execnz .LBB54_2
|
|
; GFX12-NEXT:  .LBB54_4: ; %atomicrmw.private
|
|
; GFX12-NEXT:    s_wait_alu 0xfffe
|
|
; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT:    s_wait_loadcnt 0x0
|
|
; GFX12-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1]
|
|
; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
|
|
; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
|
|
; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
|
|
; GFX12-NEXT:    s_endpgm
|
|
entry:
|
|
  %ptr = getelementptr i64, ptr %out, i64 %index
|
|
  %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst
|
|
  ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_umin_i64_ret_addr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB55_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cbranch_execz .LBB55_3
|
|
; GCN1-NEXT: s_branch .LBB55_4
|
|
; GCN1-NEXT: .LBB55_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB55_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB55_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i64_ret_addr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB55_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cbranch_execz .LBB55_3
|
|
; GCN2-NEXT: s_branch .LBB55_4
|
|
; GCN2-NEXT: .LBB55_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB55_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
|
|
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB55_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_umin_i64_ret_addr64:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s9
|
|
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB55_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
|
|
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_SE
|
|
; GFX12-NEXT: s_cbranch_execz .LBB55_3
|
|
; GFX12-NEXT: s_branch .LBB55_4
|
|
; GFX12-NEXT: .LBB55_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB55_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[0:1]
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
|
|
; GFX12-NEXT: .LBB55_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
; Checks expansion of a non-returning 64-bit flat atomicrmw 'or' with a
; constant offset.  Because the flat pointer may resolve to scratch at
; runtime (where 64-bit atomics do not work), the lowering compares the
; address high bits against src_private_base and branches to either the
; real flat atomic (%atomicrmw.global) or a non-atomic scratch
; load/or/store sequence (%atomicrmw.private).
define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_or_i64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 32
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB56_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB56_4
; GCN1-NEXT: .LBB56_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB56_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB56_2
; GCN1-NEXT: .LBB56_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_or_b32_e32 v2, s6, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_or_b32_e32 v3, s7, v3
; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 32
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB56_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB56_4
; GCN2-NEXT: .LBB56_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB56_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB56_2
; GCN2-NEXT: .LBB56_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_or_b32_e32 v2, s6, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_or_b32_e32 v3, s7, v3
; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB56_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB56_4
; GFX12-NEXT: .LBB56_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB56_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB56_2
; GFX12-NEXT: .LBB56_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_or_b32_e32 v1, s3, v1
; GFX12-NEXT: v_or_b32_e32 v0, s2, v0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  ; Pointer offset by 4 i64 elements (32 bytes); no noalias.addrspace
  ; metadata, so the scratch-vs-global predication above must be emitted.
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Same as atomic_or_i64_offset but the atomic's result is used: both the
; flat-atomic path (%atomicrmw.global, using the returning 'glc'/RETURN
; form) and the non-atomic scratch path (%atomicrmw.private) must produce
; the old value in v[0:1], which %atomicrmw.end stores to %out2.
define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_or_i64_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s2, s4, 32
; GCN1-NEXT: s_addc_u32 s3, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s3, s8
; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN1-NEXT: s_cbranch_vccz .LBB57_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB57_3
; GCN1-NEXT: s_branch .LBB57_4
; GCN1-NEXT: .LBB57_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB57_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GCN1-NEXT: s_cselect_b32 s2, s2, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_or_b32_e32 v4, s0, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_or_b32_e32 v5, s1, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB57_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s2, s4, 32
; GCN2-NEXT: s_addc_u32 s3, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s3, s8
; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN2-NEXT: s_cbranch_vccz .LBB57_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB57_3
; GCN2-NEXT: s_branch .LBB57_4
; GCN2-NEXT: .LBB57_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB57_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
; GCN2-NEXT: s_cselect_b32 s2, s2, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_or_b32_e32 v4, s0, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_or_b32_e32 v5, s1, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB57_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s3, s9
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB57_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB57_3
; GFX12-NEXT: s_branch .LBB57_4
; GFX12-NEXT: .LBB57_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB57_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12-NEXT: s_cselect_b32 s2, s2, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_or_b32_e32 v3, s1, v1
; GFX12-NEXT: v_or_b32_e32 v2, s0, v0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB57_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
  ; Returning form: %tmp0 (the pre-op value) is live into the store below.
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Non-returning 64-bit flat atomicrmw 'or' with a dynamic 64-bit index
; plus a constant offset.  The computed flat address is checked against
; src_private_base at runtime and the code selects between the true flat
; atomic (%atomicrmw.global) and the non-atomic scratch read-modify-write
; (%atomicrmw.private).
define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i64_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB58_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB58_4
; GCN1-NEXT: .LBB58_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB58_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB58_2
; GCN1-NEXT: .LBB58_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_or_b32_e32 v2, s6, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_or_b32_e32 v3, s7, v3
; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB58_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB58_4
; GCN2-NEXT: .LBB58_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB58_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB58_2
; GCN2-NEXT: .LBB58_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_or_b32_e32 v2, s6, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_or_b32_e32 v3, s7, v3
; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB58_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB58_4
; GFX12-NEXT: .LBB58_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB58_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB58_2
; GFX12-NEXT: .LBB58_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_or_b32_e32 v1, s3, v1
; GFX12-NEXT: v_or_b32_e32 v0, s2, v0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  ; Dynamic element index followed by a constant 32-byte offset.
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Returning variant of atomic_or_i64_addr64_offset: dynamic index plus
; constant offset, with the old value used.  Both the flat-atomic path
; (%atomicrmw.global, returning form) and the scratch path
; (%atomicrmw.private) leave the result in v[0:1] for the final store in
; %atomicrmw.end.
define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i64_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB59_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB59_3
; GCN1-NEXT: s_branch .LBB59_4
; GCN1-NEXT: .LBB59_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB59_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_or_b32_e32 v4, s8, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_or_b32_e32 v5, s9, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB59_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB59_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB59_3
; GCN2-NEXT: s_branch .LBB59_4
; GCN2-NEXT: .LBB59_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB59_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_or_b32_e32 v4, s8, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_or_b32_e32 v5, s9, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB59_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB59_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB59_3
; GFX12-NEXT: s_branch .LBB59_4
; GFX12-NEXT: .LBB59_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB59_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_or_b32_e32 v3, s5, v1
; GFX12-NEXT: v_or_b32_e32 v2, s4, v0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB59_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  ; Returning form: %tmp0 (the pre-op value) is stored to %out2 below.
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: atomic_or_i64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cmp_eq_u32 s5, s0
|
|
; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[0:1], -1
|
|
; GCN1-NEXT: s_cbranch_vccnz .LBB60_3
|
|
; GCN1-NEXT: ; %bb.1: ; %Flow
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB60_4
|
|
; GCN1-NEXT: .LBB60_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT: s_endpgm
|
|
; GCN1-NEXT: .LBB60_3: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB60_2
|
|
; GCN1-NEXT: .LBB60_4: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
|
|
; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s4, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN1-NEXT: v_or_b32_e32 v2, s6, v2
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_or_b32_e32 v3, s7, v3
|
|
; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_or_i64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cmp_eq_u32 s5, s0
|
|
; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[0:1], -1
|
|
; GCN2-NEXT: s_cbranch_vccnz .LBB60_3
|
|
; GCN2-NEXT: ; %bb.1: ; %Flow
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB60_4
|
|
; GCN2-NEXT: .LBB60_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT: s_endpgm
|
|
; GCN2-NEXT: .LBB60_3: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB60_2
|
|
; GCN2-NEXT: .LBB60_4: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s4, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN2-NEXT: v_or_b32_e32 v2, s6, v2
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_or_b32_e32 v3, s7, v3
|
|
; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_or_i64:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_mov_b32 s4, -1
|
|
; GFX12-NEXT: s_cbranch_vccnz .LBB60_3
|
|
; GFX12-NEXT: ; %bb.1: ; %Flow
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB60_4
|
|
; GFX12-NEXT: .LBB60_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT: s_endpgm
|
|
; GFX12-NEXT: .LBB60_3: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execnz .LBB60_2
|
|
; GFX12-NEXT: .LBB60_4: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_or_b32_e32 v1, s3, v1
|
|
; GFX12-NEXT: v_or_b32_e32 v0, s2, v0
|
|
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_or_i64_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s8
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB61_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB61_3
; GCN1-NEXT: s_branch .LBB61_4
; GCN1-NEXT: .LBB61_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB61_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s2, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_or_b32_e32 v4, s0, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_or_b32_e32 v5, s1, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB61_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s8
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB61_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB61_3
; GCN2-NEXT: s_branch .LBB61_4
; GCN2-NEXT: .LBB61_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB61_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s2, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_or_b32_e32 v4, s0, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_or_b32_e32 v5, s1, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB61_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s5, s3
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX12-NEXT: s_cbranch_vccz .LBB61_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB61_3
; GFX12-NEXT: s_branch .LBB61_4
; GFX12-NEXT: .LBB61_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB61_3: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX12-NEXT: s_cselect_b32 s2, s4, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_or_b32_e32 v3, s1, v1
; GFX12-NEXT: v_or_b32_e32 v2, s0, v0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB61_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Returning variant: the flat pointer may resolve to scratch, where 64-bit
; atomics do not work, so the checks above expect a runtime compare of the
; address high bits against src_private_base (per-apertures register on
; GCN1/GCN2) that branches between a real flat atomic (%atomicrmw.global)
; and a non-atomic scratch load/or/store (%atomicrmw.private); both paths
; produce the old value, which is then stored to %out2.
  %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i64_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB62_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB62_4
; GCN1-NEXT: .LBB62_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB62_2
; GCN1-NEXT: .LBB62_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_or_b32_e32 v2, s6, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_or_b32_e32 v3, s7, v3
; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB62_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB62_4
; GCN2-NEXT: .LBB62_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB62_2
; GCN2-NEXT: .LBB62_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_or_b32_e32 v2, s6, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_or_b32_e32 v3, s7, v3
; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB62_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB62_4
; GFX12-NEXT: .LBB62_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB62_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB62_2
; GFX12-NEXT: .LBB62_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_or_b32_e32 v1, s3, v1
; GFX12-NEXT: v_or_b32_e32 v0, s2, v0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
; Addressed variant: the atomic address is computed with a 64-bit GEP
; (%index scaled by 8 via s_lshl_b64/s_add), then the same runtime
; src_private_base check selects between the real flat atomic
; (%atomicrmw.global) and the non-atomic scratch fallback
; (%atomicrmw.private). No return value is used, so the global path
; emits the non-glc flat_atomic_or_x2.
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i64_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB63_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB63_3
; GCN1-NEXT: s_branch .LBB63_4
; GCN1-NEXT: .LBB63_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB63_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_or_b32_e32 v4, s8, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_or_b32_e32 v5, s9, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB63_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB63_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB63_3
; GCN2-NEXT: s_branch .LBB63_4
; GCN2-NEXT: .LBB63_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB63_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_or_b32_e32 v4, s8, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_or_b32_e32 v5, s9, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB63_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_cmp_eq_u32 s1, s9
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB63_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB63_3
; GFX12-NEXT: s_branch .LBB63_4
; GFX12-NEXT: .LBB63_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB63_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_or_b32_e32 v3, s5, v1
; GFX12-NEXT: v_or_b32_e32 v2, s4, v0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB63_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Combined variant: 64-bit GEP addressing plus use of the returned old
; value. The checks expect the predicated expansion (runtime
; src_private_base compare -> %atomicrmw.global with a glc/RETURN atomic,
; or %atomicrmw.private with a non-atomic scratch read-modify-write),
; joining at %atomicrmw.end where the old value is stored to %out2.
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_xchg_i64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 32
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB64_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB64_4
; GCN1-NEXT: .LBB64_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB64_2
; GCN1-NEXT: .LBB64_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v0, s7
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 32
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB64_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB64_4
; GCN2-NEXT: .LBB64_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB64_2
; GCN2-NEXT: .LBB64_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v0, s7
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB64_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB64_4
; GFX12-NEXT: .LBB64_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB64_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB64_2
; GFX12-NEXT: .LBB64_4: ; %atomicrmw.private
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
; xchg with an ignored result degenerates to a plain store on the private
; path: the checks expect only buffer/scratch stores (no load) in
; %atomicrmw.private, while %atomicrmw.global still uses the real
; flat_atomic_swap after the runtime src_private_base compare on the
; offset (+32 bytes) address.
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
; GCN1-LABEL: atomic_xchg_f64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 32
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB65_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB65_4
; GCN1-NEXT: .LBB65_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB65_2
; GCN1-NEXT: .LBB65_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v0, s7
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_f64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 32
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB65_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB65_4
; GCN2-NEXT: .LBB65_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB65_2
; GCN2-NEXT: .LBB65_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v0, s7
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_f64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB65_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB65_4
; GFX12-NEXT: .LBB65_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB65_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB65_2
; GFX12-NEXT: .LBB65_4: ; %atomicrmw.private
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
; double payload variant of the xchg test; codegen is expected to be
; bit-identical to the i64 case (same flat_atomic_swap on the global
; path, same store-only scratch fallback), since xchg treats the 64-bit
; value as opaque bits.
  %gep = getelementptr double, ptr %out, i64 4
  %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
; Checks (autogenerated — regenerate with update_llc_test_checks.py, do not hand-edit)
; that a flat 64-bit ptr xchg is predicated on a runtime private-aperture compare:
; %atomicrmw.global issues the real flat atomic, %atomicrmw.private stores to scratch.
|
|
; GCN1-LABEL: atomic_xchg_pointer_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s0, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN1-NEXT: s_cbranch_vccnz .LBB66_3
|
|
; GCN1-NEXT: ; %bb.1: ; %Flow
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB66_4
|
|
; GCN1-NEXT: .LBB66_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT: s_endpgm
|
|
; GCN1-NEXT: .LBB66_3: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB66_2
|
|
; GCN1-NEXT: .LBB66_4: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_xchg_pointer_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s0, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN2-NEXT: s_cbranch_vccnz .LBB66_3
|
|
; GCN2-NEXT: ; %bb.1: ; %Flow
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB66_4
|
|
; GCN2-NEXT: .LBB66_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT: s_endpgm
|
|
; GCN2-NEXT: .LBB66_3: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB66_2
|
|
; GCN2-NEXT: .LBB66_4: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_xchg_pointer_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_mov_b32 s4, -1
|
|
; GFX12-NEXT: s_cbranch_vccnz .LBB66_3
|
|
; GFX12-NEXT: ; %bb.1: ; %Flow
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB66_4
|
|
; GFX12-NEXT: .LBB66_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT: s_endpgm
|
|
; GFX12-NEXT: .LBB66_3: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execnz .LBB66_2
|
|
; GFX12-NEXT: .LBB66_4: ; %atomicrmw.private
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr ptr, ptr %out, i32 4
|
|
%val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; Checks (autogenerated — regenerate with update_llc_test_checks.py, do not hand-edit)
; the returning-value variant: the runtime aperture check selects between the real
; flat atomic (glc / TH_ATOMIC_RETURN) and a scratch load+store that yields the old value.
|
|
; GCN1-LABEL: atomic_xchg_i64_ret_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s2, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s3, s5, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s3, s8
|
|
; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB67_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execz .LBB67_3
|
|
; GCN1-NEXT: s_branch .LBB67_4
|
|
; GCN1-NEXT: .LBB67_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB67_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
|
|
; GCN1-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s1
|
|
; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB67_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(2)
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_xchg_i64_ret_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s2, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s3, s5, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s3, s8
|
|
; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB67_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execz .LBB67_3
|
|
; GCN2-NEXT: s_branch .LBB67_4
|
|
; GCN2-NEXT: .LBB67_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB67_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
|
|
; GCN2-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s1
|
|
; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB67_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(2)
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_xchg_i64_ret_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
|
|
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_cmp_eq_u32 s3, s9
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB67_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
|
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
|
|
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execz .LBB67_3
|
|
; GFX12-NEXT: s_branch .LBB67_4
|
|
; GFX12-NEXT: .LBB67_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB67_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
|
|
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
|
|
; GFX12-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
|
|
; GFX12-NEXT: .LBB67_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; Checks (autogenerated — regenerate with update_llc_test_checks.py, do not hand-edit)
; the dynamically-indexed address: the aperture compare happens after the full
; 64-bit address computation, then branches to the flat atomic or scratch store.
|
|
; GCN1-LABEL: atomic_xchg_i64_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN1-NEXT: s_cbranch_vccnz .LBB68_3
|
|
; GCN1-NEXT: ; %bb.1: ; %Flow
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB68_4
|
|
; GCN1-NEXT: .LBB68_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT: s_endpgm
|
|
; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB68_2
|
|
; GCN1-NEXT: .LBB68_4: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_xchg_i64_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN2-NEXT: s_cbranch_vccnz .LBB68_3
|
|
; GCN2-NEXT: ; %bb.1: ; %Flow
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB68_4
|
|
; GCN2-NEXT: .LBB68_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT: s_endpgm
|
|
; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB68_2
|
|
; GCN2-NEXT: .LBB68_4: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_xchg_i64_addr64_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
|
|
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_mov_b32 s4, -1
|
|
; GFX12-NEXT: s_cbranch_vccnz .LBB68_3
|
|
; GFX12-NEXT: ; %bb.1: ; %Flow
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB68_4
|
|
; GFX12-NEXT: .LBB68_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT: s_endpgm
|
|
; GFX12-NEXT: .LBB68_3: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execnz .LBB68_2
|
|
; GFX12-NEXT: .LBB68_4: ; %atomicrmw.private
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; Checks (autogenerated — regenerate with update_llc_test_checks.py, do not hand-edit)
; indexed + returning variant: aperture-predicated flat atomic vs. scratch
; load+store; the old value reaching %atomicrmw.end is stored to %out2.
|
|
; GCN1-LABEL: atomic_xchg_i64_ret_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB69_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execz .LBB69_3
|
|
; GCN1-NEXT: s_branch .LBB69_4
|
|
; GCN1-NEXT: .LBB69_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB69_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s8
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s9
|
|
; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB69_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(2)
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_xchg_i64_ret_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB69_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execz .LBB69_3
|
|
; GCN2-NEXT: s_branch .LBB69_4
|
|
; GCN2-NEXT: .LBB69_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB69_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s8
|
|
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s9
|
|
; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB69_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(2)
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
|
|
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
|
|
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB69_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
|
|
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execz .LBB69_3
|
|
; GFX12-NEXT: s_branch .LBB69_4
|
|
; GFX12-NEXT: .LBB69_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB69_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
|
|
; GFX12-NEXT: .LBB69_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
; Checks (autogenerated — regenerate with update_llc_test_checks.py, do not hand-edit)
; the zero-offset baseline: same aperture-predicated expansion, with the compare
; done directly on the incoming pointer's high word (no address arithmetic first).
|
|
; GCN1-LABEL: atomic_xchg_i64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cmp_eq_u32 s5, s0
|
|
; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[0:1], -1
|
|
; GCN1-NEXT: s_cbranch_vccnz .LBB70_3
|
|
; GCN1-NEXT: ; %bb.1: ; %Flow
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB70_4
|
|
; GCN1-NEXT: .LBB70_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT: s_endpgm
|
|
; GCN1-NEXT: .LBB70_3: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB70_2
|
|
; GCN1-NEXT: .LBB70_4: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s4, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_xchg_i64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cmp_eq_u32 s5, s0
|
|
; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[0:1], -1
|
|
; GCN2-NEXT: s_cbranch_vccnz .LBB70_3
|
|
; GCN2-NEXT: ; %bb.1: ; %Flow
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB70_4
|
|
; GCN2-NEXT: .LBB70_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT: s_endpgm
|
|
; GCN2-NEXT: .LBB70_3: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB70_2
|
|
; GCN2-NEXT: .LBB70_4: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s4, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_xchg_i64:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_mov_b32 s4, -1
|
|
; GFX12-NEXT: s_cbranch_vccnz .LBB70_3
|
|
; GFX12-NEXT: ; %bb.1: ; %Flow
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB70_4
|
|
; GFX12-NEXT: .LBB70_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT: s_endpgm
|
|
; GFX12-NEXT: .LBB70_3: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execnz .LBB70_2
|
|
; GFX12-NEXT: .LBB70_4: ; %atomicrmw.private
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
|
|
; GCN1-LABEL: atomic_xchg_i64_ret:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_cmp_eq_u32 s5, s8
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB71_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execz .LBB71_3
|
|
; GCN1-NEXT: s_branch .LBB71_4
|
|
; GCN1-NEXT: .LBB71_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB71_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s2, s4, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s1
|
|
; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB71_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(2)
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_xchg_i64_ret:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_cmp_eq_u32 s5, s8
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB71_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execz .LBB71_3
|
|
; GCN2-NEXT: s_branch .LBB71_4
|
|
; GCN2-NEXT: .LBB71_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB71_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GCN2-NEXT: s_cselect_b32 s2, s4, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s1
|
|
; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB71_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(2)
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_xchg_i64_ret:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
|
|
; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s5, s3
|
|
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB71_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
|
|
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
|
|
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execz .LBB71_3
|
|
; GFX12-NEXT: s_branch .LBB71_4
|
|
; GFX12-NEXT: .LBB71_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB71_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
|
|
; GFX12-NEXT: s_cselect_b32 s2, s4, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
|
|
; GFX12-NEXT: .LBB71_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) {
; Runtime address-space predication for a 64-bit flat atomic xchg on a flat
; pointer: the high dword of the computed address is compared against the
; private aperture (s_load_dword from the kernarg segment on GCN1/GCN2,
; src_private_base on GFX12) and the code branches between the real
; flat_atomic_swap_x2 in %atomicrmw.global and a plain (non-atomic) scratch
; store expansion in %atomicrmw.private. A null private pointer is remapped
; to -1 before the scratch access (s_cselect_b32 s0, s0, -1).
; GCN1-LABEL: atomic_xchg_i64_addr64:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT:    s_mov_b32 s14, -1
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN1-NEXT:    s_add_u32 s12, s12, s9
; GCN1-NEXT:    s_addc_u32 s13, s13, 0
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT:    s_mov_b64 s[2:3], -1
; GCN1-NEXT:    s_cbranch_vccnz .LBB72_3
; GCN1-NEXT:  ; %bb.1: ; %Flow
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT:    s_cbranch_vccz .LBB72_4
; GCN1-NEXT:  .LBB72_2: ; %atomicrmw.phi
; GCN1-NEXT:    s_endpgm
; GCN1-NEXT:  .LBB72_3: ; %atomicrmw.global
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v2, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v3, s7
; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_cbranch_execnz .LBB72_2
; GCN1-NEXT:  .LBB72_4: ; %atomicrmw.private
; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
; GCN1-NEXT:    v_mov_b32_e32 v1, s0
; GCN1-NEXT:    s_add_i32 s0, s0, 4
; GCN1-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
; GCN1-NEXT:    v_mov_b32_e32 v0, s7
; GCN1-NEXT:    v_mov_b32_e32 v1, s0
; GCN1-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_xchg_i64_addr64:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT:    s_mov_b32 s90, -1
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
; GCN2-NEXT:    s_add_u32 s88, s88, s9
; GCN2-NEXT:    s_addc_u32 s89, s89, 0
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT:    s_mov_b64 s[2:3], -1
; GCN2-NEXT:    s_cbranch_vccnz .LBB72_3
; GCN2-NEXT:  ; %bb.1: ; %Flow
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT:    s_cbranch_vccz .LBB72_4
; GCN2-NEXT:  .LBB72_2: ; %atomicrmw.phi
; GCN2-NEXT:    s_endpgm
; GCN2-NEXT:  .LBB72_3: ; %atomicrmw.global
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v2, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v3, s7
; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_cbranch_execnz .LBB72_2
; GCN2-NEXT:  .LBB72_4: ; %atomicrmw.private
; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s0
; GCN2-NEXT:    s_add_i32 s0, s0, 4
; GCN2-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
; GCN2-NEXT:    v_mov_b32_e32 v0, s7
; GCN2-NEXT:    v_mov_b32_e32 v1, s0
; GCN2-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
; GCN2-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_addr64:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT:    s_mov_b32 s4, -1
; GFX12-NEXT:    s_cbranch_vccnz .LBB72_3
; GFX12-NEXT:  ; %bb.1: ; %Flow
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT:    s_cbranch_vccz .LBB72_4
; GFX12-NEXT:  .LBB72_2: ; %atomicrmw.phi
; GFX12-NEXT:    s_endpgm
; GFX12-NEXT:  .LBB72_3: ; %atomicrmw.global
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_cbranch_execnz .LBB72_2
; GFX12-NEXT:  .LBB72_4: ; %atomicrmw.private
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; Used-result variant of the predicated 64-bit flat xchg: the private path
; (%atomicrmw.private) must also produce the old value, so it loads the two
; dwords from scratch before overwriting them, and both paths merge in
; %atomicrmw.end where the result is stored to %out2. Note the
; implicit-def of $vgpr0_vgpr1 on the fall-through that skips the global
; atomic.
; GCN1-LABEL: atomic_xchg_i64_ret_addr64:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT:    s_mov_b32 s14, -1
; GCN1-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN1-NEXT:    s_add_u32 s12, s12, s9
; GCN1-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT:    s_addc_u32 s13, s13, 0
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT:    s_cbranch_vccz .LBB73_2
; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v2, s8
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v3, s9
; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_cbranch_execz .LBB73_3
; GCN1-NEXT:    s_branch .LBB73_4
; GCN1-NEXT:  .LBB73_2:
; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT:  .LBB73_3: ; %atomicrmw.private
; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT:    v_mov_b32_e32 v4, s8
; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    s_add_i32 s0, s0, 4
; GCN1-NEXT:    v_mov_b32_e32 v3, s0
; GCN1-NEXT:    buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT:    buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT:    v_mov_b32_e32 v2, s9
; GCN1-NEXT:    buffer_store_dword v2, v3, s[12:15], 0 offen
; GCN1-NEXT:  .LBB73_4: ; %atomicrmw.end
; GCN1-NEXT:    v_mov_b32_e32 v2, s6
; GCN1-NEXT:    v_mov_b32_e32 v3, s7
; GCN1-NEXT:    s_waitcnt vmcnt(2)
; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_xchg_i64_ret_addr64:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT:    s_mov_b32 s90, -1
; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
; GCN2-NEXT:    s_add_u32 s88, s88, s9
; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT:    s_addc_u32 s89, s89, 0
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT:    s_cbranch_vccz .LBB73_2
; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v2, s8
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v3, s9
; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_cbranch_execz .LBB73_3
; GCN2-NEXT:    s_branch .LBB73_4
; GCN2-NEXT:  .LBB73_2:
; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT:  .LBB73_3: ; %atomicrmw.private
; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    s_add_i32 s0, s0, 4
; GCN2-NEXT:    v_mov_b32_e32 v3, s0
; GCN2-NEXT:    buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT:    buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT:    v_mov_b32_e32 v4, s8
; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT:    v_mov_b32_e32 v2, s9
; GCN2-NEXT:    buffer_store_dword v2, v3, s[88:91], 0 offen
; GCN2-NEXT:  .LBB73_4: ; %atomicrmw.end
; GCN2-NEXT:    v_mov_b32_e32 v2, s6
; GCN2-NEXT:    v_mov_b32_e32 v3, s7
; GCN2-NEXT:    s_waitcnt vmcnt(2)
; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT:    s_cbranch_vccz .LBB73_2
; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_cbranch_execz .LBB73_3
; GFX12-NEXT:    s_branch .LBB73_4
; GFX12-NEXT:  .LBB73_2:
; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT:  .LBB73_3: ; %atomicrmw.private
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT:  .LBB73_4: ; %atomicrmw.end
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
; Predicated 64-bit flat xor with a constant byte offset (+32): after the
; aperture compare, the private path cannot use an atomic, so it expands to
; read-modify-write on scratch (two dword loads, v_xor_b32 on each half,
; two dword stores); the global path uses flat_atomic_xor_x2.
; GCN1-LABEL: atomic_xor_i64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT:    s_mov_b32 s14, -1
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x3d
; GCN1-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN1-NEXT:    s_add_u32 s12, s12, s9
; GCN1-NEXT:    s_addc_u32 s13, s13, 0
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s4, 32
; GCN1-NEXT:    s_addc_u32 s1, s5, 0
; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT:    s_mov_b64 s[2:3], -1
; GCN1-NEXT:    s_cbranch_vccnz .LBB74_3
; GCN1-NEXT:  ; %bb.1: ; %Flow
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT:    s_cbranch_vccz .LBB74_4
; GCN1-NEXT:  .LBB74_2: ; %atomicrmw.phi
; GCN1-NEXT:    s_endpgm
; GCN1-NEXT:  .LBB74_3: ; %atomicrmw.global
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v2, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v3, s7
; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_cbranch_execnz .LBB74_2
; GCN1-NEXT:  .LBB74_4: ; %atomicrmw.private
; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    s_add_i32 s0, s0, 4
; GCN1-NEXT:    v_mov_b32_e32 v1, s0
; GCN1-NEXT:    buffer_load_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT:    buffer_load_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT:    s_waitcnt vmcnt(1)
; GCN1-NEXT:    v_xor_b32_e32 v2, s6, v2
; GCN1-NEXT:    s_waitcnt vmcnt(0)
; GCN1-NEXT:    v_xor_b32_e32 v3, s7, v3
; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_xor_i64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT:    s_mov_b32 s90, -1
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xf4
; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
; GCN2-NEXT:    s_add_u32 s88, s88, s9
; GCN2-NEXT:    s_addc_u32 s89, s89, 0
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s4, 32
; GCN2-NEXT:    s_addc_u32 s1, s5, 0
; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT:    s_mov_b64 s[2:3], -1
; GCN2-NEXT:    s_cbranch_vccnz .LBB74_3
; GCN2-NEXT:  ; %bb.1: ; %Flow
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT:    s_cbranch_vccz .LBB74_4
; GCN2-NEXT:  .LBB74_2: ; %atomicrmw.phi
; GCN2-NEXT:    s_endpgm
; GCN2-NEXT:  .LBB74_3: ; %atomicrmw.global
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v2, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v3, s7
; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_cbranch_execnz .LBB74_2
; GCN2-NEXT:  .LBB74_4: ; %atomicrmw.private
; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    s_add_i32 s0, s0, 4
; GCN2-NEXT:    v_mov_b32_e32 v1, s0
; GCN2-NEXT:    buffer_load_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT:    buffer_load_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT:    s_waitcnt vmcnt(1)
; GCN2-NEXT:    v_xor_b32_e32 v2, s6, v2
; GCN2-NEXT:    s_waitcnt vmcnt(0)
; GCN2-NEXT:    v_xor_b32_e32 v3, s7, v3
; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT:    s_mov_b32 s4, -1
; GFX12-NEXT:    s_cbranch_vccnz .LBB74_3
; GFX12-NEXT:  ; %bb.1: ; %Flow
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT:    s_cbranch_vccz .LBB74_4
; GFX12-NEXT:  .LBB74_2: ; %atomicrmw.phi
; GFX12-NEXT:    s_endpgm
; GFX12-NEXT:  .LBB74_3: ; %atomicrmw.global
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_cbranch_execnz .LBB74_2
; GFX12-NEXT:  .LBB74_4: ; %atomicrmw.private
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_xor_b32_e32 v1, s3, v1
; GFX12-NEXT:    v_xor_b32_e32 v0, s2, v0
; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; Used-result variant of the predicated 64-bit flat xor with offset: the
; private path keeps the loaded old value live in v0/v1 (xor results go to
; separate registers) so it can flow into %atomicrmw.end and be stored to
; %out2; the fall-through that bypasses the global atomic carries an
; implicit-def of $vgpr0_vgpr1.
; GCN1-LABEL: atomic_xor_i64_ret_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT:    s_mov_b32 s14, -1
; GCN1-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT:    s_add_u32 s12, s12, s9
; GCN1-NEXT:    s_addc_u32 s13, s13, 0
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s2, s4, 32
; GCN1-NEXT:    s_addc_u32 s3, s5, 0
; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GCN1-NEXT:    s_cbranch_vccz .LBB75_2
; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_cbranch_execz .LBB75_3
; GCN1-NEXT:    s_branch .LBB75_4
; GCN1-NEXT:  .LBB75_2:
; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT:  .LBB75_3: ; %atomicrmw.private
; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
; GCN1-NEXT:    s_cselect_b32 s2, s2, -1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    s_add_i32 s2, s2, 4
; GCN1-NEXT:    v_mov_b32_e32 v3, s2
; GCN1-NEXT:    buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT:    buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT:    s_waitcnt vmcnt(1)
; GCN1-NEXT:    v_xor_b32_e32 v4, s0, v0
; GCN1-NEXT:    s_waitcnt vmcnt(0)
; GCN1-NEXT:    v_xor_b32_e32 v5, s1, v1
; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT:  .LBB75_4: ; %atomicrmw.end
; GCN1-NEXT:    v_mov_b32_e32 v2, s6
; GCN1-NEXT:    v_mov_b32_e32 v3, s7
; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_xor_i64_ret_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT:    s_mov_b32 s90, -1
; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT:    s_add_u32 s88, s88, s9
; GCN2-NEXT:    s_addc_u32 s89, s89, 0
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s2, s4, 32
; GCN2-NEXT:    s_addc_u32 s3, s5, 0
; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GCN2-NEXT:    s_cbranch_vccz .LBB75_2
; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_cbranch_execz .LBB75_3
; GCN2-NEXT:    s_branch .LBB75_4
; GCN2-NEXT:  .LBB75_2:
; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT:  .LBB75_3: ; %atomicrmw.private
; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    s_add_i32 s2, s2, 4
; GCN2-NEXT:    v_mov_b32_e32 v3, s2
; GCN2-NEXT:    buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT:    buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT:    s_waitcnt vmcnt(1)
; GCN2-NEXT:    v_xor_b32_e32 v4, s0, v0
; GCN2-NEXT:    s_waitcnt vmcnt(0)
; GCN2-NEXT:    v_xor_b32_e32 v5, s1, v1
; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT:  .LBB75_4: ; %atomicrmw.end
; GCN2-NEXT:    v_mov_b32_e32 v2, s6
; GCN2-NEXT:    v_mov_b32_e32 v3, s7
; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], 32
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_cmp_eq_u32 s3, s9
; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT:    s_cbranch_vccz .LBB75_2
; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_cbranch_execz .LBB75_3
; GFX12-NEXT:    s_branch .LBB75_4
; GFX12-NEXT:  .LBB75_2:
; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT:  .LBB75_3: ; %atomicrmw.private
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_cmp_lg_u64 s[2:3], 0
; GFX12-NEXT:    s_cselect_b32 s2, s2, -1
; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_xor_b32_e32 v3, s1, v1
; GFX12-NEXT:    v_xor_b32_e32 v2, s0, v0
; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT:  .LBB75_4: ; %atomicrmw.end
; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; Predicated 64-bit flat xor with a scaled index plus constant offset: the
; address is computed first (shl-by-3, 64-bit add, +32), then its high dword
; is compared against the private aperture to pick between the
; flat_atomic_xor_x2 path and the scratch load/xor/store expansion.
; GCN1-LABEL: atomic_xor_i64_addr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT:    s_mov_b32 s14, -1
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN1-NEXT:    s_add_u32 s12, s12, s9
; GCN1-NEXT:    s_addc_u32 s13, s13, 0
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    s_add_u32 s0, s0, 32
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT:    s_mov_b64 s[2:3], -1
; GCN1-NEXT:    s_cbranch_vccnz .LBB76_3
; GCN1-NEXT:  ; %bb.1: ; %Flow
; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT:    s_cbranch_vccz .LBB76_4
; GCN1-NEXT:  .LBB76_2: ; %atomicrmw.phi
; GCN1-NEXT:    s_endpgm
; GCN1-NEXT:  .LBB76_3: ; %atomicrmw.global
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v2, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v3, s7
; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_cbranch_execnz .LBB76_2
; GCN1-NEXT:  .LBB76_4: ; %atomicrmw.private
; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    s_add_i32 s0, s0, 4
; GCN1-NEXT:    v_mov_b32_e32 v1, s0
; GCN1-NEXT:    buffer_load_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT:    buffer_load_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT:    s_waitcnt vmcnt(1)
; GCN1-NEXT:    v_xor_b32_e32 v2, s6, v2
; GCN1-NEXT:    s_waitcnt vmcnt(0)
; GCN1-NEXT:    v_xor_b32_e32 v3, s7, v3
; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_xor_i64_addr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT:    s_mov_b32 s90, -1
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
; GCN2-NEXT:    s_add_u32 s88, s88, s9
; GCN2-NEXT:    s_addc_u32 s89, s89, 0
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    s_add_u32 s0, s0, 32
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT:    s_mov_b64 s[2:3], -1
; GCN2-NEXT:    s_cbranch_vccnz .LBB76_3
; GCN2-NEXT:  ; %bb.1: ; %Flow
; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT:    s_cbranch_vccz .LBB76_4
; GCN2-NEXT:  .LBB76_2: ; %atomicrmw.phi
; GCN2-NEXT:    s_endpgm
; GCN2-NEXT:  .LBB76_3: ; %atomicrmw.global
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v2, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v3, s7
; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_cbranch_execnz .LBB76_2
; GCN2-NEXT:  .LBB76_4: ; %atomicrmw.private
; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    s_add_i32 s0, s0, 4
; GCN2-NEXT:    v_mov_b32_e32 v1, s0
; GCN2-NEXT:    buffer_load_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT:    buffer_load_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT:    s_waitcnt vmcnt(1)
; GCN2-NEXT:    v_xor_b32_e32 v2, s6, v2
; GCN2-NEXT:    s_waitcnt vmcnt(0)
; GCN2-NEXT:    v_xor_b32_e32 v3, s7, v3
; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_addr64_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT:    s_mov_b32 s4, -1
; GFX12-NEXT:    s_cbranch_vccnz .LBB76_3
; GFX12-NEXT:  ; %bb.1: ; %Flow
; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT:    s_cbranch_vccz .LBB76_4
; GFX12-NEXT:  .LBB76_2: ; %atomicrmw.phi
; GFX12-NEXT:    s_endpgm
; GFX12-NEXT:  .LBB76_3: ; %atomicrmw.global
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_cbranch_execnz .LBB76_2
; GFX12-NEXT:  .LBB76_4: ; %atomicrmw.private
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_xor_b32_e32 v1, s3, v1
; GFX12-NEXT:    v_xor_b32_e32 v0, s2, v0
; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Flat (generic) 64-bit atomicrmw xor with a used result, indexed + constant
; offset addressing. Since the flat pointer may resolve to scratch (where
; 64-bit atomics are not supported), codegen inserts a runtime address-space
; check: the pointer's high dword is compared against the private aperture
; base (src_private_base on GFX12; an SGPR loaded from the implicit kernarg
; aperture on GCN1/GCN2). The %atomicrmw.global path issues the real
; flat_atomic_xor instruction; the %atomicrmw.private path expands to a
; non-atomic scratch load / v_xor / store sequence. The returned old value
; flows through %atomicrmw.end into the flat_store of %out2.
define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_xor_i64_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB77_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB77_3
; GCN1-NEXT: s_branch .LBB77_4
; GCN1-NEXT: .LBB77_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB77_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_xor_b32_e32 v4, s8, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_xor_b32_e32 v5, s9, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB77_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB77_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB77_3
; GCN2-NEXT: s_branch .LBB77_4
; GCN2-NEXT: .LBB77_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB77_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_xor_b32_e32 v4, s8, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_xor_b32_e32 v5, s9, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB77_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB77_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB77_3
; GFX12-NEXT: s_branch .LBB77_4
; GFX12-NEXT: .LBB77_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB77_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_xor_b32_e32 v3, s5, v1
; GFX12-NEXT: v_xor_b32_e32 v2, s4, v0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB77_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
; IR under test: no noalias.addrspace metadata on the atomicrmw, so the
; scratch case cannot be statically excluded and the predicated expansion
; above is required.
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
store i64 %tmp0, ptr %out2
ret void
}
|
|
|
|
; Flat 64-bit atomicrmw xor, result unused. Codegen predicates on a runtime
; address-space test (pointer high dword vs. private aperture base); the
; no-result form uses a Flow/phi diamond: %atomicrmw.global runs
; flat_atomic_xor (no glc/RETURN) and %atomicrmw.private does a non-atomic
; scratch read-modify-write, both rejoining at %atomicrmw.phi -> s_endpgm.
define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_xor_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s0
; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_mov_b64 s[0:1], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB78_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_cbranch_vccz .LBB78_4
; GCN1-NEXT: .LBB78_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB78_2
; GCN1-NEXT: .LBB78_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GCN1-NEXT: s_cselect_b32 s0, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3
; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s0
; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB78_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_cbranch_vccz .LBB78_4
; GCN2-NEXT: .LBB78_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB78_2
; GCN2-NEXT: .LBB78_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s0, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3
; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB78_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB78_4
; GFX12-NEXT: .LBB78_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB78_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB78_2
; GFX12-NEXT: .LBB78_4: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_xor_b32_e32 v1, s3, v1
; GFX12-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
; IR under test: flat-pointer atomicrmw with no noalias.addrspace metadata,
; so the private-address case must be handled at runtime.
entry:
%tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
; Flat 64-bit atomicrmw xor returning the old value. The runtime aperture
; compare selects between flat_atomic_xor with returned data (glc /
; TH_ATOMIC_RETURN) and the non-atomic scratch load/xor/store expansion; the
; .LBB79_2 path marks v[0:1] implicit-def for the phi, and %atomicrmw.end
; stores the result to %out2.
define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_xor_i64_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s8
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB79_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB79_3
; GCN1-NEXT: s_branch .LBB79_4
; GCN1-NEXT: .LBB79_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB79_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s2, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_xor_b32_e32 v4, s0, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_xor_b32_e32 v5, s1, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB79_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s8
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB79_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB79_3
; GCN2-NEXT: s_branch .LBB79_4
; GCN2-NEXT: .LBB79_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB79_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s2, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_xor_b32_e32 v4, s0, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_xor_b32_e32 v5, s1, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB79_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s5, s3
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX12-NEXT: s_cbranch_vccz .LBB79_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB79_3
; GFX12-NEXT: s_branch .LBB79_4
; GFX12-NEXT: .LBB79_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB79_3: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX12-NEXT: s_cselect_b32 s2, s4, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_xor_b32_e32 v3, s1, v1
; GFX12-NEXT: v_xor_b32_e32 v2, s0, v0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB79_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
; IR under test: returned-value variant of the flat atomicrmw xor expansion.
entry:
%tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst
store i64 %tmp0, ptr %out2
ret void
}
|
|
|
|
; Flat 64-bit atomicrmw xor with register-indexed addressing (no constant
; offset), result unused. The index is scaled by 8 (s_lshl_b64 ... 3) and
; added to the base before the aperture compare; control flow matches the
; no-result diamond of @atomic_xor_i64 (Flow / atomicrmw.phi).
define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_xor_i64_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB80_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB80_4
; GCN1-NEXT: .LBB80_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB80_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB80_2
; GCN1-NEXT: .LBB80_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v1, s0
; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3
; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB80_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB80_4
; GCN2-NEXT: .LBB80_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB80_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB80_2
; GCN2-NEXT: .LBB80_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v1, s0
; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3
; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB80_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB80_4
; GFX12-NEXT: .LBB80_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB80_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB80_2
; GFX12-NEXT: .LBB80_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_xor_b32_e32 v1, s3, v1
; GFX12-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
; IR under test: indexed flat atomicrmw xor, no noalias.addrspace metadata.
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
; Flat 64-bit atomicrmw xor, register-indexed and returning the old value.
; Same predicated global/private expansion as the other *_ret variants:
; the atomic result (or the pre-modification scratch value) reaches
; %atomicrmw.end in v[0:1] and is stored to %out2.
define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_xor_i64_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB81_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB81_3
; GCN1-NEXT: s_branch .LBB81_4
; GCN1-NEXT: .LBB81_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB81_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_xor_b32_e32 v4, s8, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_xor_b32_e32 v5, s9, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB81_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB81_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB81_3
; GCN2-NEXT: s_branch .LBB81_4
; GCN2-NEXT: .LBB81_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB81_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_xor_b32_e32 v4, s8, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_xor_b32_e32 v5, s9, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB81_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_cmp_eq_u32 s1, s9
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB81_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB81_3
; GFX12-NEXT: s_branch .LBB81_4
; GFX12-NEXT: .LBB81_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB81_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_xor_b32_e32 v3, s5, v1
; GFX12-NEXT: v_xor_b32_e32 v2, s4, v0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB81_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
; IR under test: indexed, returned-value flat atomicrmw xor.
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst
store i64 %tmp0, ptr %out2
ret void
}
|
|
|
|
; seq_cst 64-bit atomic load through a flat pointer with a 32-byte offset.
; Unlike the atomicrmw cases above, a plain atomic load needs no
; private-address predication: all three targets emit a single flat load
; (glc / SCOPE_SYS) followed by the cache invalidate and a flat store of
; the loaded value to %out.
define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %in, i64 4
%val = load atomic i64, ptr %gep seq_cst, align 8
store i64 %val, ptr %out
ret void
}
define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  ; Baseline (no offset) agent-scope seq_cst atomic flat load: a single flat
  ; load plus the cache-invalidate each target requires, with no predication.
  %val = load atomic i64, ptr %in syncscope("agent") seq_cst, align 8
  store i64 %val, ptr %out
  ret void
}
define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_i64_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  ; Variable-index addressing plus a constant offset; the index add is done in
  ; scalar registers and GFX12 folds the +32 into the flat_load immediate.
  %ptr = getelementptr i64, ptr %in, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %val = load atomic i64, ptr %gep seq_cst, align 8
  store i64 %val, ptr %out
  ret void
}
define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_i64_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  ; Same as atomic_load_i64_addr64_offset but without the extra constant
  ; offset: only the scaled index add feeds the flat load address.
  %ptr = getelementptr i64, ptr %in, i64 %index
  %val = load atomic i64, ptr %ptr seq_cst, align 8
  store i64 %val, ptr %out
  ret void
}
define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_u32 s0, s2, 32
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: s_addc_u32 s1, s3, 0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_u32 s0, s2, 32
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: s_addc_u32 s1, s3, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  ; seq_cst atomic flat store at a constant offset; checks show a single
  ; flat store (GFX12 adds the global_wb release write-back first).
  %gep = getelementptr i64, ptr %out, i64 4
  store atomic i64 %in, ptr %gep seq_cst, align 8
  ret void
}
define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  ; Baseline seq_cst atomic flat store with no offset or indexing.
  store atomic i64 %in, ptr %out seq_cst, align 8
  ret void
}
define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_i64_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  ; Variable index plus constant offset; GFX12 folds the +32 into the store's
  ; offset field while CI/VI add it with scalar carry arithmetic.
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  store atomic i64 %in, ptr %gep seq_cst, align 8
  ret void
}
define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_i64_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  ; Variable-index seq_cst atomic flat store, no constant offset.
  %ptr = getelementptr i64, ptr %out, i64 %index
  store atomic i64 %in, ptr %ptr seq_cst, align 8
  ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) {
; GCN1-LABEL: atomic_cmpxchg_i64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s2, s4, 32
; GCN1-NEXT: s_addc_u32 s3, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v5, s3
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v4, s2
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s2, s4, 32
; GCN2-NEXT: s_addc_u32 s3, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s3
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s2
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  ; Discarded-result 64-bit flat cmpxchg at a constant offset; all prefixes
  ; emit a direct flat_atomic_cmpswap with no branching.
  ; NOTE(review): unlike the 64-bit atomicrmw tests in this file, no
  ; scratch-address predication appears here — confirm cmpxchg is intentionally
  ; exempt from the private-case expansion.
  %gep = getelementptr i64, ptr %out, i64 4
  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
  ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %old) {
; GCN1-LABEL: atomic_cmpxchg_i64_soffset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s2, s4, 0x11940
; GCN1-NEXT: s_addc_u32 s3, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v5, s3
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v4, s2
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i64_soffset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s2, s4, 0x11940
; GCN2-NEXT: s_addc_u32 s3, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s3
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s2
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_soffset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  ; Large constant offset (9000 * 8 = 72000 = 0x11940 bytes): GFX12 still
  ; folds it into the instruction's offset field; CI/VI add it in SALU.
  %gep = getelementptr i64, ptr %out, i64 9000
  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
  ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) {
; GCN1-LABEL: atomic_cmpxchg_i64_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: v_mov_b32_e32 v4, s0
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  ; Used-result variant: the cmpswap is emitted with the returning form
  ; (glc / TH_ATOMIC_RETURN) and the old value is stored to %out2.
  %gep = getelementptr i64, ptr %out, i64 4
  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
  %extract0 = extractvalue { i64, i1 } %val, 0
  store i64 %extract0, ptr %out2
  ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) {
; GCN1-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: v_mov_b32_e32 v4, s0
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  ; Discarded-result cmpxchg with variable index plus constant offset.
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
  ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) {
; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_u32 s0, s4, s2
; GCN1-NEXT: s_addc_u32 s3, s5, s3
; GCN1-NEXT: s_add_u32 s2, s0, 32
; GCN1-NEXT: s_addc_u32 s3, s3, 0
; GCN1-NEXT: v_mov_b32_e32 v5, s3
; GCN1-NEXT: v_mov_b32_e32 v0, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s9
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v4, s2
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_u32 s0, s4, s2
; GCN2-NEXT: s_addc_u32 s3, s5, s3
; GCN2-NEXT: s_add_u32 s2, s0, 32
; GCN2-NEXT: s_addc_u32 s3, s3, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s3
; GCN2-NEXT: v_mov_b32_e32 v0, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s2
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  ; Used-result cmpxchg with variable index plus constant offset; the loaded
  ; old value is written out to %out2 after the returning cmpswap.
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
  %extract0 = extractvalue { i64, i1 } %val, 0
  store i64 %extract0, ptr %out2
  ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GCN1-LABEL: atomic_cmpxchg_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v4, s4
; GCN1-NEXT: v_mov_b32_e32 v5, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v4, s4
; GCN2-NEXT: v_mov_b32_e32 v5, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  ; Baseline discarded-result 64-bit flat cmpxchg (no offset, no indexing).
  %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
  ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) {
; GCN1-LABEL: atomic_cmpxchg_i64_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v4, s0
; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  ; Baseline used-result cmpxchg: returning cmpswap, old value stored to %out2.
  %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
  %extract0 = extractvalue { i64, i1 } %val, 0
  store i64 %extract0, ptr %out2
  ret void
}
|
|
|
|
; 64-bit agent-scope seq_cst cmpxchg on a flat pointer computed from a
; 64-bit index (GEP scaled by 8); result unused, so the non-glc/non-return
; form of the cmpswap is expected.
define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) {
; GCN1-LABEL: atomic_cmpxchg_i64_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: v_mov_b32_e32 v4, s0
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
  ret void
}
|
|
|
|
; 64-bit agent-scope seq_cst cmpxchg on an indexed flat pointer with the
; returned value used: expects the glc / TH_ATOMIC_RETURN form, followed by
; a flat store of the old value to %out2.
define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) {
; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
; GCN1-NEXT: s_add_u32 s2, s4, s2
; GCN1-NEXT: s_addc_u32 s3, s5, s3
; GCN1-NEXT: v_mov_b32_e32 v5, s3
; GCN1-NEXT: v_mov_b32_e32 v0, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s9
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v4, s2
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
; GCN2-NEXT: s_add_u32 s2, s4, s2
; GCN2-NEXT: s_addc_u32 s3, s5, s3
; GCN2-NEXT: v_mov_b32_e32 v5, s3
; GCN2-NEXT: v_mov_b32_e32 v0, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s2
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
  %extract0 = extractvalue { i64, i1 } %val, 0
  store i64 %extract0, ptr %out2
  ret void
}
|
|
|
|
; seq_cst (system scope) atomic load of f64 from flat address + 32-byte
; offset; lowered to a glc flat load (or offset:32 scope:SCOPE_SYS on GFX12)
; plus cache invalidate — no address-space predication is needed for loads.
define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr double, ptr %in, i64 4
  %val = load atomic double, ptr %gep seq_cst, align 8
  store double %val, ptr %out
  ret void
}
|
|
|
|
; Agent-scope seq_cst atomic load of f64 from an unoffset flat pointer;
; GFX12 uses SCOPE_DEV (agent) rather than SCOPE_SYS here.
define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8
  store double %val, ptr %out
  ret void
}
|
|
|
|
; seq_cst atomic f64 load from an indexed flat pointer plus a 32-byte
; offset; the index is scaled by 8 and folded into the base address, with
; the constant offset folded into the instruction on GFX12 only.
define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_f64_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr double, ptr %in, i64 %index
  %gep = getelementptr double, ptr %ptr, i64 4
  %val = load atomic double, ptr %gep seq_cst, align 8
  store double %val, ptr %out
  ret void
}
|
|
|
|
; seq_cst atomic f64 load from an indexed flat pointer (no extra constant
; offset); same lowering shape as the _offset variant minus the +32.
define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_f64_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr double, ptr %in, i64 %index
  %val = load atomic double, ptr %ptr seq_cst, align 8
  store double %val, ptr %out
  ret void
}
|
|
|
|
; seq_cst atomic f64 store to flat address + 32 bytes; GFX12 emits a
; global_wb (writeback) before the scoped store instead of a cache
; invalidate.
define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) {
; GCN1-LABEL: atomic_store_f64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: s_add_u32 s0, s2, 32
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: s_addc_u32 s1, s3, 0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_f64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: s_add_u32 s0, s2, 32
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: s_addc_u32 s1, s3, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr double, ptr %out, i64 4
  store atomic double %in, ptr %gep seq_cst, align 8
  ret void
}
|
|
|
|
; seq_cst atomic f64 store to an unoffset flat pointer; lowers to a plain
; flat_store_dwordx2 on GCN1/GCN2 and a scoped store with global_wb on
; GFX12.
define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) {
; GCN1-LABEL: atomic_store_f64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_f64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  store atomic double %in, ptr %out seq_cst, align 8
  ret void
}
|
|
|
|
; seq_cst atomic f64 store to an indexed flat pointer + 32-byte offset;
; the scaled index and constant offset are folded into the scalar address
; (offset folded into the instruction on GFX12).
define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_f64_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_f64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr double, ptr %out, i64 %index
  %gep = getelementptr double, ptr %ptr, i64 4
  store atomic double %in, ptr %gep seq_cst, align 8
  ret void
}
|
|
|
|
; seq_cst atomic f64 store to an indexed flat pointer (no constant
; offset); same shape as the _offset variant without the +32 adjustment.
; NOTE(review): f64 atomic loads/stores show no private-address
; predication in these checks — presumably only the RMW expansions need
; it; confirm against AtomicExpand behavior if checks are regenerated.
define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_f64_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_f64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr double, ptr %out, i64 %index
  store atomic double %in, ptr %ptr seq_cst, align 8
  ret void
}
|
|
|
|
; 64-bit uinc_wrap on a flat pointer: this exercises the expansion from the
; HEAD change — a runtime check of the address's high 32 bits against the
; private (scratch) aperture (src_private_base / the queue-ptr-loaded
; aperture on GCN1/GCN2) selects between the real flat_atomic_inc_x2 in
; %atomicrmw.global and a non-atomic scratch load / inc-with-wrap / store
; sequence in %atomicrmw.private, since 64-bit flat atomics do not work on
; scratch addresses.
define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_inc_i64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 32
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB107_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB107_4
; GCN1-NEXT: .LBB107_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB107_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB107_2
; GCN1-NEXT: .LBB107_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 32
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB107_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB107_4
; GCN2-NEXT: .LBB107_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB107_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB107_2
; GCN2-NEXT: .LBB107_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB107_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB107_4
; GFX12-NEXT: .LBB107_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB107_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB107_2
; GFX12-NEXT: .LBB107_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
|
|
; GCN1-LABEL: atomic_inc_i64_ret_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s2, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s3, s5, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s3, s8
|
|
; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB108_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execz .LBB108_3
|
|
; GCN1-NEXT: s_branch .LBB108_4
|
|
; GCN1-NEXT: .LBB108_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB108_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
|
|
; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
|
|
; GCN1-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
|
|
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB108_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_inc_i64_ret_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s2, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s3, s5, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s3, s8
|
|
; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB108_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execz .LBB108_3
|
|
; GCN2-NEXT: s_branch .LBB108_4
|
|
; GCN2-NEXT: .LBB108_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB108_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
|
|
; GCN2-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
|
|
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB108_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_inc_i64_ret_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
|
|
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_cmp_eq_u32 s3, s9
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB108_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
|
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
|
|
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execz .LBB108_3
|
|
; GFX12-NEXT: s_branch .LBB108_4
|
|
; GFX12-NEXT: .LBB108_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB108_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
|
|
; GFX12-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
|
|
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
|
|
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
|
|
; GFX12-NEXT: .LBB108_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
; Flat-pointer i64 uinc_wrap (no return value) at %out + %index*8 + 32.
; The expansion compares the pointer's high 32 bits against a scalar value
; (src_private_base on GFX12; on GCN1/GCN2 a dword loaded from the kernarg
; segment — presumably the private aperture base, TODO confirm against the
; implicit-kernarg layout) and branches to %atomicrmw.global
; (flat_atomic_inc_x2 / flat_atomic_inc_u64) or to %atomicrmw.private,
; which emulates inc-wrap with scratch loads/stores: load 64 bits, add 1,
; compare the old value against %in, and select 0 on wrap.
; Check lines are autogenerated — regenerate with update_llc_test_checks.py
; instead of editing by hand.
define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_inc_i64_incr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB109_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB109_4
; GCN1-NEXT: .LBB109_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB109_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB109_2
; GCN1-NEXT: .LBB109_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i64_incr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB109_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB109_4
; GCN2-NEXT: .LBB109_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB109_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB109_2
; GCN2-NEXT: .LBB109_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_incr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB109_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB109_4
; GFX12-NEXT: .LBB109_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB109_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB109_2
; GFX12-NEXT: .LBB109_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  ; Address = %out + %index*8, then +4 i64 elements (32 bytes), matching the
  ; s_lshl_b64 ... 3 / add / add 32 sequence in all three check blocks.
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  ; Flat pointer, no noalias.addrspace metadata, so AtomicExpand must insert
  ; the runtime private-vs-global predication checked above.
  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Same as atomic_inc_i64_incr64_offset but the atomic's old value is used:
; both paths merge at %atomicrmw.end and the result in v[0:1] is stored to
; %out2 with a flat store. The global path therefore uses the returning form
; of the atomic (glc / th:TH_ATOMIC_RETURN), and the private path leaves the
; pre-increment value in v[0:1] while storing the wrapped result back to
; scratch. Check lines are autogenerated — regenerate with
; update_llc_test_checks.py instead of editing by hand.
define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_inc_i64_ret_incr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB110_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB110_3
; GCN1-NEXT: s_branch .LBB110_4
; GCN1-NEXT: .LBB110_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB110_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB110_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i64_ret_incr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB110_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB110_3
; GCN2-NEXT: s_branch .LBB110_4
; GCN2-NEXT: .LBB110_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB110_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB110_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB110_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB110_3
; GFX12-NEXT: s_branch .LBB110_4
; GFX12-NEXT: .LBB110_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB110_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB110_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  ; Address = %out + %index*8 + 32, as in the non-ret variant above.
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  ; Result is live (stored to %out2), forcing the returning atomic forms.
  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Baseline flat i64 uinc_wrap with no offset and no used result. Same
; predication shape as the offset variants: compare the pointer's high dword
; against the private-aperture value (src_private_base on GFX12; a
; kernarg-segment dword on GCN1/GCN2 — presumably the aperture base, TODO
; confirm), then either a device-scope flat atomic inc or a scratch-based
; emulation (load 64 bits, add 1, wrap to 0 when the old value >= %in).
; Check lines are autogenerated — regenerate with update_llc_test_checks.py
; instead of editing by hand.
define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_inc_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s0
; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_mov_b64 s[0:1], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB111_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_cbranch_vccz .LBB111_4
; GCN1-NEXT: .LBB111_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB111_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB111_2
; GCN1-NEXT: .LBB111_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GCN1-NEXT: s_cselect_b32 s0, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s0
; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB111_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_cbranch_vccz .LBB111_4
; GCN2-NEXT: .LBB111_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB111_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB111_2
; GCN2-NEXT: .LBB111_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s0, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB111_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB111_4
; GFX12-NEXT: .LBB111_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB111_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB111_2
; GFX12-NEXT: .LBB111_4: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  ; Flat pointer straight from the kernarg; result unused, so both paths fall
  ; through to s_endpgm without a merge of the old value.
  %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Baseline flat i64 uinc_wrap whose old value is stored to a second flat
; pointer. Uses the returning atomic forms (glc / th:TH_ATOMIC_RETURN) on the
; global path; on the private path the pre-increment scratch value is kept in
; v[0:1] for the final flat store while the wrapped result is written back to
; scratch. The null flat pointer maps to scratch offset -1 via the
; s_cmp_lg_u64/cselect (v_cmp_ne_u64 on GCN1) sequence. Check lines are
; autogenerated — regenerate with update_llc_test_checks.py instead of
; editing by hand.
define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_inc_i64_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s8
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB112_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB112_3
; GCN1-NEXT: s_branch .LBB112_4
; GCN1-NEXT: .LBB112_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB112_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s2, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB112_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s8
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB112_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB112_3
; GCN2-NEXT: s_branch .LBB112_4
; GCN2-NEXT: .LBB112_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB112_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s2, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB112_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s5, s3
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX12-NEXT: s_cbranch_vccz .LBB112_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB112_3
; GFX12-NEXT: s_branch .LBB112_4
; GFX12-NEXT: .LBB112_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB112_3: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX12-NEXT: s_cselect_b32 s2, s4, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
; GFX12-NEXT: .LBB112_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  ; Old value is stored to %out2, so the expansion must carry it through both
  ; the global and private paths to the %atomicrmw.end merge block.
  %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_inc_i64_incr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN1-NEXT: s_cbranch_vccnz .LBB113_3
|
|
; GCN1-NEXT: ; %bb.1: ; %Flow
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB113_4
|
|
; GCN1-NEXT: .LBB113_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT: s_endpgm
|
|
; GCN1-NEXT: .LBB113_3: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB113_2
|
|
; GCN1-NEXT: .LBB113_4: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
|
|
; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_inc_i64_incr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN2-NEXT: s_cbranch_vccnz .LBB113_3
|
|
; GCN2-NEXT: ; %bb.1: ; %Flow
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB113_4
|
|
; GCN2-NEXT: .LBB113_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT: s_endpgm
|
|
; GCN2-NEXT: .LBB113_3: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB113_2
|
|
; GCN2-NEXT: .LBB113_4: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
|
|
; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_inc_i64_incr64:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_mov_b32 s4, -1
|
|
; GFX12-NEXT: s_cbranch_vccnz .LBB113_3
|
|
; GFX12-NEXT: ; %bb.1: ; %Flow
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB113_4
|
|
; GFX12-NEXT: .LBB113_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT: s_endpgm
|
|
; GFX12-NEXT: .LBB113_3: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execnz .LBB113_2
|
|
; GFX12-NEXT: .LBB113_4: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
|
|
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
|
|
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
|
|
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
|
|
ret void
|
|
}
define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_inc_i64_ret_incr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB114_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execz .LBB114_3
|
|
; GCN1-NEXT: s_branch .LBB114_4
|
|
; GCN1-NEXT: .LBB114_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB114_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
|
|
; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB114_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_inc_i64_ret_incr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB114_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execz .LBB114_3
|
|
; GCN2-NEXT: s_branch .LBB114_4
|
|
; GCN2-NEXT: .LBB114_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB114_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
|
|
; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB114_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_inc_i64_ret_incr64:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s9
|
|
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB114_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
|
|
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execz .LBB114_3
|
|
; GFX12-NEXT: s_branch .LBB114_4
|
|
; GFX12-NEXT: .LBB114_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB114_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
|
|
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
|
|
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
|
|
; GFX12-NEXT: .LBB114_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: atomic_dec_i64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s0, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN1-NEXT: s_cbranch_vccnz .LBB115_3
|
|
; GCN1-NEXT: ; %bb.1: ; %Flow
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB115_4
|
|
; GCN1-NEXT: .LBB115_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT: s_endpgm
|
|
; GCN1-NEXT: .LBB115_3: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB115_2
|
|
; GCN1-NEXT: .LBB115_4: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
|
|
; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
|
|
; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0
|
|
; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1]
|
|
; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
|
|
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_dec_i64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s0, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN2-NEXT: s_cbranch_vccnz .LBB115_3
|
|
; GCN2-NEXT: ; %bb.1: ; %Flow
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB115_4
|
|
; GCN2-NEXT: .LBB115_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT: s_endpgm
|
|
; GCN2-NEXT: .LBB115_3: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB115_2
|
|
; GCN2-NEXT: .LBB115_4: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
|
|
; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
|
|
; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0
|
|
; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1]
|
|
; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
|
|
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_dec_i64_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_mov_b32 s4, -1
|
|
; GFX12-NEXT: s_cbranch_vccnz .LBB115_3
|
|
; GFX12-NEXT: ; %bb.1: ; %Flow
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB115_4
|
|
; GFX12-NEXT: .LBB115_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT: s_endpgm
|
|
; GFX12-NEXT: .LBB115_3: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execnz .LBB115_2
|
|
; GFX12-NEXT: .LBB115_4: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s4, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
|
|
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1]
|
|
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
|
|
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
|
|
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
|
|
ret void
|
|
}
define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
|
|
; GCN1-LABEL: atomic_dec_i64_ret_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s2, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s3, s5, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s3, s8
|
|
; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB116_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execz .LBB116_3
|
|
; GCN1-NEXT: s_branch .LBB116_4
|
|
; GCN1-NEXT: .LBB116_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB116_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s1
|
|
; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
|
|
; GCN1-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s0
|
|
; GCN1-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
|
|
; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1]
|
|
; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3]
|
|
; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
|
|
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB116_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_dec_i64_ret_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s2, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s3, s5, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s3, s8
|
|
; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB116_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execz .LBB116_3
|
|
; GCN2-NEXT: s_branch .LBB116_4
|
|
; GCN2-NEXT: .LBB116_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB116_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
|
|
; GCN2-NEXT: s_cselect_b32 s2, s2, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: s_add_i32 s2, s2, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s0
|
|
; GCN2-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
|
|
; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1]
|
|
; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3]
|
|
; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
|
|
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB116_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_dec_i64_ret_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x34
|
|
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], 32
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s9
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB116_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execz .LBB116_3
|
|
; GFX12-NEXT: s_branch .LBB116_4
|
|
; GFX12-NEXT: .LBB116_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB116_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s4, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
|
|
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1]
|
|
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
|
|
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s3, s0
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s2, s0
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s4
|
|
; GFX12-NEXT: .LBB116_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_dec_i64_decr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN1-NEXT: s_cbranch_vccnz .LBB117_3
|
|
; GCN1-NEXT: ; %bb.1: ; %Flow
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB117_4
|
|
; GCN1-NEXT: .LBB117_2: ; %atomicrmw.phi
|
|
; GCN1-NEXT: s_endpgm
|
|
; GCN1-NEXT: .LBB117_3: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB117_2
|
|
; GCN1-NEXT: .LBB117_4: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
|
|
; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
|
|
; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0
|
|
; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1]
|
|
; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
|
|
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_dec_i64_decr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], -1
|
|
; GCN2-NEXT: s_cbranch_vccnz .LBB117_3
|
|
; GCN2-NEXT: ; %bb.1: ; %Flow
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB117_4
|
|
; GCN2-NEXT: .LBB117_2: ; %atomicrmw.phi
|
|
; GCN2-NEXT: s_endpgm
|
|
; GCN2-NEXT: .LBB117_3: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB117_2
|
|
; GCN2-NEXT: .LBB117_4: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
|
|
; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
|
|
; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0
|
|
; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1]
|
|
; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
|
|
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_dec_i64_decr64_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
|
|
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
|
|
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_mov_b32 s4, -1
|
|
; GFX12-NEXT: s_cbranch_vccnz .LBB117_3
|
|
; GFX12-NEXT: ; %bb.1: ; %Flow
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB117_4
|
|
; GFX12-NEXT: .LBB117_2: ; %atomicrmw.phi
|
|
; GFX12-NEXT: s_endpgm
|
|
; GFX12-NEXT: .LBB117_3: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execnz .LBB117_2
|
|
; GFX12-NEXT: .LBB117_4: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s4, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
|
|
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1]
|
|
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
|
|
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
|
|
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; Used-result udec_wrap on a flat (generic) pointer with a register index plus
; a constant 32-byte offset. Because the flat address may resolve to scratch,
; where 64-bit atomics don't work, codegen emits a runtime address-space test
; (compare the pointer's high word against the private aperture -- visible as
; src_private_base on GFX12; GCN1/GCN2 load the aperture via s_load_dword from
; the implicit kernarg area, presumably the queue/aperture info -- TODO confirm)
; and predicates between a real flat_atomic_dec (atomicrmw.global) and a
; non-atomic scratch load / decrement-with-wrap / store (atomicrmw.private).
; The returned pre-op value flows through %atomicrmw.end into the final store.
; GCN1-LABEL: atomic_dec_i64_ret_decr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB118_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s9
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB118_3
; GCN1-NEXT: s_branch .LBB118_4
; GCN1-NEXT: .LBB118_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB118_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s8
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s9
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1]
; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3]
; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB118_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i64_ret_decr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB118_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s9
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB118_3
; GCN2-NEXT: s_branch .LBB118_4
; GCN2-NEXT: .LBB118_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB118_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: v_mov_b32_e32 v4, s9
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1]
; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3]
; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB118_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
; GFX12-NEXT: s_cbranch_vccz .LBB118_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB118_3
; GFX12-NEXT: s_branch .LBB118_4
; GFX12-NEXT: .LBB118_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB118_3: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s6, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1]
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
; GFX12-NEXT: .LBB118_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; Simplest form: discarded-result udec_wrap directly on the flat kernel-arg
; pointer. The CHECK lines pin the runtime address-space predication: compare
; the pointer's high dword against the private aperture, branch through a Flow
; block to either the real 64-bit flat atomic (atomicrmw.global, with the
; agent-scope cache invalidate) or the non-atomic scratch expansion
; (atomicrmw.private: load pair, wrap-aware decrement via cmp/cndmask, store
; pair). A null flat pointer selects scratch offset -1 (s_cselect before the
; scratch access).
; GCN1-LABEL: atomic_dec_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s0
; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_mov_b64 s[0:1], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB119_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_cbranch_vccz .LBB119_4
; GCN1-NEXT: .LBB119_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB119_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB119_2
; GCN1-NEXT: .LBB119_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GCN1-NEXT: s_cselect_b32 s0, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0
; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3]
; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s0
; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB119_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_cbranch_vccz .LBB119_4
; GCN2-NEXT: .LBB119_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB119_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB119_2
; GCN2-NEXT: .LBB119_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s0, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0
; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s1, s5
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB119_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB119_4
; GFX12-NEXT: .LBB119_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB119_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB119_2
; GFX12-NEXT: .LBB119_4: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s4, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; Used-result variant of @atomic_dec_i64: the pre-op value must survive both
; predicated paths, so the expansion uses branch/fall-through structure
; (LBB120_1..4) with an implicit-def of the result VGPR pair on the skipped
; path and a final flat store of the loaded value in %atomicrmw.end. The
; private path performs the same non-atomic load / wrap-decrement / store on
; scratch that the real flat_atomic_dec (glc / TH_ATOMIC_RETURN) performs
; atomically on the global path.
; GCN1-LABEL: atomic_dec_i64_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_cmp_eq_u32 s5, s8
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB120_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execz .LBB120_3
; GCN1-NEXT: s_branch .LBB120_4
; GCN1-NEXT: .LBB120_2:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: .LBB120_3: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s1
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s2, s4, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_add_i32 s2, s2, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s2
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v5, s0
; GCN1-NEXT: s_waitcnt vmcnt(1)
; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1]
; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3]
; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
; GCN1-NEXT: .LBB120_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_cmp_eq_u32 s5, s8
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB120_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execz .LBB120_3
; GCN2-NEXT: s_branch .LBB120_4
; GCN2-NEXT: .LBB120_2:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: .LBB120_3: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s2, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_add_i32 s2, s2, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s2
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v4, s1
; GCN2-NEXT: v_mov_b32_e32 v5, s0
; GCN2-NEXT: s_waitcnt vmcnt(1)
; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1]
; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3]
; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
; GCN2-NEXT: .LBB120_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x34
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s5, s1
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_vccz .LBB120_2
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execz .LBB120_3
; GFX12-NEXT: s_branch .LBB120_4
; GFX12-NEXT: .LBB120_2:
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: .LBB120_3: ; %atomicrmw.private
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX12-NEXT: s_cselect_b32 s4, s4, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1]
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s4
; GFX12-NEXT: .LBB120_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) {
; Discarded-result udec_wrap on a flat pointer computed with a register index
; (index scaled by 8 via s_lshl_b64 before the add). Exercises the same
; runtime private-aperture check and global/private predication as
; @atomic_dec_i64, but on an address the compiler cannot prove non-scratch
; after GEP arithmetic; this is the expansion that noalias.addrspace metadata
; is meant to let us avoid when the address space is known.
; GCN1-LABEL: atomic_dec_i64_decr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_add_u32 s12, s12, s9
; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB121_3
; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB121_4
; GCN1-NEXT: .LBB121_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB121_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_cbranch_execnz .LBB121_2
; GCN1-NEXT: .LBB121_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_add_i32 s0, s0, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s0
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0
; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3]
; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i64_decr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_add_u32 s88, s88, s9
; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB121_3
; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB121_4
; GCN2-NEXT: .LBB121_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB121_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_cbranch_execnz .LBB121_2
; GCN2-NEXT: .LBB121_4: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_add_i32 s0, s0, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s0
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0
; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_decr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_cmp_eq_u32 s1, s7
; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_mov_b32 s4, -1
; GFX12-NEXT: s_cbranch_vccnz .LBB121_3
; GFX12-NEXT: ; %bb.1: ; %Flow
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_vccz .LBB121_4
; GFX12-NEXT: .LBB121_2: ; %atomicrmw.phi
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB121_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_cbranch_execnz .LBB121_2
; GFX12-NEXT: .LBB121_4: ; %atomicrmw.private
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12-NEXT: s_cselect_b32 s4, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_dec_i64_ret_decr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN1-NEXT: s_mov_b32 s14, -1
|
|
; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN1-NEXT: s_add_u32 s12, s12, s9
|
|
; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
|
|
; GCN1-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_vccz .LBB122_2
|
|
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_cbranch_execz .LBB122_3
|
|
; GCN1-NEXT: s_branch .LBB122_4
|
|
; GCN1-NEXT: .LBB122_2:
|
|
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN1-NEXT: .LBB122_3: ; %atomicrmw.private
|
|
; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
|
|
; GCN1-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN1-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
|
|
; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1]
|
|
; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3]
|
|
; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
|
|
; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
|
|
; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
|
|
; GCN1-NEXT: .LBB122_4: ; %atomicrmw.end
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_dec_i64_ret_decr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN2-NEXT: s_mov_b32 s90, -1
|
|
; GCN2-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN2-NEXT: s_add_u32 s88, s88, s9
|
|
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
|
|
; GCN2-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_cmp_eq_u32 s1, s2
|
|
; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_vccz .LBB122_2
|
|
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_cbranch_execz .LBB122_3
|
|
; GCN2-NEXT: s_branch .LBB122_4
|
|
; GCN2-NEXT: .LBB122_2:
|
|
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GCN2-NEXT: .LBB122_3: ; %atomicrmw.private
|
|
; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GCN2-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_add_i32 s0, s0, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN2-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
|
|
; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1]
|
|
; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3]
|
|
; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
|
|
; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
|
|
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
|
|
; GCN2-NEXT: .LBB122_4: ; %atomicrmw.end
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_dec_i64_ret_decr64:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
|
|
; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, s9
|
|
; GFX12-NEXT: s_cselect_b32 s6, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
|
|
; GFX12-NEXT: s_cbranch_vccz .LBB122_2
|
|
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
|
|
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_cbranch_execz .LBB122_3
|
|
; GFX12-NEXT: s_branch .LBB122_4
|
|
; GFX12-NEXT: .LBB122_2:
|
|
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX12-NEXT: .LBB122_3: ; %atomicrmw.private
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX12-NEXT: s_cselect_b32 s6, s0, -1
|
|
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
|
|
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1]
|
|
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
|
|
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
|
|
; GFX12-NEXT: s_wait_alu 0xfffe
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
|
|
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
|
|
; GFX12-NEXT: .LBB122_4: ; %atomicrmw.end
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|