For targets that support the xnack replay feature (gfx8+), multi-dword scalar loads must not clobber any register that holds the src address. The constrained versions of the scalar loads have the early-clobber flag attached to the dst operand, which prevents RA from re-allocating any of the src registers as the dst operand.
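A minimal hand-written sketch (not part of the autogenerated checks below) of the hazard the early-clobber dst guards against; the registers are chosen arbitrarily for illustration:

; With xnack enabled, a load that takes a page fault is replayed after the fault
; is serviced, so the base-address registers must still be intact at replay time.
;   s_load_dwordx2 s[0:1], s[0:1], 0x0   ; unsafe: dst overlaps the src address,
;                                        ; so a replay would read a clobbered base
;   s_load_dwordx2 s[2:3], s[0:1], 0x0   ; safe: dst is disjoint from the base,
;                                        ; which is what the early-clobber dst enforces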
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN1 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN2 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN3 %s

; ---------------------------------------------------------------------
; atomicrmw xchg
; ---------------------------------------------------------------------
define void @flat_atomic_xchg_i64_noret(ptr %ptr, i64 %in) {
; GCN1-LABEL: flat_atomic_xchg_i64_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_xchg_i64_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_xchg_i64_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst
  ret void
}

define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_xchg_i64_ret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_i64_ret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_i64_ret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_i64_ret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw xchg ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw xchg ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_i64_noret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_i64_noret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_i64_noret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_i64_ret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_i64_ret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_i64_ret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw xchg ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw xchg ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret i64 %result
|
|
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw xchg f64
|
|
; ---------------------------------------------------------------------
|
|
|
|
define void @flat_atomic_xchg_f64_noret(ptr %ptr, double %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_f64_noret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_f64_noret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_f64_noret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr double, ptr %out, i32 4
|
|
%tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define double @flat_atomic_xchg_f64_ret(ptr %ptr, double %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_f64_ret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_f64_ret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_f64_ret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw xchg ptr %ptr, double %in seq_cst
|
|
ret double %result
|
|
}
|
|
|
|
define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr double, ptr %out, i32 4
|
|
%result = atomicrmw xchg ptr %gep, double %in seq_cst
|
|
ret double %result
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_f64_noret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_f64_noret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_f64_noret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, double inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr double, ptr %out, i32 4
|
|
%tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_f64_ret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_f64_ret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_f64_ret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw xchg ptr %ptr, double %in seq_cst
|
|
ret double %result
|
|
}
|
|
|
|
define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, double inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr double, ptr %out, i32 4
|
|
%result = atomicrmw xchg ptr %gep, double %in seq_cst
|
|
ret double %result
|
|
}
|
|
|
|
define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out, double %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr double, ptr %out, i64 4
|
|
%tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret void
|
|
}
|
|
|
|
define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out, double %in) {
|
|
; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr double, ptr %out, i64 4
|
|
%result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret double %result
|
|
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw add
|
|
; ---------------------------------------------------------------------
|
|
|
|
define void @flat_atomic_add_i64_noret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_add_i64_noret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_add_i64_noret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_add_i64_noret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_add_i64_noret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_add_i64_noret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_add_i64_noret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_add_i64_ret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_add_i64_ret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_add_i64_ret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_add_i64_ret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw add ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_add_i64_ret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_add_i64_ret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_add_i64_ret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw add ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_add_i64_noret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_add_i64_noret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_add_i64_noret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_add_i64_noret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_add_i64_noret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_add_i64_noret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_add_i64_ret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_add_i64_ret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_add_i64_ret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw add ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_add_i64_ret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_add_i64_ret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_add_i64_ret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw add ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret i64 %result
|
|
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw sub
|
|
; ---------------------------------------------------------------------
|
|
|
|
define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_sub_i64_noret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_sub_i64_noret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_sub_i64_noret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_sub_i64_noret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_sub_i64_noret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_sub_i64_noret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_sub_i64_ret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_sub_i64_ret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_sub_i64_ret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw sub ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_sub_i64_ret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_sub_i64_ret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_sub_i64_ret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw sub ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_sub_i64_noret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_sub_i64_noret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_sub_i64_noret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_sub_i64_ret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_sub_i64_ret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_sub_i64_ret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw sub ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw sub ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}

define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i64 %result
}

; ---------------------------------------------------------------------
; atomicrmw and
; ---------------------------------------------------------------------

define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
; GCN1-LABEL: flat_atomic_and_i64_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i64_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i64_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst
  ret void
}

define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_and_i64_noret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i64_noret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i64_noret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst
  ret void
}

define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
; GCN1-LABEL: flat_atomic_and_i64_ret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i64_ret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i64_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw and ptr %ptr, i64 %in seq_cst
  ret i64 %result
}

define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_and_i64_ret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i64_ret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i64_ret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw and ptr %gep, i64 %in seq_cst
  ret i64 %result
}

define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_and_i64_noret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i64_noret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i64_noret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst
  ret void
}

define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_and_i64_noret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 32
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v3, s35
; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i64_noret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 32
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v3, s35
; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i64_noret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst
  ret void
}

define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_and_i64_ret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i64_ret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i64_ret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw and ptr %ptr, i64 %in seq_cst
  ret i64 %result
}

define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_and_i64_ret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 32
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v3, s35
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i64_ret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 32
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v3, s35
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i64_ret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw and ptr %gep, i64 %in seq_cst
  ret i64 %result
}

define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}

define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i64 %result
}

; ---------------------------------------------------------------------
; atomicrmw nand
; ---------------------------------------------------------------------

define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
; GCN1-LABEL: flat_atomic_nand_i64_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN1-NEXT: flat_load_dword v6, v[0:1]
; GCN1-NEXT: flat_load_dword v7, v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v4, v7, v3
; GCN1-NEXT: v_and_b32_e32 v8, v6, v2
; GCN1-NEXT: v_not_b32_e32 v5, v4
; GCN1-NEXT: v_not_b32_e32 v4, v8
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN1-NEXT: v_mov_b32_e32 v7, v5
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v6, v4
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB50_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_nand_i64_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN2-NEXT: flat_load_dword v6, v[0:1]
; GCN2-NEXT: flat_load_dword v7, v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v4, v7, v3
; GCN2-NEXT: v_and_b32_e32 v8, v6, v2
; GCN2-NEXT: v_not_b32_e32 v5, v4
; GCN2-NEXT: v_not_b32_e32 v4, v8
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN2-NEXT: v_mov_b32_e32 v7, v5
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v6, v4
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB50_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_nand_i64_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GCN3-NEXT: s_mov_b64 s[4:5], 0
; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
; GCN3-NEXT: v_not_b32_e32 v5, v4
; GCN3-NEXT: v_not_b32_e32 v4, v8
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: v_mov_b32_e32 v7, v5
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB50_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst
  ret void
}

define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_nand_i64_noret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_load_dword v7, v[0:1]
; GCN1-NEXT: flat_load_dword v6, v[8:9]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v0, v7, v3
; GCN1-NEXT: v_and_b32_e32 v1, v6, v2
; GCN1-NEXT: v_not_b32_e32 v5, v0
; GCN1-NEXT: v_not_b32_e32 v4, v1
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN1-NEXT: v_mov_b32_e32 v7, v1
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v6, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB51_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_nand_i64_noret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_load_dword v7, v[0:1]
; GCN2-NEXT: flat_load_dword v6, v[8:9]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v0, v7, v3
; GCN2-NEXT: v_and_b32_e32 v1, v6, v2
; GCN2-NEXT: v_not_b32_e32 v5, v0
; GCN2-NEXT: v_not_b32_e32 v4, v1
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN2-NEXT: v_mov_b32_e32 v7, v1
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v6, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB51_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_nand_i64_noret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[4:5], 0
; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
; GCN3-NEXT: v_not_b32_e32 v5, v4
; GCN3-NEXT: v_not_b32_e32 v4, v8
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: v_mov_b32_e32 v7, v5
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB51_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst
  ret void
}

define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
; GCN1-LABEL: flat_atomic_nand_i64_ret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0
; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GCN1-NEXT: flat_load_dword v4, v[0:1]
; GCN1-NEXT: flat_load_dword v5, v[5:6]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v7, v5
; GCN1-NEXT: v_mov_b32_e32 v6, v4
; GCN1-NEXT: v_and_b32_e32 v4, v7, v3
; GCN1-NEXT: v_and_b32_e32 v8, v6, v2
; GCN1-NEXT: v_not_b32_e32 v5, v4
; GCN1-NEXT: v_not_b32_e32 v4, v8
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB52_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v0, v4
; GCN1-NEXT: v_mov_b32_e32 v1, v5
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_nand_i64_ret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0
; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GCN2-NEXT: flat_load_dword v4, v[0:1]
; GCN2-NEXT: flat_load_dword v5, v[5:6]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v7, v5
; GCN2-NEXT: v_mov_b32_e32 v6, v4
; GCN2-NEXT: v_and_b32_e32 v4, v7, v3
; GCN2-NEXT: v_and_b32_e32 v8, v6, v2
; GCN2-NEXT: v_not_b32_e32 v5, v4
; GCN2-NEXT: v_not_b32_e32 v4, v8
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB52_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v0, v4
; GCN2-NEXT: v_mov_b32_e32 v1, v5
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_nand_i64_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GCN3-NEXT: s_mov_b64 s[4:5], 0
; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v7, v5
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
; GCN3-NEXT: v_not_b32_e32 v5, v4
; GCN3-NEXT: v_not_b32_e32 v4, v8
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB52_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v4
; GCN3-NEXT: v_mov_b32_e32 v1, v5
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw nand ptr %ptr, i64 %in seq_cst
  ret i64 %result
}

define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_nand_i64_ret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v1
; GCN1-NEXT: v_mov_b32_e32 v8, v0
; GCN1-NEXT: v_and_b32_e32 v0, v9, v3
; GCN1-NEXT: v_and_b32_e32 v1, v8, v2
; GCN1-NEXT: v_not_b32_e32 v7, v0
; GCN1-NEXT: v_not_b32_e32 v6, v1
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB53_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_nand_i64_ret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v1
; GCN2-NEXT: v_mov_b32_e32 v8, v0
; GCN2-NEXT: v_and_b32_e32 v0, v9, v3
; GCN2-NEXT: v_and_b32_e32 v1, v8, v2
; GCN2-NEXT: v_not_b32_e32 v7, v0
; GCN2-NEXT: v_not_b32_e32 v6, v1
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB53_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_nand_i64_ret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[4:5], 0
; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v7, v5
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
; GCN3-NEXT: v_not_b32_e32 v5, v4
; GCN3-NEXT: v_not_b32_e32 v4, v8
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB53_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v4
; GCN3-NEXT: v_mov_b32_e32 v1, v5
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw nand ptr %gep, i64 %in seq_cst
  ret i64 %result
}

define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_nand_i64_noret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: s_add_u32 s34, s4, 4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v3, s34
; GCN1-NEXT: v_mov_b32_e32 v4, s35
; GCN1-NEXT: flat_load_dword v2, v[0:1]
; GCN1-NEXT: flat_load_dword v3, v[3:4]
; GCN1-NEXT: v_mov_b32_e32 v4, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s5
; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
; GCN1-NEXT: v_and_b32_e32 v6, s6, v2
; GCN1-NEXT: v_not_b32_e32 v1, v0
; GCN1-NEXT: v_not_b32_e32 v0, v6
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN1-NEXT: v_mov_b32_e32 v3, v1
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: v_mov_b32_e32 v2, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB54_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_nand_i64_noret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: s_add_u32 s34, s4, 4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s34
; GCN2-NEXT: v_mov_b32_e32 v4, s35
; GCN2-NEXT: flat_load_dword v2, v[0:1]
; GCN2-NEXT: flat_load_dword v3, v[3:4]
; GCN2-NEXT: v_mov_b32_e32 v4, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: v_mov_b32_e32 v5, s5
; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
; GCN2-NEXT: v_and_b32_e32 v6, s6, v2
; GCN2-NEXT: v_not_b32_e32 v1, v0
; GCN2-NEXT: v_not_b32_e32 v0, v6
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN2-NEXT: v_mov_b32_e32 v3, v1
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB54_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_nand_i64_noret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: v_mov_b32_e32 v4, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: v_mov_b32_e32 v5, s5
; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
; GCN3-NEXT: v_and_b32_e32 v6, s6, v2
; GCN3-NEXT: v_not_b32_e32 v1, v0
; GCN3-NEXT: v_not_b32_e32 v0, v6
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN3-NEXT: v_mov_b32_e32 v3, v1
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB54_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst
  ret void
}

define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 32
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: s_add_u32 s36, s4, 36
; GCN1-NEXT: s_addc_u32 s37, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s36
; GCN1-NEXT: v_mov_b32_e32 v1, s37
; GCN1-NEXT: v_mov_b32_e32 v4, s34
; GCN1-NEXT: v_mov_b32_e32 v5, s35
; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: flat_load_dword v2, v[4:5]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
; GCN1-NEXT: v_and_b32_e32 v6, s6, v2
; GCN1-NEXT: v_not_b32_e32 v1, v0
; GCN1-NEXT: v_not_b32_e32 v0, v6
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN1-NEXT: v_mov_b32_e32 v3, v1
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: v_mov_b32_e32 v2, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB55_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 32
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: s_add_u32 s36, s4, 36
; GCN2-NEXT: s_addc_u32 s37, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s36
; GCN2-NEXT: v_mov_b32_e32 v1, s37
; GCN2-NEXT: v_mov_b32_e32 v4, s34
; GCN2-NEXT: v_mov_b32_e32 v5, s35
; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: flat_load_dword v2, v[4:5]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
; GCN2-NEXT: v_and_b32_e32 v6, s6, v2
; GCN2-NEXT: v_not_b32_e32 v1, v0
; GCN2-NEXT: v_not_b32_e32 v0, v6
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN2-NEXT: v_mov_b32_e32 v3, v1
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB55_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: v_mov_b32_e32 v4, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: v_mov_b32_e32 v5, s5
; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
; GCN3-NEXT: v_and_b32_e32 v6, s6, v2
; GCN3-NEXT: v_not_b32_e32 v1, v0
; GCN3-NEXT: v_not_b32_e32 v0, v6
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN3-NEXT: v_mov_b32_e32 v3, v1
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB55_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst
  ret void
}

define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_nand_i64_ret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: s_add_u32 s34, s4, 4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: v_mov_b32_e32 v3, s35
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: flat_load_dword v1, v[2:3]
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v7, v1
; GCN1-NEXT: v_mov_b32_e32 v6, v0
; GCN1-NEXT: v_and_b32_e32 v0, s7, v7
; GCN1-NEXT: v_and_b32_e32 v1, s6, v6
; GCN1-NEXT: v_not_b32_e32 v5, v0
; GCN1-NEXT: v_not_b32_e32 v4, v1
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB56_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_nand_i64_ret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: s_add_u32 s34, s4, 4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: v_mov_b32_e32 v3, s35
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: flat_load_dword v1, v[2:3]
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v7, v1
; GCN2-NEXT: v_mov_b32_e32 v6, v0
; GCN2-NEXT: v_and_b32_e32 v0, s7, v7
; GCN2-NEXT: v_and_b32_e32 v1, s6, v6
; GCN2-NEXT: v_not_b32_e32 v5, v0
; GCN2-NEXT: v_not_b32_e32 v4, v1
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB56_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_nand_i64_ret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v7, v1
; GCN3-NEXT: v_mov_b32_e32 v6, v0
; GCN3-NEXT: v_and_b32_e32 v0, s7, v7
; GCN3-NEXT: v_and_b32_e32 v1, s6, v6
; GCN3-NEXT: v_not_b32_e32 v5, v0
; GCN3-NEXT: v_not_b32_e32 v4, v1
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB56_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw nand ptr %ptr, i64 %in seq_cst
  ret i64 %result
}

define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 32
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: s_add_u32 s36, s4, 36
; GCN1-NEXT: s_addc_u32 s37, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s36
; GCN1-NEXT: v_mov_b32_e32 v1, s37
; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: v_mov_b32_e32 v3, s35
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[2:3]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v7, v1
; GCN1-NEXT: v_mov_b32_e32 v6, v0
; GCN1-NEXT: v_and_b32_e32 v0, s7, v7
; GCN1-NEXT: v_and_b32_e32 v1, s6, v6
; GCN1-NEXT: v_not_b32_e32 v5, v0
; GCN1-NEXT: v_not_b32_e32 v4, v1
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB57_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 32
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: s_add_u32 s36, s4, 36
; GCN2-NEXT: s_addc_u32 s37, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s36
; GCN2-NEXT: v_mov_b32_e32 v1, s37
; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: v_mov_b32_e32 v3, s35
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[2:3]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v7, v1
; GCN2-NEXT: v_mov_b32_e32 v6, v0
; GCN2-NEXT: v_and_b32_e32 v0, s7, v7
; GCN2-NEXT: v_and_b32_e32 v1, s6, v6
; GCN2-NEXT: v_not_b32_e32 v5, v0
; GCN2-NEXT: v_not_b32_e32 v4, v1
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB57_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v7, v1
; GCN3-NEXT: v_mov_b32_e32 v6, v0
; GCN3-NEXT: v_and_b32_e32 v0, s7, v7
; GCN3-NEXT: v_and_b32_e32 v1, s6, v6
; GCN3-NEXT: v_not_b32_e32 v5, v0
; GCN3-NEXT: v_not_b32_e32 v4, v1
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB57_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw nand ptr %gep, i64 %in seq_cst
  ret i64 %result
}

define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_load_dword v7, v[0:1]
; GCN1-NEXT: flat_load_dword v6, v[8:9]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v0, v7, v3
; GCN1-NEXT: v_and_b32_e32 v1, v6, v2
; GCN1-NEXT: v_not_b32_e32 v5, v0
; GCN1-NEXT: v_not_b32_e32 v4, v1
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN1-NEXT: v_mov_b32_e32 v7, v1
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v6, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB58_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_load_dword v7, v[0:1]
; GCN2-NEXT: flat_load_dword v6, v[8:9]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v0, v7, v3
; GCN2-NEXT: v_and_b32_e32 v1, v6, v2
; GCN2-NEXT: v_not_b32_e32 v5, v0
; GCN2-NEXT: v_not_b32_e32 v4, v1
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN2-NEXT: v_mov_b32_e32 v7, v1
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v6, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB58_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[4:5], 0
; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
; GCN3-NEXT: v_not_b32_e32 v5, v4
; GCN3-NEXT: v_not_b32_e32 v4, v8
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: v_mov_b32_e32 v7, v5
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB58_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}

define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v1
; GCN1-NEXT: v_mov_b32_e32 v8, v0
; GCN1-NEXT: v_and_b32_e32 v0, v9, v3
; GCN1-NEXT: v_and_b32_e32 v1, v8, v2
; GCN1-NEXT: v_not_b32_e32 v7, v0
; GCN1-NEXT: v_not_b32_e32 v6, v1
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB59_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v1
; GCN2-NEXT: v_mov_b32_e32 v8, v0
; GCN2-NEXT: v_and_b32_e32 v0, v9, v3
; GCN2-NEXT: v_and_b32_e32 v1, v8, v2
; GCN2-NEXT: v_not_b32_e32 v7, v0
; GCN2-NEXT: v_not_b32_e32 v6, v1
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB59_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[4:5], 0
; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v7, v5
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
; GCN3-NEXT: v_not_b32_e32 v5, v4
; GCN3-NEXT: v_not_b32_e32 v4, v8
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB59_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v4
; GCN3-NEXT: v_mov_b32_e32 v1, v5
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i64 %result
}

; ---------------------------------------------------------------------
; atomicrmw or
; ---------------------------------------------------------------------

define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_or_i64_noret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_or_i64_noret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_or_i64_noret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_or_i64_noret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_or_i64_noret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_or_i64_noret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_or_i64_ret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_or_i64_ret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_or_i64_ret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw or ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_or_i64_ret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_or_i64_ret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_or_i64_ret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw or ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_or_i64_noret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_or_i64_noret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_or_i64_noret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_or_i64_noret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_or_i64_noret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_or_i64_noret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_or_i64_ret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_or_i64_ret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_or_i64_ret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw or ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_or_i64_ret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_or_i64_ret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_or_i64_ret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw or ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret i64 %result
|
|
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw xor
|
|
; ---------------------------------------------------------------------
|
|
|
|
define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_xor_i64_noret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xor_i64_noret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xor_i64_noret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_xor_i64_noret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xor_i64_noret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xor_i64_noret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_xor_i64_ret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xor_i64_ret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xor_i64_ret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw xor ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_xor_i64_ret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xor_i64_ret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xor_i64_ret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw xor ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_xor_i64_noret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xor_i64_noret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xor_i64_noret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_xor_i64_ret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xor_i64_ret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xor_i64_ret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw xor ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw xor ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret i64 %result
|
|
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw max
|
|
; ---------------------------------------------------------------------
|
|
|
|
define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_max_i64_noret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v6, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v7, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB80_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_max_i64_noret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v6, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v7, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB80_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_max_i64_noret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB80_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_max_i64_noret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB81_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_max_i64_noret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB81_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_max_i64_noret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB81_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_max_i64_ret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v4, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v5, v[5:6]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB82_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_max_i64_ret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v4, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v5, v[5:6]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB82_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_max_i64_ret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB82_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw max ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_max_i64_ret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB83_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_max_i64_ret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB83_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_max_i64_ret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB83_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw max ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_max_i64_noret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s35
|
|
; GCN1-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v3, v[3:4]
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB84_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_max_i64_noret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s35
|
|
; GCN2-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v3, v[3:4]
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB84_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_max_i64_noret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB84_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_max_i64_noret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN1-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s35
|
|
; GCN1-NEXT: flat_load_dword v3, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v2, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB85_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_max_i64_noret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN2-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s35
|
|
; GCN2-NEXT: flat_load_dword v3, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v2, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB85_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_max_i64_noret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB85_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_max_i64_ret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v1, v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB86_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_max_i64_ret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v1, v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB86_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_max_i64_ret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN3-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB86_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw max ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_max_i64_ret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN1-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v0, v[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB87_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_max_i64_ret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN2-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v0, v[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB87_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_max_i64_ret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN3-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB87_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw max ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_max_i64_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
|
|
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s0, s4
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, s5
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, s3
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, s2
|
|
; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB88_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_max_i64_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
|
|
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s0, s4
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, s5
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, s3
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, s2
|
|
; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB88_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_max_i64_addr64_offset:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB88_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_max_i64_ret_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s0, s6
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, s7
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s5
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v3
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v2
|
|
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB89_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_max_i64_ret_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s0, s6
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, s7
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s5
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v3
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v2
|
|
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB89_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_max_i64_ret_addr64_offset:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v9, v3
|
|
; GCN3-NEXT: v_mov_b32_e32 v8, v2
|
|
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB89_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_max_i64_addr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
|
|
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s0, s4
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, s5
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, s3
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, s2
|
|
; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB90_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_max_i64_addr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
|
|
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s0, s4
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, s5
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, s3
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, s2
|
|
; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB90_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_max_i64_addr64:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GCN3-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB90_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_max_i64_ret_addr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s0, s6
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s5
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v3
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v2
|
|
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB91_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_max_i64_ret_addr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s0, s6
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s5
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v3
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v2
|
|
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB91_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_max_i64_ret_addr64:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v9, v3
|
|
; GCN3-NEXT: v_mov_b32_e32 v8, v2
|
|
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB91_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB92_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB92_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB92_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB93_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB93_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB93_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret i64 %result
|
|
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw umax
|
|
; ---------------------------------------------------------------------
|
|
|
|
define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_umax_i64_noret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v6, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v7, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB94_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umax_i64_noret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v6, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v7, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB94_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umax_i64_noret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB94_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_umax_i64_noret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB95_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umax_i64_noret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB95_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umax_i64_noret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB95_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_umax_i64_ret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v4, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v5, v[5:6]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB96_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umax_i64_ret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v4, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v5, v[5:6]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB96_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umax_i64_ret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB96_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw umax ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_umax_i64_ret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB97_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umax_i64_ret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB97_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umax_i64_ret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB97_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw umax ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_umax_i64_noret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s35
|
|
; GCN1-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v3, v[3:4]
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB98_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umax_i64_noret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s35
|
|
; GCN2-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v3, v[3:4]
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB98_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umax_i64_noret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB98_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN1-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s35
|
|
; GCN1-NEXT: flat_load_dword v3, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v2, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB99_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN2-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s35
|
|
; GCN2-NEXT: flat_load_dword v3, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v2, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB99_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB99_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_umax_i64_ret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v1, v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB100_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umax_i64_ret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v1, v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB100_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umax_i64_ret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN3-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB100_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw umax ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN1-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v0, v[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB101_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN2-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v0, v[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB101_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN3-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB101_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw umax ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_umax_i64_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
|
|
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s0, s4
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, s5
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, s3
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, s2
|
|
; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB102_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umax_i64_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
|
|
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s0, s4
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, s5
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, s3
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, s2
|
|
; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB102_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_umax_i64_addr64_offset:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB102_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s0, s6
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, s7
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s5
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v3
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v2
|
|
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB103_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s0, s6
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, s7
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s5
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v3
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v2
|
|
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB103_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v9, v3
|
|
; GCN3-NEXT: v_mov_b32_e32 v8, v2
|
|
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB103_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_umax_i64_ret_addr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s0, s6
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s5
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v3
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v2
|
|
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB104_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umax_i64_ret_addr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s0, s6
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s5
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v3
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v2
|
|
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB104_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_umax_i64_ret_addr64:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s9
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s8
|
|
; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v9, v3
|
|
; GCN3-NEXT: v_mov_b32_e32 v8, v2
|
|
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB104_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB105_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB105_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB105_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB106_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB106_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB106_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret i64 %result
|
|
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw umin
|
|
; ---------------------------------------------------------------------
|
|
|
|
define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_umin_i64_noret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v6, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v7, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB107_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umin_i64_noret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v6, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v7, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB107_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umin_i64_noret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB107_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_umin_i64_noret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB108_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umin_i64_noret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB108_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umin_i64_noret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB108_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_umin_i64_ret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v4, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v5, v[5:6]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB109_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umin_i64_ret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v4, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v5, v[5:6]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB109_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umin_i64_ret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB109_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw umin ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_umin_i64_ret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB110_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umin_i64_ret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB110_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umin_i64_ret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB110_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw umin ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_umin_i64_noret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s35
|
|
; GCN1-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v3, v[3:4]
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB111_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umin_i64_noret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s35
|
|
; GCN2-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v3, v[3:4]
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB111_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umin_i64_noret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB111_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN1-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s35
|
|
; GCN1-NEXT: flat_load_dword v3, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v2, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, s7
; GCN1-NEXT: v_mov_b32_e32 v7, s6
; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN1-NEXT: v_mov_b32_e32 v3, v1
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: v_mov_b32_e32 v2, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB112_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 32
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: s_add_u32 s36, s4, 36
; GCN2-NEXT: s_addc_u32 s37, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s36
; GCN2-NEXT: v_mov_b32_e32 v1, s37
; GCN2-NEXT: v_mov_b32_e32 v4, s34
; GCN2-NEXT: v_mov_b32_e32 v5, s35
; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: flat_load_dword v2, v[4:5]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: v_mov_b32_e32 v6, s7
; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN2-NEXT: v_mov_b32_e32 v3, v1
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB112_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: v_mov_b32_e32 v4, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: v_mov_b32_e32 v6, s7
; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: v_mov_b32_e32 v5, s5
; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN3-NEXT: v_mov_b32_e32 v3, v1
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB112_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst
ret void
}

define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_umin_i64_ret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: s_add_u32 s34, s4, 4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: v_mov_b32_e32 v3, s35
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: flat_load_dword v1, v[2:3]
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v1
; GCN1-NEXT: v_mov_b32_e32 v8, v0
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB113_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_umin_i64_ret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: s_add_u32 s34, s4, 4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: v_mov_b32_e32 v3, s35
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: flat_load_dword v1, v[2:3]
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v1
; GCN2-NEXT: v_mov_b32_e32 v8, v0
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB113_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_umin_i64_ret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s7
; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v1
; GCN3-NEXT: v_mov_b32_e32 v8, v0
; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB113_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw umin ptr %ptr, i64 %in seq_cst
ret i64 %result
}

define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 32
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: s_add_u32 s36, s4, 36
; GCN1-NEXT: s_addc_u32 s37, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s36
; GCN1-NEXT: v_mov_b32_e32 v1, s37
; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: v_mov_b32_e32 v3, s35
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[2:3]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v1
; GCN1-NEXT: v_mov_b32_e32 v8, v0
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB114_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 32
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: s_add_u32 s36, s4, 36
; GCN2-NEXT: s_addc_u32 s37, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s36
; GCN2-NEXT: v_mov_b32_e32 v1, s37
; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: v_mov_b32_e32 v3, s35
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[2:3]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v1
; GCN2-NEXT: v_mov_b32_e32 v8, v0
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB114_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s7
; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v1
; GCN3-NEXT: v_mov_b32_e32 v8, v0
; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB114_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw umin ptr %gep, i64 %in seq_cst
ret i64 %result
}

define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB115_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB115_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB115_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB116_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB116_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB116_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
|
|
ret i64 %result
|
|
}

; ---------------------------------------------------------------------
; atomicrmw min
; ---------------------------------------------------------------------

define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
; GCN1-LABEL: flat_atomic_min_i64_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN1-NEXT: flat_load_dword v6, v[0:1]
; GCN1-NEXT: flat_load_dword v7, v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN1-NEXT: v_mov_b32_e32 v7, v5
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v6, v4
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB117_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_min_i64_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN2-NEXT: flat_load_dword v6, v[0:1]
; GCN2-NEXT: flat_load_dword v7, v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN2-NEXT: v_mov_b32_e32 v7, v5
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v6, v4
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB117_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_min_i64_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GCN3-NEXT: s_mov_b64 s[4:5], 0
; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: v_mov_b32_e32 v7, v5
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB117_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst
ret void
}

define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_min_i64_noret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB118_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_min_i64_noret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v7, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v6, v[8:9]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v1
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB118_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_min_i64_noret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB118_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_min_i64_ret:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v4, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v5, v[5:6]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB119_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_min_i64_ret:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v4, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v5, v[5:6]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB119_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_min_i64_ret:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB119_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw min ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GCN1-LABEL: flat_atomic_min_i64_ret_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB120_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_min_i64_ret_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v0, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB120_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_min_i64_ret_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB120_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, v4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v5
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw min ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_min_i64_noret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s35
|
|
; GCN1-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v3, v[3:4]
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB121_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_min_i64_noret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s35
|
|
; GCN2-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v3, v[3:4]
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB121_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_min_i64_noret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB121_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_min_i64_noret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN1-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s35
|
|
; GCN1-NEXT: flat_load_dword v3, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v2, v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB122_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_min_i64_noret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN2-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s35
|
|
; GCN2-NEXT: flat_load_dword v3, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v2, v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB122_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_min_i64_noret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB122_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_min_i64_ret_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v1, v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB123_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_min_i64_ret_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v1, v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB123_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_min_i64_ret_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN3-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB123_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw min ptr %ptr, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GCN1-LABEL: flat_atomic_min_i64_ret_offset_scalar:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN1-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN1-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN1-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: flat_load_dword v0, v[2:3]
|
|
; GCN1-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB124_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN1-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN2-LABEL: flat_atomic_min_i64_ret_offset_scalar:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s34, s4, 32
|
|
; GCN2-NEXT: s_addc_u32 s35, s5, 0
|
|
; GCN2-NEXT: s_add_u32 s36, s4, 36
|
|
; GCN2-NEXT: s_addc_u32 s37, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s36
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s37
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: flat_load_dword v0, v[2:3]
|
|
; GCN2-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB124_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN2-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN3-LABEL: flat_atomic_min_i64_ret_offset_scalar:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: s_mov_b64 s[34:35], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v9, v1
|
|
; GCN3-NEXT: v_mov_b32_e32 v8, v0
|
|
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB124_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GCN3-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw min ptr %gep, i64 %in seq_cst
|
|
ret i64 %result
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_min_i64_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
|
|
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GCN1-NEXT: s_add_u32 s0, s0, s4
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, s5
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GCN1-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v6, s3
|
|
; GCN1-NEXT: v_mov_b32_e32 v7, s2
|
|
; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB125_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_min_i64_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
|
|
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
|
|
; GCN2-NEXT: s_add_u32 s0, s0, s4
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, s5
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GCN2-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v6, s3
|
|
; GCN2-NEXT: v_mov_b32_e32 v7, s2
|
|
; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB125_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_min_i64_addr64_offset:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32
|
|
; GCN3-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN3-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN3-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB125_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GCN1-NEXT: s_add_u32 s0, s0, s6
; GCN1-NEXT: s_addc_u32 s1, s1, s7
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s5
; GCN1-NEXT: v_mov_b32_e32 v5, s4
; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v3
; GCN1-NEXT: v_mov_b32_e32 v8, v2
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB126_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GCN2-NEXT: s_add_u32 s0, s0, s6
; GCN2-NEXT: s_addc_u32 s1, s1, s7
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: v_mov_b32_e32 v4, s5
; GCN2-NEXT: v_mov_b32_e32 v5, s4
; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB126_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s9
; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB126_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
%tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst
store i64 %tmp0, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_min_i64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_mov_b64 s[4:5], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v6, s3
; GCN1-NEXT: v_mov_b32_e32 v7, s2
; GCN1-NEXT: v_mov_b32_e32 v4, s0
; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN1-NEXT: v_mov_b32_e32 v3, v1
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v2, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB127_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_mov_b64 s[4:5], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v6, s3
; GCN2-NEXT: v_mov_b32_e32 v7, s2
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN2-NEXT: v_mov_b32_e32 v3, v1
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB127_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: v_mov_b32_e32 v4, s4
; GCN3-NEXT: v_mov_b32_e32 v6, s7
; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: v_mov_b32_e32 v5, s5
; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN3-NEXT: v_mov_b32_e32 v3, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB127_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw min ptr %out, i64 %in seq_cst
ret void
}

define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i64_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GCN1-NEXT: s_add_u32 s0, s0, s6
; GCN1-NEXT: s_addc_u32 s1, s1, s7
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s5
; GCN1-NEXT: v_mov_b32_e32 v5, s4
; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v3
; GCN1-NEXT: v_mov_b32_e32 v8, v2
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB128_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GCN2-NEXT: s_add_u32 s0, s0, s6
; GCN2-NEXT: s_addc_u32 s1, s1, s7
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: v_mov_b32_e32 v4, s5
; GCN2-NEXT: v_mov_b32_e32 v5, s4
; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB128_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s9
; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB128_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst
store i64 %tmp0, ptr %out2
ret void
}

define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_load_dword v7, v[0:1]
; GCN1-NEXT: flat_load_dword v6, v[8:9]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN1-NEXT: v_mov_b32_e32 v7, v1
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v6, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB129_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_load_dword v7, v[0:1]
; GCN2-NEXT: flat_load_dword v6, v[8:9]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN2-NEXT: v_mov_b32_e32 v7, v1
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v6, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB129_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[4:5], 0
; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: v_mov_b32_e32 v7, v5
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB129_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}

define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v1
; GCN1-NEXT: v_mov_b32_e32 v8, v0
; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB130_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v1
; GCN2-NEXT: v_mov_b32_e32 v8, v0
; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB130_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[4:5], 0
; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v7, v5
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB130_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v4
; GCN3-NEXT: v_mov_b32_e32 v1, v5
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
ret i64 %result
}

; ---------------------------------------------------------------------
; atomicrmw uinc_wrap
; ---------------------------------------------------------------------

define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
ret void
}

define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
ret void
}

define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
ret i64 %result
}

define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
ret i64 %result
}

define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
ret void
}

define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 32
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v3, s35
; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 32
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v3, s35
; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
ret void
}

define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
ret i64 %result
}

define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 32
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v3, s35
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 32
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v3, s35
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
ret i64 %result
}

define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}

define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
ret i64 %result
}

; ---------------------------------------------------------------------
; atomicrmw udec_wrap
; ---------------------------------------------------------------------

define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
ret void
}

define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
ret void
}

define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
ret i64 %result
}

define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
ret i64 %result
}

define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
ret void
}

define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 32
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v3, s35
; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 32
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v3, s35
; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
ret void
}

define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
ret i64 %result
}

define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 32
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: v_mov_b32_e32 v3, s35
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 32
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v3, s35
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
ret i64 %result
}

define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}

define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
ret i64 %result
}

!0 = !{}