clang-p2996/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN1 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN2 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN3 %s
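; The three RUN lines pick one target per GCN generation checked here:
; GCN1 = bonaire (GFX7), GCN2 = tonga (GFX8), GCN3 = gfx900 (GFX9). GFX9
; flat instructions accept an immediate offset, so the GCN3 checks expect
; the constant folded into the flat_atomic_* encoding (offset:N), while
; GFX7/GFX8 first materialize the full address with s_add_u32/s_addc_u32.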
define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_add_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw add ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
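; 1023 dwords = 4092 bytes (0xffc) is the largest dword-aligned offset that
; still fits in the GFX9 flat-instruction offset field, so GCN3 can keep it
; as an immediate (offset:4092).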
define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_add_i32_max_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 0xffc
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_max_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 0xffc
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_max_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:4092
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1023
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
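; 1024 dwords = 4096 bytes no longer fits in the immediate offset field, so
; even GCN3 has to add it into the address, here with a carry-propagating
; v_add_co_u32/v_addc_co_u32 pair.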
define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_add_i32_max_offset_p1:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 0x1000
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_max_offset_p1:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 0x1000
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_max_offset_p1:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_add v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1024
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
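; The _ret variants use the returning (glc) form of the atomic, which writes
; the pre-op value back into the data VGPR; that value is then stored to %out2.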
define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_add_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
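; The _addr64 variants scale the 64-bit %index by 4 (s_lshl_b64 ..., 2) and
; add it to the base pointer with the s_add_u32/s_addc_u32 carry pair before
; issuing the atomic.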
define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_add_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_add_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_add_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_add v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_add_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_add_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_add v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_add_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
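; The same offset/_ret/_addr64 matrix is repeated for each remaining RMW
; operation (and, sub, max, ...).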
define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_and_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_and_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_and_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_and_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_and_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_and v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_and_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_and_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_and v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_and_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_sub_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_sub_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_sub_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_sub_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_sub v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_sub_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_sub_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_sub v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_sub_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
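; The max tests use syncscope("workgroup") rather than "agent", so no
; buffer_wbinvl1_vol L1 invalidation is emitted and the waits relax to
; lgkmcnt(0) (plus a vmcnt(0) before the returned value is reused). Note
; that the IR max operation selects to flat_atomic_smax, the signed maximum.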
define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_max_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}
define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_max_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_max_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_smax v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_max_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_smax v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}
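
; Unsigned maximum: the same set of variants, selected to flat_atomic_umax.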
define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umax_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umax_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umax_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_umax v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umax_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_umax v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}
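
; Signed minimum: the same set of variants, selected to flat_atomic_smin.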
define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_min_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_min_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_min_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_smin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_min_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_smin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}
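
; Unsigned minimum: the same set of variants, selected to flat_atomic_umin.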
define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umin_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umin_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umin_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_umin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umin_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_umin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
ret void
}

define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}
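; OR
; For the constant-offset variants, gfx9 folds the +16 into the flat
; instruction's offset field; gfx7/8 add it to the address with
; s_add_u32/s_addc_u32 first.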
define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_or_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_or_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
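; In the addr64 variants below, the i64 %index is scaled by the i32 element
; size (s_lshl_b64 by 2) before the 64-bit s_add_u32/s_addc_u32 pointer add.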
define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_or_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_or v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_or_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_or v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
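; XCHG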
define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_xchg_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
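; Same lowering as the i32 case: xchg on f32 selects flat_atomic_swap.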
define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) {
; GCN1-LABEL: atomic_xchg_f32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_f32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_f32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr float, ptr %out, i32 4
%val = atomicrmw volatile xchg ptr %gep, float %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_xchg_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xchg_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_xchg_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_swap v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_xchg_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xchg_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_swap v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xchg_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
; CMP_SWAP
define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s2, s4, 16
; GCN1-NEXT: s_addc_u32 s3, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s2, s4, 16
; GCN2-NEXT: s_addc_u32 s3, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
%flag = extractvalue { i32, i1 } %val, 0
store i32 %flag, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s2
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s2
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: v_mov_b32_e32 v0, s8
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s2
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: v_mov_b32_e32 v0, s8
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s2
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: v_mov_b32_e32 v0, s8
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v1, s9
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
%flag = extractvalue { i32, i1 } %val, 0
store i32 %flag, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
%flag = extractvalue { i32, i1 } %val, 0
store i32 %flag, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s2
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s2
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
ret void
}
define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: v_mov_b32_e32 v0, s8
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s2
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: v_mov_b32_e32 v0, s8
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s2
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: v_mov_b32_e32 v0, s8
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v1, s9
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
%flag = extractvalue { i32, i1 } %val, 0
store i32 %flag, ptr %out2
ret void
}
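; XOR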
define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_xor_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_xor v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_xor v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_xor_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xor_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_xor v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_xor v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_xor_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_xor v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_xor v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_xor v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_xor_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xor_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_xor v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_xor v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_xor v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
ret void
}
define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xor_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
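
; Sequentially consistent atomic loads select to flat_load_dword with glc set,
; followed by a buffer_wbinvl1_vol cache invalidate. Only GCN3 (gfx900) can
; fold the 16-byte offset into the instruction; GCN1/GCN2 (bonaire/tonga) lack
; an offset field on flat instructions and compute the address in sgprs.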
define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %in, i32 4
%val = load atomic i32, ptr %gep seq_cst, align 4
store i32 %val, ptr %out
ret void
}
define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = load atomic i32, ptr %in seq_cst, align 4
store i32 %val, ptr %out
ret void
}
define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %in, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = load atomic i32, ptr %gep seq_cst, align 4
store i32 %val, ptr %out
ret void
}
define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %in, i64 %index
%val = load atomic i32, ptr %ptr seq_cst, align 4
store i32 %val, ptr %out
ret void
}
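
; Sequentially consistent atomic stores lower to a plain flat_store_dword with
; no cache maintenance; the three targets differ only in the address math.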
define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
store atomic i32 %in, ptr %gep seq_cst, align 4
ret void
}
define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
store atomic i32 %in, ptr %out seq_cst, align 4
ret void
}
define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
store atomic i32 %in, ptr %gep seq_cst, align 4
ret void
}
define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
store atomic i32 %in, ptr %ptr seq_cst, align 4
ret void
}
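
; The f32 atomic load tests mirror the i32 ones above; the generated code is
; identical to the i32 versions.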
define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr float, ptr %in, i32 4
%val = load atomic float, ptr %gep seq_cst, align 4
store float %val, ptr %out
ret void
}
define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = load atomic float, ptr %in seq_cst, align 4
store float %val, ptr %out
ret void
}
define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_f32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr float, ptr %in, i64 %index
%gep = getelementptr float, ptr %ptr, i32 4
%val = load atomic float, ptr %gep seq_cst, align 4
store float %val, ptr %out
ret void
}
define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_f32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr float, ptr %in, i64 %index
%val = load atomic float, ptr %ptr seq_cst, align 4
store float %val, ptr %out
ret void
}
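
; f32 atomic stores, matching the i32 store lowering above.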
define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) {
; GCN1-LABEL: atomic_store_f32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_f32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_f32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr float, ptr %out, i32 4
store atomic float %in, ptr %gep seq_cst, align 4
ret void
}
define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) {
; GCN1-LABEL: atomic_store_f32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_f32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_f32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
store atomic float %in, ptr %out seq_cst, align 4
ret void
}
define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_f32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_f32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_f32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr float, ptr %out, i64 %index
%gep = getelementptr float, ptr %ptr, i32 4
store atomic float %in, ptr %gep seq_cst, align 4
ret void
}
define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_f32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_f32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_f32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr float, ptr %out, i64 %index
store atomic float %in, ptr %ptr seq_cst, align 4
ret void
}
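
; Sub-dword atomic loads: i8 uses flat_load_ubyte. The GEP offset of 16 bytes
; is again folded into the instruction only on GCN3.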
define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i8_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_byte v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i8_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_byte v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i8_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr %in, i64 16
%val = load atomic i8, ptr %gep seq_cst, align 1
store i8 %val, ptr %out
ret void
}
define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i8:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_byte v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i8:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_byte v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i8:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = load atomic i8, ptr %in seq_cst, align 1
store i8 %val, ptr %out
ret void
}
define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_i8_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_byte v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i8_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_byte v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i8_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i8, ptr %in, i64 %index
%gep = getelementptr i8, ptr %ptr, i64 16
%val = load atomic i8, ptr %gep seq_cst, align 1
store i8 %val, ptr %out
ret void
}
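
; i8 atomic stores lower to flat_store_byte with no cache maintenance.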
define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i8_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_byte v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i8_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_byte v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i8_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr %out, i64 16
store atomic i8 %in, ptr %gep seq_cst, align 1
ret void
}
define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i8:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_byte v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i8:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_byte v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i8:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
store atomic i8 %in, ptr %out seq_cst, align 1
ret void
}
define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_i8_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, s6
; GCN1-NEXT: s_addc_u32 s1, s5, s7
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_byte v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i8_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, s6
; GCN2-NEXT: s_addc_u32 s1, s5, s7
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_byte v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i8_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_add_u32 s0, s4, s6
; GCN3-NEXT: s_addc_u32 s1, s5, s7
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i8, ptr %out, i64 %index
%gep = getelementptr i8, ptr %ptr, i64 16
store atomic i8 %in, ptr %gep seq_cst, align 1
ret void
}
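
; i16 atomic loads use flat_load_ushort; the GEP of 8 x i16 gives the same
; 16-byte offset as the other tests.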
define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i16_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i16_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i16_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr %in, i64 8
%val = load atomic i16, ptr %gep seq_cst, align 2
store i16 %val, ptr %out
ret void
}
define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i16:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i16:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i16:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = load atomic i16, ptr %in seq_cst, align 2
store i16 %val, ptr %out
ret void
}
define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_i16_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i16_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i16_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i16, ptr %in, i64 %index
%gep = getelementptr i16, ptr %ptr, i64 8
%val = load atomic i16, ptr %gep seq_cst, align 2
store i16 %val, ptr %out
ret void
}
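
; i16 atomic stores lower to flat_store_short.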
define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i16_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i16_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i16_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr %out, i64 8
store atomic i16 %in, ptr %gep seq_cst, align 2
ret void
}
define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i16:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i16:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i16:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
store atomic i16 %in, ptr %out seq_cst, align 2
ret void
}

define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_i16_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i16_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i16_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i16, ptr %out, i64 %index
%gep = getelementptr i16, ptr %ptr, i64 8
store atomic i16 %in, ptr %gep seq_cst, align 2
ret void
}

define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) {
; GCN1-LABEL: atomic_store_f16_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_f16_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_f16_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr half, ptr %out, i64 8
store atomic half %in, ptr %gep seq_cst, align 2
ret void
}

define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) {
; GCN1-LABEL: atomic_store_f16:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_f16:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_f16:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
store atomic half %in, ptr %out seq_cst, align 2
ret void
}

define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) {
; GCN1-LABEL: atomic_store_bf16_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_bf16_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_bf16_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr %out, i64 8
store atomic bfloat %in, ptr %gep seq_cst, align 2
ret void
}

define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) {
; GCN1-LABEL: atomic_store_bf16:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_bf16:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_bf16:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
store atomic bfloat %in, ptr %out seq_cst, align 2
ret void
}
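
; atomicrmw uinc_wrap computes *ptr = (old >=u %in) ? 0 : old + 1 and selects
; to flat_atomic_inc; GCN3 folds small immediate offsets into the instruction,
; while GCN1/GCN2 materialize the address with s_add_u32/s_addc_u32.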
define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_inc_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_inc v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_inc v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}

define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_inc_i32_max_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 0xffc
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_inc v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_max_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 0xffc
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_inc v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_max_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:4092
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1023
%val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}

define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_inc_i32_max_offset_p1:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 0x1000
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_inc v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_max_offset_p1:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 0x1000
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_inc v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_max_offset_p1:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_inc v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1024
%val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}

define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_inc_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_inc_i32_incr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_inc v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_incr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_inc v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_incr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}

define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_inc_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_inc v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_inc v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_inc_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_inc v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
ret void
}

define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_inc_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_inc_i32_incr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_inc v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_incr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_inc v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_incr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_inc v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
ret void
}

define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_inc_i32_ret_incr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_ret_incr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_ret_incr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
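
; atomicrmw udec_wrap computes *ptr = (old == 0 || old >u %in) ? %in : old - 1
; and selects to flat_atomic_dec, with the same offset-folding behavior as the
; uinc_wrap tests above.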
define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_dec_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}

define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_dec_i32_max_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 0xffc
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_max_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 0xffc
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_max_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:4092
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1023
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}

define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_dec_i32_max_offset_p1:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 0x1000
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_max_offset_p1:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 0x1000
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_max_offset_p1:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_dec v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1024
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}

define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_dec_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_dec_i32_decr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_decr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_decr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}

define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_ret_decr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_dec_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_dec v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
ret void
}

define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_dec_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}

define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_dec_i32_decr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_decr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_decr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_dec v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
ret void
}

define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_dec_i32_ret_decr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_ret_decr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_ret_decr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
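
; The seq_cst atomic loads below are emitted as flat_load_ushort with glc set,
; followed by a buffer_wbinvl1_vol cache invalidate before the plain store of
; the result.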
define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f16_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f16_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f16_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%gep = getelementptr half, ptr %in, i64 8
%val = load atomic half, ptr %gep seq_cst, align 2
store half %val, ptr %out
ret void
}

define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f16:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f16:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f16:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%val = load atomic half, ptr %in seq_cst, align 2
store half %val, ptr %out
ret void
}

define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_bf16_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_bf16_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_bf16_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr %in, i64 8
%val = load atomic bfloat, ptr %gep seq_cst, align 2
store bfloat %val, ptr %out
ret void
}

define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_bf16:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_bf16:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_bf16:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%val = load atomic bfloat, ptr %in seq_cst, align 2
store bfloat %val, ptr %out
ret void
}