For targets that support the XNACK replay feature (gfx8+), multi-dword scalar loads shouldn't clobber any register that holds the src address. The constrained versions of the scalar loads have the early-clobber flag attached to the dst operand to restrict RA from re-allocating any of the src regs for its dst operand.
8144 lines
294 KiB
LLVM
8144 lines
294 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN1 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN2 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN3 %s
|
|
|
|
define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) {
; Non-returning flat atomic add at %out+16. Pre-gfx9 (GCN1/GCN2) must add the
; 16-byte offset in scalar code (s_add_u32/s_addc_u32); gfx9 (GCN3) folds it
; into the flat instruction's offset:16 field.
; GCN1-LABEL: atomic_add_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw add ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) {
; Offset of 4092 (0xffc) — the largest immediate gfx9's flat offset field
; accepts here, so GCN3 still emits offset:4092 while GCN1/GCN2 add it in
; scalar code.
; GCN1-LABEL: atomic_add_i32_max_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 0xffc
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_max_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 0xffc
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_max_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:4092
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1023
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) {
; Offset 4096 = max + 1: no target folds it into the instruction. Note GCN3
; adds it with VALU carry ops (v_add_co_u32/v_addc_co_u32) here, unlike the
; scalar adds on GCN1/GCN2.
; GCN1-LABEL: atomic_add_i32_max_offset_p1:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 0x1000
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_max_offset_p1:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 0x1000
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_max_offset_p1:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_add v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1024
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; Returning variant: the atomic uses the glc bit to get the old value back in
; v2, which is then stored to %out2 with flat_store_dword.
; GCN1-LABEL: atomic_add_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; Dynamic 64-bit index plus constant offset: the index is scaled by 4 with
; s_lshl_b64 and added; the +16 is separate scalar adds on GCN1/GCN2 but an
; offset:16 field on GCN3.
; GCN1-LABEL: atomic_add_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; Combines dynamic indexing, a constant offset, and a used return value
; (glc + flat_store_dword of the old value to %out2).
; GCN1-LABEL: atomic_add_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) {
; Baseline case: flat atomic add directly at %out, no offset, result unused.
; GCN1-LABEL: atomic_add_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_add v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) {
; Returning variant of the baseline case: same codegen on all three targets —
; glc to return the old value, then flat_store_dword of it to %out2.
; GCN1-LABEL: atomic_add_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) {
; Dynamic 64-bit index only (no constant offset): index scaled by 4 via
; s_lshl_b64 and added to the base with scalar carry adds on all targets.
; GCN1-LABEL: atomic_add_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_add v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; Dynamic index with a used return value: scalar address math, then glc atomic
; and a flat_store_dword of the old value to %out2.
; GCN1-LABEL: atomic_add_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_add_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_add_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) {
; Same +16 offset pattern as atomic_add_i32_offset, for the 'and' RMW op
; (flat_atomic_and).
; GCN1-LABEL: atomic_and_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; Returning 'and' at +16: glc atomic, old value stored to %out2.
; GCN1-LABEL: atomic_and_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; 'and' with dynamic index plus constant offset; mirrors
; atomic_add_i32_addr64_offset.
; GCN1-LABEL: atomic_and_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; 'and' with dynamic index, constant offset, and used return value; mirrors
; atomic_add_i32_ret_addr64_offset.
; GCN1-LABEL: atomic_and_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) {
; Plain (no offset, no index) agent-scope seq_cst atomicrmw 'and' on a flat
; pointer. All three targets lower to flat_atomic_and with no returned value,
; followed by a full vmcnt/lgkmcnt wait and an L1 invalidate.
; GCN1-LABEL: atomic_and_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_and v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) {
; Same as atomic_and_i32 but the old value is used: the atomic gets the 'glc'
; returning form (flat_atomic_and v2, v[0:1], v2 glc) and the result is stored
; to a second flat pointer after the cache invalidate.
; GCN1-LABEL: atomic_and_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) {
; atomicrmw 'and' through a dynamically indexed GEP: the i64 index is scaled
; by 4 with s_lshl_b64 and added to the base with s_add_u32/s_addc_u32 before
; the flat atomic on every target.
; GCN1-LABEL: atomic_and_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_and v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; Indexed + returning variant: scalar address math (shift-by-2 plus 64-bit
; add) feeds the 'glc' form of flat_atomic_and, and the old value is stored
; to %out2. Note gfx900 keeps the data operand in s8 so the s[4:7] kernarg
; quad stays live for the final store.
; GCN1-LABEL: atomic_and_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_and_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_and_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) {
; atomicrmw 'sub' at a constant 16-byte offset. The pre-gfx9 targets fold the
; offset with a scalar 64-bit add (s_add_u32/s_addc_u32, 16); gfx900 instead
; uses the flat instruction's immediate offset:16 field.
; GCN1-LABEL: atomic_sub_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; Offset + returning 'sub': pre-gfx9 adds 16 in SGPRs, gfx900 uses offset:16;
; the returned old value (v2, via 'glc') is written to %out2 after the
; invalidate.
; GCN1-LABEL: atomic_sub_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; Indexed GEP plus constant offset: all targets do the scaled 64-bit index
; add in SGPRs; pre-gfx9 then adds the extra 16 bytes explicitly while gfx900
; folds it into offset:16 on the flat atomic.
; GCN1-LABEL: atomic_sub_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; Fully general returning 'sub': scaled index add, constant offset (explicit
; adds pre-gfx9, offset:16 on gfx900), 'glc' atomic, then store of the old
; value through the second pointer held in s[6:7].
; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) {
; Plain agent-scope seq_cst atomicrmw 'sub': non-returning flat_atomic_sub
; plus a full wait and L1 invalidate on every target.
; GCN1-LABEL: atomic_sub_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_sub v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) {
; Returning 'sub' at the base address: 'glc' flat_atomic_sub, then the old
; value is stored to the second pointer (s[6:7]) after the invalidate.
; GCN1-LABEL: atomic_sub_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) {
; Indexed non-returning 'sub': the i64 index is scaled by 4 (s_lshl_b64) and
; added to the base in SGPRs before the flat atomic on every target.
; GCN1-LABEL: atomic_sub_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_sub v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; Indexed + returning 'sub': scalar index math feeds the 'glc' atomic, and
; the returned value is stored through s[6:7]. gfx900 keeps the data operand
; in s8, leaving the s[4:7] kernarg quad intact for the final store.
; GCN1-LABEL: atomic_sub_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_sub_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_sub_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) {
; Signed 'max' at a 16-byte offset, but at workgroup scope instead of agent
; scope: the checks expect flat_atomic_smax with only an lgkmcnt wait and no
; buffer_wbinvl1_vol, unlike the agent-scope tests above.
; GCN1-LABEL: atomic_max_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; Workgroup-scope returning 'max' at offset 16: 'glc' flat_atomic_smax, no
; cache invalidate; a vmcnt(0) wait is placed just before the dependent
; flat_store_dword of the returned value.
; GCN1-LABEL: atomic_max_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; Workgroup-scope 'max' through an indexed GEP plus constant offset: scaled
; index add in SGPRs; pre-gfx9 adds the extra 16 explicitly, gfx900 folds it
; into offset:16. Only an lgkmcnt wait follows — no L1 invalidate.
; GCN1-LABEL: atomic_max_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_max_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_smax v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_max_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_smax v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umax_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umax_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umax_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_umax v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umax_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_umax v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_min_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_min_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_min_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_smin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_min_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_smin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umin_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umin_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umin_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_umin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umin_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_umin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
  ret void
}

define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; atomicrmw 'or' at a constant 16-byte offset, agent scope, seq_cst, result
; unused. GCN1/GCN2 materialize the address with s_add_u32/s_addc_u32;
; GCN3 (gfx900) folds the displacement into the flat_atomic_or 'offset:16'
; field. The agent-scope seq_cst ordering is visible as the
; vmcnt(0)/lgkmcnt(0) wait plus buffer_wbinvl1_vol cache invalidate.
define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_or_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Returning variant of the offset 'or': the atomic's previous value (kept in
; v2 via the 'glc' encoding) is stored to %out2. GCN3 again folds the 16-byte
; offset into the instruction; GCN1/GCN2 add it with scalar carry adds.
define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_or_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; 'or' through a dynamic 64-bit element index plus a constant offset.
; All targets shift the index (s_lshl_b64 ...,2) and add it with carry; only
; GCN1/GCN2 need a second add pair for the +16, which GCN3 encodes directly.
define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Returning 'or' with dynamic index + constant offset: combines the addr64
; address math of @atomic_or_i32_addr64_offset with the glc/store-result
; pattern of @atomic_or_i32_ret_offset.
define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Base case: 'or' directly on %out, no offset, no index, result unused.
; The three targets differ only in kernarg offsets and SGPR allocation
; (GCN3 loads the data operand into s4 instead of s2).
define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_or_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_or v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Returning base case: 'or' on %out with the previous value stored to %out2.
; Identical codegen on all three targets apart from kernarg offsets (GCN1
; uses 0x9/0xd dword units; GCN2/GCN3 use 0x24/0x34 byte offsets).
define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_or_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; 'or' through a dynamic 64-bit element index only (no constant offset):
; index scaled by 4 via s_lshl_b64 then added to the base with carry.
define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_or v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Returning 'or' through a dynamic index: addr64 address math plus the
; glc result in v2 stored out to %out2.
define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_or_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; atomicrmw 'xchg' (lowered to flat_atomic_swap) at a constant 16-byte
; offset, agent scope, seq_cst, result unused. Same offset-folding split as
; the 'or' tests: GCN1/GCN2 add 16 in SGPRs, GCN3 encodes 'offset:16'.
define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_xchg_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Float-typed xchg: must produce bit-identical codegen to the i32 variant
; above, since xchg on f32 is just a 32-bit swap of the raw bits.
define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) {
; GCN1-LABEL: atomic_xchg_f32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_f32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_f32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr float, ptr %out, i32 4
  %val = atomicrmw volatile xchg ptr %gep, float %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Returning xchg at a constant offset: glc keeps the old value in v2, which
; is then flat-stored to %out2. GCN3 folds the 16-byte offset.
define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_xchg_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; xchg through a dynamic 64-bit index plus constant offset; mirrors
; @atomic_or_i32_addr64_offset with flat_atomic_swap.
define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xchg_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Returning xchg with dynamic index + constant offset; mirrors
; @atomic_or_i32_ret_addr64_offset with flat_atomic_swap.
define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Base case xchg directly on %out, result unused; mirrors @atomic_or_i32
; with flat_atomic_swap.
define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_xchg_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_swap v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_swap v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_swap v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Returning base-case xchg: old value from the glc swap in v2 stored to
; %out2; codegen is identical across targets except kernarg offsets.
define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_xchg_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Non-returning flat atomic swap at %out + %index*4: the 64-bit index is
; scaled with s_lshl_b64 by 2 and added to the base with s_add_u32/s_addc_u32.
define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xchg_i32_addr64:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_addr64:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_addr64:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s6
; GCN3-NEXT:    flat_atomic_swap v[0:1], v2
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Returning flat atomic swap at %out + %index*4: scaled-index addressing plus
; the glc bit, with the old value stored to %out2 afterwards.
define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xchg_i32_ret_addr64:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s7
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_xchg_i32_ret_addr64:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s7
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_xchg_i32_ret_addr64:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s8
; GCN3-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; CMP_SWAP: cmpxchg (compare-and-swap) tests follow.
|
|
|
|
; Non-returning flat cmpxchg at %out + 16 bytes. gfx900 (GCN3) folds the
; displacement into the instruction (offset:16); older targets materialize it
; with s_add_u32/s_addc_u32.
define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v3, s3
; GCN1-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v3, s3
; GCN2-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v2, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    v_mov_b32_e32 v3, s7
; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  ret void
}
|
|
|
|
; Returning flat cmpxchg at %out + 16 bytes: the glc bit returns the loaded
; value in v2, and the extracted i32 result is stored to %out2.
define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_ret_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s2, s4, 16
; GCN1-NEXT:    s_addc_u32 s3, s5, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s7
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s2, s4, 16
; GCN2-NEXT:    s_addc_u32 s3, s5, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s7
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_ret_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v3, s1
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    v_mov_b32_e32 v2, s0
; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  %flag = extractvalue { i32, i1 } %val, 0
  store i32 %flag, ptr %out2
  ret void
}
|
|
|
|
; Non-returning flat cmpxchg at %out + %index*4 + 16 bytes. GCN3 folds the
; trailing 16-byte displacement into the instruction; GCN1/GCN2 add it with an
; extra s_add_u32/s_addc_u32 pair after the scaled-index add.
define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT:    s_load_dword s6, s[2:3], 0xb
; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xf
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
; GCN1-NEXT:    v_mov_b32_e32 v1, s2
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT:    s_load_dword s6, s[2:3], 0x2c
; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x3c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
; GCN2-NEXT:    v_mov_b32_e32 v1, s2
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT:    s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s7, s[2:3], 0x3c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v3, s1
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    v_mov_b32_e32 v2, s0
; GCN3-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1] offset:16
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  ret void
}
|
|
|
|
; Returning flat cmpxchg at %out + %index*4 + 16 bytes: scaled-index address,
; glc result in v2, then a flat store of the extracted i32 to %out2.
define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT:    s_load_dword s8, s[2:3], 0xd
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x11
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT:    v_mov_b32_e32 v0, s8
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
; GCN1-NEXT:    v_mov_b32_e32 v1, s2
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s7
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT:    s_load_dword s8, s[2:3], 0x34
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x44
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT:    v_mov_b32_e32 v0, s8
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
; GCN2-NEXT:    v_mov_b32_e32 v1, s2
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s7
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT:    s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s9, s[2:3], 0x44
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT:    v_mov_b32_e32 v0, s8
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v3, s1
; GCN3-NEXT:    v_mov_b32_e32 v1, s9
; GCN3-NEXT:    v_mov_b32_e32 v2, s0
; GCN3-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  %flag = extractvalue { i32, i1 } %val, 0
  store i32 %flag, ptr %out2
  ret void
}
|
|
|
|
; Non-returning flat cmpxchg directly on %out (no offset, no index): all three
; targets emit the same flat_atomic_cmpswap v[addr], v[data] sequence.
define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v3, s3
; GCN1-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v3, s3
; GCN2-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v2, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    v_mov_b32_e32 v3, s7
; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  ret void
}
|
|
|
|
; Returning flat cmpxchg on %out: glc result lands in v2 and the extracted
; i32 is stored to %out2. GCN2 and GCN3 produce identical sequences.
define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_ret:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s4
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
; GCN1-NEXT:    v_mov_b32_e32 v1, s5
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s7
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_ret:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s4
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
; GCN2-NEXT:    v_mov_b32_e32 v1, s5
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s7
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_ret:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v3, s1
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    v_mov_b32_e32 v2, s0
; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
entry:
  %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  %flag = extractvalue { i32, i1 } %val, 0
  store i32 %flag, ptr %out2
  ret void
}
|
|
|
|
; Non-returning flat cmpxchg at %out + %index*4 (scaled-index addressing, no
; extra byte offset).
define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_addr64:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT:    s_load_dword s6, s[2:3], 0xb
; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xf
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
; GCN1-NEXT:    v_mov_b32_e32 v1, s2
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_addr64:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT:    s_load_dword s6, s[2:3], 0x2c
; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x3c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
; GCN2-NEXT:    v_mov_b32_e32 v1, s2
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_addr64:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT:    s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s7, s[2:3], 0x3c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v3, s1
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    v_mov_b32_e32 v2, s0
; GCN3-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  ret void
}
|
|
|
|
; Returning flat cmpxchg at %out + %index*4: glc result in v2, then the
; extracted i32 is stored to %out2.
define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT:    s_load_dword s8, s[2:3], 0xd
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x11
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT:    v_mov_b32_e32 v0, s8
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
; GCN1-NEXT:    v_mov_b32_e32 v1, s2
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s7
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT:    s_load_dword s8, s[2:3], 0x34
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x44
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT:    v_mov_b32_e32 v0, s8
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
; GCN2-NEXT:    v_mov_b32_e32 v1, s2
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s7
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT:    s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s9, s[2:3], 0x44
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT:    v_mov_b32_e32 v0, s8
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v3, s1
; GCN3-NEXT:    v_mov_b32_e32 v1, s9
; GCN3-NEXT:    v_mov_b32_e32 v2, s0
; GCN3-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  %flag = extractvalue { i32, i1 } %val, 0
  store i32 %flag, ptr %out2
  ret void
}
|
|
|
|
; Non-returning flat atomic xor at %out + 16 bytes; GCN3 folds the
; displacement into offset:16, older targets add it with scalar adds.
define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_xor_i32_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_atomic_xor v[0:1], v2 offset:16
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
; Returning flat atomic xor at %out + 16 bytes: glc result in v2, stored to
; %out2 afterwards; GCN3 uses the folded offset:16 form.
define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_xor_i32_ret_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s4, 16
; GCN1-NEXT:    s_addc_u32 s1, s5, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_xor v2, v[0:1], v2 glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s7
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_ret_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s4, 16
; GCN2-NEXT:    s_addc_u32 s1, s5, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_xor v2, v[0:1], v2 glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s7
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_ret_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    v_mov_b32_e32 v2, s0
; GCN3-NEXT:    flat_atomic_xor v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Non-returning flat atomic xor at %out + %index*4 + 16 bytes: scaled-index
; add, then GCN3 folds the 16-byte displacement into offset:16 while GCN1/GCN2
; need a second s_add_u32/s_addc_u32 pair.
define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xor_i32_addr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_addr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_addr64_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s6
; GCN3-NEXT:    flat_atomic_xor v[0:1], v2 offset:16
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; Returning variant of xor with a dynamic index plus a +16 byte immediate offset.
; The atomic result lands in v2 and is stored to %out2. GCN1/GCN2 materialize
; the +16 with scalar adds; GCN3 folds it into the instruction's offset field.
; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) {
; Simplest xor case: no index, no offset, no returned value. The agent-scope
; seq_cst atomicrmw lowers to flat_atomic_xor plus a wait and buffer_wbinvl1_vol.
; GCN1-LABEL: atomic_xor_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_xor v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_xor v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_atomic_xor v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) {
; Returning xor with no index/offset: the glc form of flat_atomic_xor writes
; the old value into v2, which is then stored to %out2 after the cache flush.
; GCN1-LABEL: atomic_xor_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) {
; Non-returning xor with a dynamic 64-bit index: the index is scaled by 4 with
; s_lshl_b64 and added to the base with a scalar add/addc carry pair.
; GCN1-LABEL: atomic_xor_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_xor v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_xor v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_xor v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; Returning xor with a dynamic index but no immediate offset; same addressing
; sequence as the addr64 case, with the glc result stored to %out2.
; GCN1-LABEL: atomic_xor_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
; seq_cst atomic load at base+16: lowers to flat_load_dword with glc plus a
; wait and buffer_wbinvl1_vol; GCN3 folds the 16 bytes into offset:16.
; GCN1-LABEL: atomic_load_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %in, i32 4
  %val = load atomic i32, ptr %gep seq_cst, align 4
  store i32 %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
; Plain seq_cst atomic load: flat_load_dword glc, wait, cache invalidate,
; then a normal flat store of the loaded value to %out.
; GCN1-LABEL: atomic_load_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = load atomic i32, ptr %in seq_cst, align 4
  store i32 %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) {
; Atomic load with dynamic index plus +16: GCN1/GCN2 compute base+index*4+16
; with scalar math; GCN3 keeps only the index add and uses offset:16.
; GCN1-LABEL: atomic_load_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %in, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = load atomic i32, ptr %gep seq_cst, align 4
  store i32 %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) {
; Atomic load at a dynamically indexed address (no immediate offset):
; index*4 is formed with s_lshl_b64 and folded into the base via add/addc.
; GCN1-LABEL: atomic_load_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %in, i64 %index
  %val = load atomic i32, ptr %ptr seq_cst, align 4
  store i32 %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) {
; seq_cst atomic store at base+16: lowers to a plain flat_store_dword (no
; cache invalidate is emitted for the store side); GCN3 uses offset:16.
; GCN1-LABEL: atomic_store_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  store atomic i32 %in, ptr %gep seq_cst, align 4
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) {
; Plain seq_cst atomic store with no index or offset: a single
; flat_store_dword after moving address and data into VGPRs.
; GCN1-LABEL: atomic_store_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  store atomic i32 %in, ptr %out seq_cst, align 4
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 %index) {
; Atomic store with dynamic index plus +16; GCN1/GCN2 add 16 with scalar
; adds, GCN3 folds it into flat_store_dword's offset field.
; GCN1-LABEL: atomic_store_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  store atomic i32 %in, ptr %gep seq_cst, align 4
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index) {
; Atomic store at a dynamically indexed address: index*4 via s_lshl_b64,
; 64-bit base add via add/addc, then a plain flat_store_dword.
; GCN1-LABEL: atomic_store_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb
; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  store atomic i32 %in, ptr %ptr seq_cst, align 4
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
; Float variant of the offset atomic load; codegen is identical to the i32
; case since the value is only moved, never operated on.
; GCN1-LABEL: atomic_load_f32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr float, ptr %in, i32 4
  %val = load atomic float, ptr %gep seq_cst, align 4
  store float %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
; Float variant of the plain seq_cst atomic load; identical lowering to the
; i32 case (flat_load_dword glc, wait, buffer_wbinvl1_vol, store).
; GCN1-LABEL: atomic_load_f32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = load atomic float, ptr %in seq_cst, align 4
  store float %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) {
; Float variant of the indexed-plus-offset atomic load; GCN3 again keeps the
; +16 in the instruction's offset field instead of scalar address math.
; GCN1-LABEL: atomic_load_f32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr float, ptr %in, i64 %index
  %gep = getelementptr float, ptr %ptr, i32 4
  %val = load atomic float, ptr %gep seq_cst, align 4
  store float %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_f32_addr64:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
; GCN1-NEXT:    s_add_u32 s0, s0, s4
; GCN1-NEXT:    s_addc_u32 s1, s1, s5
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_dword v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_f32_addr64:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
; GCN2-NEXT:    s_add_u32 s0, s0, s4
; GCN2-NEXT:    s_addc_u32 s1, s1, s5
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_dword v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_f32_addr64:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_dword v2, v[0:1] glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
; Same as atomic_load_f32_addr64_offset but without the constant offset: all
; three targets compute %in + %index*4 with scalar ALU ops and issue a glc
; flat_load_dword plus waitcnt / buffer_wbinvl1_vol for the seq_cst ordering.
entry:
  %ptr = getelementptr float, ptr %in, i64 %index
  %val = load atomic float, ptr %ptr seq_cst, align 4
  store float %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) {
; GCN1-LABEL: atomic_store_f32_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_f32_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_f32_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT:    s_endpgm
; seq_cst f32 atomic store at %out + 16: a plain flat_store_dword; gfx900
; folds the offset into the instruction while older targets add 16 with
; s_add_u32/s_addc_u32 before building the vaddr pair.
entry:
  %gep = getelementptr float, ptr %out, i32 4
  store atomic float %in, ptr %gep seq_cst, align 4
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) {
; GCN1-LABEL: atomic_store_f32:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_f32:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_f32:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
; Baseline seq_cst f32 atomic store to %out with no addressing arithmetic:
; all three targets emit a single flat_store_dword. Note GCN3 keeps the
; stored value in s4 rather than s2 (different kernarg scheduling).
entry:
  store atomic float %in, ptr %out seq_cst, align 4
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_f32_addr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[6:7], 2
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_f32_addr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[6:7], 2
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_f32_addr64_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s8, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[6:7], 2
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s8
; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT:    s_endpgm
; seq_cst f32 atomic store at %out + %index*4 + 16: index is scaled with
; s_lshl_b64 and added in scalar regs; only gfx900 keeps the trailing 16 in
; the flat_store offset field.
entry:
  %ptr = getelementptr float, ptr %out, i64 %index
  %gep = getelementptr float, ptr %ptr, i32 4
  store atomic float %in, ptr %gep seq_cst, align 4
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_f32_addr64:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[6:7], 2
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_f32_addr64:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[6:7], 2
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_f32_addr64:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s8, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[6:7], 2
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s8
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
; seq_cst f32 atomic store at %out + %index*4 with no extra constant offset;
; identical scalar address math on all three targets, single flat_store_dword.
entry:
  %ptr = getelementptr float, ptr %out, i64 %index
  store atomic float %in, ptr %ptr seq_cst, align 4
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i8_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ubyte v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_byte v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_i8_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ubyte v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_byte v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_i8_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    flat_load_ubyte v2, v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_byte v[0:1], v2
; GCN3-NEXT:    s_endpgm
; seq_cst i8 atomic load at %in + 16: sub-dword atomic loads lower to a glc
; flat_load_ubyte with waitcnt + buffer_wbinvl1_vol; gfx900 folds the 16 into
; the instruction's offset field.
entry:
  %gep = getelementptr i8, ptr %in, i64 16
  %val = load atomic i8, ptr %gep seq_cst, align 1
  store i8 %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i8:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ubyte v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_byte v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_i8:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ubyte v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_byte v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_i8:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    flat_load_ubyte v2, v[0:1] glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_byte v[0:1], v2
; GCN3-NEXT:    s_endpgm
; Baseline seq_cst i8 atomic load from %in: glc flat_load_ubyte (zero-extend)
; with waitcnt + buffer_wbinvl1_vol, result forwarded with flat_store_byte.
entry:
  %val = load atomic i8, ptr %in seq_cst, align 1
  store i8 %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_i8_addr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ubyte v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s7
; GCN1-NEXT:    flat_store_byte v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_i8_addr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ubyte v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s7
; GCN2-NEXT:    flat_store_byte v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_i8_addr64_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_ubyte v2, v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_byte v[0:1], v2
; GCN3-NEXT:    s_endpgm
; seq_cst i8 atomic load at %in + %index + 16: i8 indexing needs no shift, so
; the index is added directly; gfx900 folds the 16 into the flat offset field.
entry:
  %ptr = getelementptr i8, ptr %in, i64 %index
  %gep = getelementptr i8, ptr %ptr, i64 16
  %val = load atomic i8, ptr %gep seq_cst, align 1
  store i8 %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i8_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_byte v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_i8_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_byte v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_i8_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_byte v[0:1], v2 offset:16
; GCN3-NEXT:    s_endpgm
; seq_cst i8 atomic store at %out + 16: single flat_store_byte; only gfx900
; uses the instruction offset field for the constant 16.
entry:
  %gep = getelementptr i8, ptr %out, i64 16
  store atomic i8 %in, ptr %gep seq_cst, align 1
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i8:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_byte v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_i8:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_byte v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_i8:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_byte v[0:1], v2
; GCN3-NEXT:    s_endpgm
; Baseline seq_cst i8 atomic store to %out: a single flat_store_byte on all
; three targets, no fencing instructions expected for the store side here.
entry:
  store atomic i8 %in, ptr %out seq_cst, align 1
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_i8_addr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s4, s6
; GCN1-NEXT:    s_addc_u32 s1, s5, s7
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_byte v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_i8_addr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s4, s6
; GCN2-NEXT:    s_addc_u32 s1, s5, s7
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_byte v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_i8_addr64_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s8, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_add_u32 s0, s4, s6
; GCN3-NEXT:    s_addc_u32 s1, s5, s7
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s8
; GCN3-NEXT:    flat_store_byte v[0:1], v2 offset:16
; GCN3-NEXT:    s_endpgm
; seq_cst i8 atomic store at %out + %index + 16: byte indexing needs no
; scale, so the index halves s6/s7 are added directly to the pointer; gfx900
; folds the 16 into the flat_store offset field.
entry:
  %ptr = getelementptr i8, ptr %out, i64 %index
  %gep = getelementptr i8, ptr %ptr, i64 16
  store atomic i8 %in, ptr %gep seq_cst, align 1
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i16_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_i16_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_i16_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
; seq_cst i16 atomic load at %in + 8*2 bytes (= +16): glc flat_load_ushort
; with waitcnt + buffer_wbinvl1_vol; gfx900 folds the 16 into the flat
; offset field instead of scalar adds.
entry:
  %gep = getelementptr i16, ptr %in, i64 8
  %val = load atomic i16, ptr %gep seq_cst, align 2
  store i16 %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i16:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_i16:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_i16:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
; Baseline seq_cst i16 atomic load from %in: glc flat_load_ushort
; (zero-extend) with waitcnt + buffer_wbinvl1_vol, forwarded via
; flat_store_short to %out.
entry:
  %val = load atomic i16, ptr %in seq_cst, align 2
  store i16 %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_i16_addr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[4:5], s[4:5], 1
; GCN1-NEXT:    s_add_u32 s0, s0, s4
; GCN1-NEXT:    s_addc_u32 s1, s1, s5
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_i16_addr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 1
; GCN2-NEXT:    s_add_u32 s0, s0, s4
; GCN2-NEXT:    s_addc_u32 s1, s1, s5
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_i16_addr64_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
; seq_cst i16 atomic load at %in + %index*2 + 16: note the shift amount is 1
; (element size 2) versus 2 for the f32 variants; gfx900 keeps the constant
; 16 in the flat offset field.
entry:
  %ptr = getelementptr i16, ptr %in, i64 %index
  %gep = getelementptr i16, ptr %ptr, i64 8
  %val = load atomic i16, ptr %gep seq_cst, align 2
  store i16 %val, ptr %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i16_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_i16_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_i16_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT:    s_endpgm
; seq_cst i16 atomic store at %out + 8*2 bytes (= +16): one flat_store_short;
; gfx900 folds the constant into the instruction offset field.
entry:
  %gep = getelementptr i16, ptr %out, i64 8
  store atomic i16 %in, ptr %gep seq_cst, align 2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i16:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_i16:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_i16:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
; Baseline seq_cst i16 atomic store to %out: a single flat_store_short on
; every target; only register assignment differs on gfx900 (s4 vs s2).
entry:
  store atomic i16 %in, ptr %out seq_cst, align 2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_i16_addr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[6:7], 1
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_i16_addr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[6:7], 1
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_i16_addr64_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s8, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[6:7], 1
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s8
; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT:    s_endpgm
; seq_cst i16 atomic store at %out + %index*2 + 16: index scaled by shl 1;
; gfx900 keeps the trailing 16 in the flat_store offset field while gfx7/8
; add it with scalar carry ops.
entry:
  %ptr = getelementptr i16, ptr %out, i64 %index
  %gep = getelementptr i16, ptr %ptr, i64 8
  store atomic i16 %in, ptr %gep seq_cst, align 2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) {
; GCN1-LABEL: atomic_store_f16_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_f16_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_f16_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT:    s_endpgm
entry:
  ; seq_cst atomic half store at %out + 16 bytes; GCN3 folds the offset
  ; into the flat_store_short immediate (see CHECK lines above).
  %gep = getelementptr half, ptr %out, i64 8
  store atomic half %in, ptr %gep seq_cst, align 2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) {
; GCN1-LABEL: atomic_store_f16:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_f16:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_f16:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
entry:
  ; seq_cst atomic half store directly to %out — all targets emit a plain
  ; flat_store_short (see CHECK lines above).
  store atomic half %in, ptr %out seq_cst, align 2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) {
; GCN1-LABEL: atomic_store_bf16_offset:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_bf16_offset:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_bf16_offset:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  ; FIXME(review): %gep is computed but never used — the store below targets
  ; %out, so the "offset" variant this test is named for is not actually
  ; exercised (note no +16 / offset:16 appears in any CHECK line above).
  ; Fixing this requires changing the store to %gep and regenerating the
  ; CHECK lines with update_llc_test_checks.py.
  %gep = getelementptr bfloat, ptr %out, i64 8
  store atomic bfloat %in, ptr %out seq_cst, align 2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) {
; GCN1-LABEL: atomic_store_bf16:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_bf16:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_bf16:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  ; seq_cst atomic bfloat store to %out — lowers to flat_store_short on all
  ; three targets (see CHECK lines above).
  store atomic bfloat %in, ptr %out seq_cst, align 2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_inc_i32_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_atomic_inc v[0:1], v2 offset:16
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  ; agent-scope seq_cst uinc_wrap at %out+16 bytes: lowers to flat_atomic_inc
  ; followed by waitcnt + buffer_wbinvl1_vol; GCN3 folds the offset.
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_inc_i32_max_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 0xffc
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_max_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 0xffc
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_max_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_atomic_inc v[0:1], v2 offset:4092
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  ; 1023*4 = 4092 bytes — the largest offset GCN3 still folds into the
  ; flat instruction immediate (offset:4092 in the CHECK lines above).
  %gep = getelementptr i32, ptr %out, i32 1023
  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_inc_i32_max_offset_p1:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 0x1000
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_max_offset_p1:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 0x1000
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_max_offset_p1:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_atomic_inc v[0:1], v2
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  ; 1024*4 = 4096 bytes — one past the foldable maximum; GCN3 must emit an
  ; explicit 64-bit vector add instead of an immediate offset (see CHECKs).
  %gep = getelementptr i32, ptr %out, i32 1024
  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_inc_i32_ret_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s4, 16
; GCN1-NEXT:    s_addc_u32 s1, s5, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s7
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_ret_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s4, 16
; GCN2-NEXT:    s_addc_u32 s1, s5, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s7
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_ret_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    v_mov_b32_e32 v2, s0
; GCN3-NEXT:    flat_atomic_inc v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
entry:
  ; Value-returning variant: the glc bit on flat_atomic_inc returns the old
  ; value, which is then stored to %out2 with flat_store_dword.
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_inc_i32_incr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_incr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_incr64_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s6
; GCN3-NEXT:    flat_atomic_inc v[0:1], v2 offset:16
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  ; Variable-index address (%out + %index*4) plus constant +16; GCN3 keeps
  ; the variable part as scalar adds and folds only the constant offset.
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s7
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s7
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s8
; GCN3-NEXT:    flat_atomic_inc v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
entry:
  ; Combines the variable-index address, the foldable +16 offset (GCN3), and
  ; the value-returning (glc) form with a store of the old value to %out2.
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_inc_i32:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_inc_i32:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_inc_i32:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_atomic_inc v[0:1], v2
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  ; Base case: uinc_wrap directly on %out, no offset, result discarded.
  %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_inc_i32_ret:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s0, s[2:3], 0xd
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s4
; GCN1-NEXT:    v_mov_b32_e32 v1, s5
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s7
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_ret:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s0, s[2:3], 0x34
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s4
; GCN2-NEXT:    v_mov_b32_e32 v1, s5
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s7
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_ret:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s0, s[2:3], 0x34
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    v_mov_b32_e32 v2, s0
; GCN3-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
entry:
  ; Value-returning base case (glc form); old value stored to %out2.
  %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_inc_i32_incr64:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_incr64:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_incr64:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GCN3-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s6, s[2:3], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s6
; GCN3-NEXT:    flat_atomic_inc v[0:1], v2
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  ; Variable-index address only (%out + %index*4): all targets compute the
  ; address with s_lshl_b64 + scalar add/addc, no immediate offset.
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_inc_i32_ret_incr64:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xf
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xd
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s6
; GCN1-NEXT:    v_mov_b32_e32 v1, s7
; GCN1-NEXT:    flat_store_dword v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_inc_i32_ret_incr64:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x34
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s6
; GCN2-NEXT:    v_mov_b32_e32 v1, s7
; GCN2-NEXT:    flat_store_dword v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_inc_i32_ret_incr64:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x3c
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s8, s[2:3], 0x34
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s8
; GCN3-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s6
; GCN3-NEXT:    v_mov_b32_e32 v1, s7
; GCN3-NEXT:    flat_store_dword v[0:1], v2
; GCN3-NEXT:    s_endpgm
entry:
  ; Variable-index, value-returning variant; the glc result is stored to
  ; %out2 after the cache invalidate.
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_dec_i32_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_atomic_dec v[0:1], v2 offset:16
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  ; udec_wrap counterpart of atomic_inc_i32_offset: flat_atomic_dec at
  ; %out+16, with GCN3 folding the offset into the instruction.
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_dec_i32_max_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 0xffc
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_max_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 0xffc
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_max_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_atomic_dec v[0:1], v2 offset:4092
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  ; 1023*4 = 4092 bytes — largest offset GCN3 folds (offset:4092 above);
  ; older targets add 0xffc with scalar arithmetic.
  %gep = getelementptr i32, ptr %out, i32 1023
  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_dec_i32_max_offset_p1:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN1-NEXT:    s_load_dword s2, s[2:3], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 0x1000
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_max_offset_p1:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 0x1000
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_max_offset_p1:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN3-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_atomic_dec v[0:1], v2
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    s_endpgm
entry:
  ; 1024*4 = 4096 bytes — one past the foldable maximum; GCN3 falls back to
  ; an explicit 64-bit vector add (v_add_co/v_addc_co in the CHECK lines).
  %gep = getelementptr i32, ptr %out, i32 1024
  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
|
|
; GCN1-LABEL: atomic_dec_i32_ret_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s0, s4, 16
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_dec_i32_ret_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s0, s4, 16
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_dec_i32_ret_offset:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr i32, ptr %out, i32 4
|
|
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
|
|
store i32 %val, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_dec_i32_decr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_dec_i32_decr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_dec_i32_decr64_offset:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
|
|
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i32, ptr %out, i64 %index
|
|
%gep = getelementptr i32, ptr %ptr, i32 4
|
|
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_dec_i32_ret_decr64_offset:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i32, ptr %out, i64 %index
|
|
%gep = getelementptr i32, ptr %ptr, i32 4
|
|
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
|
|
store i32 %val, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) {
|
|
; GCN1-LABEL: atomic_dec_i32:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_dec_i32:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_dec_i32:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
|
; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: flat_atomic_dec v[0:1], v2
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) {
|
|
; GCN1-LABEL: atomic_dec_i32_ret:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_dec_i32_ret:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_dec_i32_ret:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
|
|
store i32 %val, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_dec_i32_decr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
|
|
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_dec_i32_decr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_dec_i32_decr64:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
|
|
; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
|
|
; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN3-NEXT: flat_atomic_dec v[0:1], v2
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i32, ptr %out, i64 %index
|
|
%val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_dec_i32_ret_decr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN1-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_dec_i32_ret_decr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN2-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_dec_i32_ret_decr64:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i32, ptr %out, i64 %index
|
|
%val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
|
|
store i32 %val, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
|
|
; GCN1-LABEL: atomic_load_f16_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: flat_store_short v[0:1], v2
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_load_f16_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: flat_store_short v[0:1], v2
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_load_f16_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: flat_store_short v[0:1], v2
|
|
; GCN3-NEXT: s_endpgm
|
|
%gep = getelementptr half, ptr %in, i64 8
|
|
%val = load atomic half, ptr %gep seq_cst, align 2
|
|
store half %val, ptr %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
|
|
; GCN1-LABEL: atomic_load_f16:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: flat_store_short v[0:1], v2
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_load_f16:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: flat_store_short v[0:1], v2
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_load_f16:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: flat_store_short v[0:1], v2
|
|
; GCN3-NEXT: s_endpgm
|
|
%val = load atomic half, ptr %in seq_cst, align 2
|
|
store half %val, ptr %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
|
|
; GCN1-LABEL: atomic_load_bf16_offset:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: flat_store_short v[0:1], v2
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_load_bf16_offset:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: flat_store_short v[0:1], v2
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_load_bf16_offset:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: flat_store_short v[0:1], v2
|
|
; GCN3-NEXT: s_endpgm
|
|
%gep = getelementptr bfloat, ptr %in, i64 8
|
|
%val = load atomic bfloat, ptr %gep seq_cst, align 2
|
|
store bfloat %val, ptr %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
|
|
; GCN1-LABEL: atomic_load_bf16:
|
|
; GCN1: ; %bb.0:
|
|
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: flat_store_short v[0:1], v2
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_load_bf16:
|
|
; GCN2: ; %bb.0:
|
|
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: flat_store_short v[0:1], v2
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_load_bf16:
|
|
; GCN3: ; %bb.0:
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN3-NEXT: flat_store_short v[0:1], v2
|
|
; GCN3-NEXT: s_endpgm
|
|
%val = load atomic bfloat, ptr %in seq_cst, align 2
|
|
store bfloat %val, ptr %out
|
|
ret void
|
|
}
|