Conditions for hoisting vmcnt with flat instructions should be similar to VMEM. If there are use/def pairs in a loop body we cannot guarantee that hoisting the waitcnt will be profitable. Better heuristics are needed to analyse whether gains from avoiding waitcnt in loop bodies outweigh waiting for loads in the preheader. Reviewed By: foad Differential Revision: https://reviews.llvm.org/D151126
3253 lines
127 KiB
LLVM
3253 lines
127 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN1 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN2 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN3 %s
|
|
; Signed 32-bit atomic max (volatile, seq_cst) through a flat pointer located
; 4 x i32 (16 bytes) past %out; the returned old value is unused.
;
; The expected code is a compare-and-swap loop: one flat_load_dword before the
; loop seeds the "expected" value, then each iteration computes v_max_i32 and
; issues flat_atomic_cmpswap until the value read back equals the expected one.
; The s_waitcnt vmcnt(0) that covers the initial load is expected inside the
; loop body, not in the preheader. On gfx900 the +16 is carried as an
; "offset:16" immediate on the flat instructions; on bonaire and tonga it is
; added to the address with s_add_u32/s_addc_u32.
define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_max_i32_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT:    s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s2, 16
; GCN1-NEXT:    s_addc_u32 s1, s3, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_dword v1, v[0:1]
; GCN1-NEXT:    s_mov_b64 s[2:3], 0
; GCN1-NEXT:  .LBB0_1: ; %atomicrmw.start
; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    v_max_i32_e32 v0, s4, v1
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT:    v_mov_b32_e32 v1, v0
; GCN1-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT:    s_cbranch_execnz .LBB0_1
; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_max_i32_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s2, 16
; GCN2-NEXT:    s_addc_u32 s1, s3, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_dword v1, v[0:1]
; GCN2-NEXT:    s_mov_b64 s[2:3], 0
; GCN2-NEXT:  .LBB0_1: ; %atomicrmw.start
; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    v_max_i32_e32 v0, s4, v1
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT:    v_mov_b32_e32 v1, v0
; GCN2-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT:    s_cbranch_execnz .LBB0_1
; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_max_i32_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT:    s_mov_b64 s[0:1], 0
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT:  .LBB0_1: ; %atomicrmw.start
; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT:    v_mov_b32_e32 v2, s2
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    v_max_i32_e32 v0, s4, v1
; GCN3-NEXT:    v_mov_b32_e32 v3, s3
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT:    v_mov_b32_e32 v1, v0
; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT:    s_cbranch_execnz .LBB0_1
; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT:    s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile max ptr %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Same operation as a signed 32-bit atomic max (volatile, seq_cst) at %out plus
; 4 x i32 (16 bytes), but here the old value returned by the atomicrmw is
; stored to %out2.
;
; The expected code is a compare-and-swap loop seeded by a flat_load_dword
; issued before the loop; the s_waitcnt vmcnt(0) for that load is the first
; instruction of the loop body. After the loop, exec is restored with s_or_b64
; and the final value is written with flat_store_dword. On gfx900 the +16 is an
; "offset:16" immediate; on bonaire and tonga it is added with
; s_add_u32/s_addc_u32.
define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_max_i32_ret_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT:    s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT:    s_mov_b64 s[2:3], 0
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s4, 16
; GCN1-NEXT:    s_addc_u32 s1, s5, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_dword v0, v[0:1]
; GCN1-NEXT:  .LBB1_1: ; %atomicrmw.start
; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v1, v0
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    v_max_i32_e32 v0, s8, v1
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT:    s_cbranch_execnz .LBB1_1
; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT:    s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT:    v_mov_b32_e32 v1, s6
; GCN1-NEXT:    v_mov_b32_e32 v2, s7
; GCN1-NEXT:    flat_store_dword v[1:2], v0
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT:    s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT:    s_mov_b64 s[2:3], 0
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s4, 16
; GCN2-NEXT:    s_addc_u32 s1, s5, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_dword v0, v[0:1]
; GCN2-NEXT:  .LBB1_1: ; %atomicrmw.start
; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v1, v0
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    v_max_i32_e32 v0, s8, v1
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT:    s_cbranch_execnz .LBB1_1
; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT:    s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT:    v_mov_b32_e32 v1, s6
; GCN2-NEXT:    v_mov_b32_e32 v2, s7
; GCN2-NEXT:    flat_store_dword v[1:2], v0
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT:    s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT:    s_mov_b64 s[0:1], 0
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT:  .LBB1_1: ; %atomicrmw.start
; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v1, v0
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    v_mov_b32_e32 v3, s5
; GCN3-NEXT:    v_max_i32_e32 v0, s2, v1
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT:    s_cbranch_execnz .LBB1_1
; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT:    v_mov_b32_e32 v1, s6
; GCN3-NEXT:    v_mov_b32_e32 v2, s7
; GCN3-NEXT:    flat_store_dword v[1:2], v0
; GCN3-NEXT:    s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile max ptr %gep, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Signed 32-bit atomic max (volatile, seq_cst) on the element at
; %out[%index + 4]; the returned old value is unused.
;
; The expected code scales the 64-bit index with s_lshl_b64 by 2 and adds it to
; the base pointer before entering a compare-and-swap loop. The loop is seeded
; by a flat_load_dword issued before it, and the s_waitcnt vmcnt(0) covering
; that load is expected inside the loop body. On gfx900 the constant +16 bytes
; is an "offset:16" immediate; on bonaire and tonga it costs a second
; s_add_u32/s_addc_u32 pair.
define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_addr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT:    s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT:    s_add_u32 s0, s6, s0
; GCN1-NEXT:    s_addc_u32 s1, s7, s1
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_dword v1, v[0:1]
; GCN1-NEXT:    s_mov_b64 s[2:3], 0
; GCN1-NEXT:  .LBB2_1: ; %atomicrmw.start
; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    v_max_i32_e32 v0, s4, v1
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT:    v_mov_b32_e32 v1, v0
; GCN1-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT:    s_cbranch_execnz .LBB2_1
; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_max_i32_addr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT:    s_add_u32 s0, s6, s0
; GCN2-NEXT:    s_addc_u32 s1, s7, s1
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_dword v1, v[0:1]
; GCN2-NEXT:    s_mov_b64 s[2:3], 0
; GCN2-NEXT:  .LBB2_1: ; %atomicrmw.start
; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    v_max_i32_e32 v0, s4, v1
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT:    v_mov_b32_e32 v1, v0
; GCN2-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT:    s_cbranch_execnz .LBB2_1
; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_max_i32_addr64_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT:    s_add_u32 s0, s6, s0
; GCN3-NEXT:    s_addc_u32 s1, s7, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT:    s_mov_b64 s[2:3], 0
; GCN3-NEXT:  .LBB2_1: ; %atomicrmw.start
; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT:    v_mov_b32_e32 v3, s1
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    v_max_i32_e32 v0, s4, v1
; GCN3-NEXT:    v_mov_b32_e32 v2, s0
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT:    v_mov_b32_e32 v1, v0
; GCN3-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT:    s_cbranch_execnz .LBB2_1
; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile max ptr %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Signed 32-bit atomic max (volatile, seq_cst) on the element at
; %out[%index + 4], with the old value returned by the atomicrmw stored to
; %out2.
;
; The expected code scales the 64-bit index with s_lshl_b64 by 2, adds it to
; the base pointer, and runs a compare-and-swap loop seeded by a
; flat_load_dword issued before the loop. The s_waitcnt vmcnt(0) for that load
; is the first instruction of the loop body. After the loop, exec is restored
; and the result is written with flat_store_dword. On gfx900 the constant +16
; bytes is an "offset:16" immediate; on bonaire and tonga it is a second
; s_add_u32/s_addc_u32 pair.
define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT:    s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT:    s_add_u32 s0, s4, s0
; GCN1-NEXT:    s_addc_u32 s1, s5, s1
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_dword v0, v[0:1]
; GCN1-NEXT:    s_mov_b64 s[2:3], 0
; GCN1-NEXT:  .LBB3_1: ; %atomicrmw.start
; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v1, v0
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    v_max_i32_e32 v0, s8, v1
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT:    s_cbranch_execnz .LBB3_1
; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT:    s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT:    v_mov_b32_e32 v1, s6
; GCN1-NEXT:    v_mov_b32_e32 v2, s7
; GCN1-NEXT:    flat_store_dword v[1:2], v0
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT:    s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT:    s_add_u32 s0, s4, s0
; GCN2-NEXT:    s_addc_u32 s1, s5, s1
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_dword v0, v[0:1]
; GCN2-NEXT:    s_mov_b64 s[2:3], 0
; GCN2-NEXT:  .LBB3_1: ; %atomicrmw.start
; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v1, v0
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    v_max_i32_e32 v0, s8, v1
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT:    s_cbranch_execnz .LBB3_1
; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT:    s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT:    v_mov_b32_e32 v1, s6
; GCN2-NEXT:    v_mov_b32_e32 v2, s7
; GCN2-NEXT:    flat_store_dword v[1:2], v0
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT:    s_load_dword s8, s[0:1], 0x34
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT:    s_add_u32 s0, s4, s0
; GCN3-NEXT:    s_addc_u32 s1, s5, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT:    s_mov_b64 s[2:3], 0
; GCN3-NEXT:  .LBB3_1: ; %atomicrmw.start
; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v1, v0
; GCN3-NEXT:    v_mov_b32_e32 v3, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s0
; GCN3-NEXT:    v_max_i32_e32 v0, s8, v1
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT:    s_cbranch_execnz .LBB3_1
; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT:    s_or_b64 exec, exec, s[2:3]
; GCN3-NEXT:    v_mov_b32_e32 v1, s6
; GCN3-NEXT:    v_mov_b32_e32 v2, s7
; GCN3-NEXT:    flat_store_dword v[1:2], v0
; GCN3-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile max ptr %gep, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Signed 32-bit atomic max (volatile, seq_cst) directly on %out, with no
; address offset; the returned old value is unused.
;
; The expected code is a compare-and-swap loop: a flat_load_dword before the
; loop provides the initial expected value, and each iteration computes
; v_max_i32 and retries flat_atomic_cmpswap until the value read back matches.
; The s_waitcnt vmcnt(0) that covers the initial load is expected inside the
; loop body. The three targets differ only in the kernel-argument load offsets.
define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_max_i32:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT:    s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT:    s_mov_b64 s[0:1], 0
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_load_dword v1, v[0:1]
; GCN1-NEXT:  .LBB4_1: ; %atomicrmw.start
; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    v_max_i32_e32 v0, s4, v1
; GCN1-NEXT:    v_mov_b32_e32 v3, s3
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT:    v_mov_b32_e32 v1, v0
; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT:    s_cbranch_execnz .LBB4_1
; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_max_i32:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT:    s_mov_b64 s[0:1], 0
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_load_dword v1, v[0:1]
; GCN2-NEXT:  .LBB4_1: ; %atomicrmw.start
; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    v_max_i32_e32 v0, s4, v1
; GCN2-NEXT:    v_mov_b32_e32 v3, s3
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT:    v_mov_b32_e32 v1, v0
; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT:    s_cbranch_execnz .LBB4_1
; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_max_i32:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT:    s_mov_b64 s[0:1], 0
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    flat_load_dword v1, v[0:1]
; GCN3-NEXT:  .LBB4_1: ; %atomicrmw.start
; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT:    v_mov_b32_e32 v2, s2
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    v_max_i32_e32 v0, s4, v1
; GCN3-NEXT:    v_mov_b32_e32 v3, s3
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT:    v_mov_b32_e32 v1, v0
; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT:    s_cbranch_execnz .LBB4_1
; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT:    s_endpgm
entry:
  %val = atomicrmw volatile max ptr %out, i32 %in seq_cst
  ret void
}
|
|
|
|
; Signed 32-bit atomic max (volatile, seq_cst) directly on %out, with the old
; value returned by the atomicrmw stored to %out2.
;
; The expected code is a compare-and-swap loop seeded by a flat_load_dword
; issued before the loop; the s_waitcnt vmcnt(0) for that load is the first
; instruction of the loop body. After the loop, exec is restored with s_or_b64
; and the result is written with flat_store_dword. The three targets differ
; only in the kernel-argument load offsets.
define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_max_i32_ret:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT:    s_load_dword s2, s[0:1], 0xd
; GCN1-NEXT:    s_mov_b64 s[0:1], 0
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s4
; GCN1-NEXT:    v_mov_b32_e32 v1, s5
; GCN1-NEXT:    flat_load_dword v0, v[0:1]
; GCN1-NEXT:  .LBB5_1: ; %atomicrmw.start
; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v1, v0
; GCN1-NEXT:    v_mov_b32_e32 v2, s4
; GCN1-NEXT:    v_mov_b32_e32 v3, s5
; GCN1-NEXT:    v_max_i32_e32 v0, s2, v1
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT:    s_cbranch_execnz .LBB5_1
; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN1-NEXT:    v_mov_b32_e32 v1, s6
; GCN1-NEXT:    v_mov_b32_e32 v2, s7
; GCN1-NEXT:    flat_store_dword v[1:2], v0
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT:    s_load_dword s2, s[0:1], 0x34
; GCN2-NEXT:    s_mov_b64 s[0:1], 0
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s4
; GCN2-NEXT:    v_mov_b32_e32 v1, s5
; GCN2-NEXT:    flat_load_dword v0, v[0:1]
; GCN2-NEXT:  .LBB5_1: ; %atomicrmw.start
; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v1, v0
; GCN2-NEXT:    v_mov_b32_e32 v2, s4
; GCN2-NEXT:    v_mov_b32_e32 v3, s5
; GCN2-NEXT:    v_max_i32_e32 v0, s2, v1
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT:    s_cbranch_execnz .LBB5_1
; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN2-NEXT:    v_mov_b32_e32 v1, s6
; GCN2-NEXT:    v_mov_b32_e32 v2, s7
; GCN2-NEXT:    flat_store_dword v[1:2], v0
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT:    s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT:    s_mov_b64 s[0:1], 0
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s4
; GCN3-NEXT:    v_mov_b32_e32 v1, s5
; GCN3-NEXT:    flat_load_dword v0, v[0:1]
; GCN3-NEXT:  .LBB5_1: ; %atomicrmw.start
; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v1, v0
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    v_mov_b32_e32 v3, s5
; GCN3-NEXT:    v_max_i32_e32 v0, s2, v1
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT:    s_cbranch_execnz .LBB5_1
; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT:    v_mov_b32_e32 v1, s6
; GCN3-NEXT:    v_mov_b32_e32 v2, s7
; GCN3-NEXT:    flat_store_dword v[1:2], v0
; GCN3-NEXT:    s_endpgm
entry:
  %val = atomicrmw volatile max ptr %out, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Signed 32-bit atomic max (volatile, seq_cst) on the element at %out[%index],
; with no constant offset; the returned old value is unused.
;
; The expected code scales the 64-bit index with s_lshl_b64 by 2 and adds it to
; the base pointer, then runs a compare-and-swap loop seeded by a
; flat_load_dword issued before the loop. The s_waitcnt vmcnt(0) covering that
; load is expected inside the loop body. The three targets differ only in the
; kernel-argument load offsets.
define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_addr64:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT:    s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT:    s_add_u32 s0, s6, s0
; GCN1-NEXT:    s_addc_u32 s1, s7, s1
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_dword v1, v[0:1]
; GCN1-NEXT:    s_mov_b64 s[2:3], 0
; GCN1-NEXT:  .LBB6_1: ; %atomicrmw.start
; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT:    v_mov_b32_e32 v3, s1
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    v_max_i32_e32 v0, s4, v1
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT:    v_mov_b32_e32 v1, v0
; GCN1-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT:    s_cbranch_execnz .LBB6_1
; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_max_i32_addr64:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT:    s_add_u32 s0, s6, s0
; GCN2-NEXT:    s_addc_u32 s1, s7, s1
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_dword v1, v[0:1]
; GCN2-NEXT:    s_mov_b64 s[2:3], 0
; GCN2-NEXT:  .LBB6_1: ; %atomicrmw.start
; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT:    v_mov_b32_e32 v3, s1
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    v_max_i32_e32 v0, s4, v1
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT:    v_mov_b32_e32 v1, v0
; GCN2-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT:    s_cbranch_execnz .LBB6_1
; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_max_i32_addr64:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT:    s_add_u32 s0, s6, s0
; GCN3-NEXT:    s_addc_u32 s1, s7, s1
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_dword v1, v[0:1]
; GCN3-NEXT:    s_mov_b64 s[2:3], 0
; GCN3-NEXT:  .LBB6_1: ; %atomicrmw.start
; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT:    v_mov_b32_e32 v3, s1
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    v_max_i32_e32 v0, s4, v1
; GCN3-NEXT:    v_mov_b32_e32 v2, s0
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT:    v_mov_b32_e32 v1, v0
; GCN3-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT:    s_cbranch_execnz .LBB6_1
; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT:    s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile max ptr %ptr, i32 %in seq_cst
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_max_i32_ret_addr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
|
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: v_max_i32_e32 v0, s8, v1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB7_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN1-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_max_i32_ret_addr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: v_max_i32_e32 v0, s8, v1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB7_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN2-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_max_i32_ret_addr64:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN3-NEXT: v_max_i32_e32 v0, s8, v1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB7_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN3-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i32, ptr %out, i64 %index
|
|
%val = atomicrmw volatile max ptr %ptr, i32 %in seq_cst
|
|
store i32 %val, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
; Non-returning seq_cst `atomicrmw volatile umax` through a flat pointer that is
; 4 x i32 (16 bytes) past %out; the result %val is unused.
; The expected code for all three targets (bonaire, tonga, gfx900) is a
; compare-and-swap loop: flat_load_dword seeds the old value, v_max_u32 forms
; the new one, flat_atomic_cmpswap attempts the exchange, and
; s_cbranch_execnz loops while any lane's v_cmp_eq_u32 has not yet succeeded.
; bonaire/tonga materialize the +16 with s_add_u32/s_addc_u32, gfx900 folds it
; into the offset:16 field of the flat instructions.
; The s_waitcnt vmcnt(0) covering the initial load is expected inside the loop
; body, i.e. it is not hoisted into the preheader.
define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umax_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s2, 16
; GCN1-NEXT: s_addc_u32 s1, s3, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB8_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_max_u32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB8_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s2, 16
; GCN2-NEXT: s_addc_u32 s1, s3, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB8_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_u32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB8_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT: .LBB8_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_u32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB8_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile umax ptr %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Returning seq_cst `atomicrmw volatile umax` through a flat pointer that is
; 4 x i32 (16 bytes) past %out; the value produced by the atomic is stored to
; %out2.
; The expected code for all three targets (bonaire, tonga, gfx900) is a
; compare-and-swap loop (flat_load_dword, v_max_u32, flat_atomic_cmpswap,
; v_cmp_eq_u32, s_cbranch_execnz). After the loop exec is restored with
; s_or_b64 and v0 is written out with flat_store_dword.
; bonaire/tonga materialize the +16 with s_add_u32/s_addc_u32, gfx900 folds it
; into the offset:16 field of the flat instructions.
; The s_waitcnt vmcnt(0) covering the initial load is expected as the first
; instruction of the loop body, i.e. it is not hoisted into the preheader.
define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umax_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: .LBB9_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_max_u32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB9_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: .LBB9_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_max_u32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB9_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT: .LBB9_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: v_max_u32_e32 v0, s2, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB9_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile umax ptr %gep, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Non-returning seq_cst `atomicrmw volatile umax` through a flat pointer formed
; as %out indexed by the i64 %index (i32 elements) plus a further 4 x i32
; (16 bytes); the result %val is unused.
; The expected code for all three targets (bonaire, tonga, gfx900) scales the
; index with s_lshl_b64 ..., 2, adds it to the base with s_add_u32/s_addc_u32,
; then runs a compare-and-swap loop (flat_load_dword, v_max_u32,
; flat_atomic_cmpswap, v_cmp_eq_u32, s_cbranch_execnz).
; bonaire/tonga add the constant 16 with a second s_add_u32/s_addc_u32 pair,
; gfx900 folds it into the offset:16 field of the flat instructions.
; The s_waitcnt vmcnt(0) covering the initial load is expected inside the loop
; body, i.e. it is not hoisted into the preheader.
define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB10_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_max_u32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB10_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB10_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_u32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB10_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s6, s0
; GCN3-NEXT: s_addc_u32 s1, s7, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB10_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_u32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB10_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile umax ptr %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Returning seq_cst `atomicrmw volatile umax` through a flat pointer formed as
; %out indexed by the i64 %index (i32 elements) plus a further 4 x i32
; (16 bytes); the value produced by the atomic is stored to %out2.
; The expected code for all three targets (bonaire, tonga, gfx900) scales the
; index with s_lshl_b64 ..., 2, adds it to the base with s_add_u32/s_addc_u32,
; then runs a compare-and-swap loop (flat_load_dword, v_max_u32,
; flat_atomic_cmpswap, v_cmp_eq_u32, s_cbranch_execnz). After the loop exec is
; restored with s_or_b64 and v0 is written out with flat_store_dword.
; bonaire/tonga add the constant 16 with a second s_add_u32/s_addc_u32 pair,
; gfx900 folds it into the offset:16 field of the flat instructions.
; The s_waitcnt vmcnt(0) covering the initial load is expected as the first
; instruction of the loop body, i.e. it is not hoisted into the preheader.
define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB11_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_max_u32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB11_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB11_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_max_u32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB11_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB11_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: v_max_u32_e32 v0, s8, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB11_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile umax ptr %gep, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Non-returning seq_cst `atomicrmw volatile umax` directly on the flat pointer
; %out (no offset, no index); the result %val is unused.
; The expected code is the same shape on all three targets (bonaire, tonga,
; gfx900): a compare-and-swap loop in which flat_load_dword seeds the old
; value, v_max_u32 forms the new one, flat_atomic_cmpswap attempts the
; exchange, and s_cbranch_execnz loops while any lane's v_cmp_eq_u32 has not
; yet succeeded. Only the kernel-argument load offsets differ between bonaire
; and the other two.
; The s_waitcnt vmcnt(0) covering the initial load is expected inside the loop
; body, i.e. it is not hoisted into the preheader.
define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umax_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: .LBB12_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_max_u32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB12_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: .LBB12_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_u32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB12_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_load_dword v1, v[0:1]
; GCN3-NEXT: .LBB12_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_u32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB12_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile umax ptr %out, i32 %in seq_cst
  ret void
}
|
|
|
|
; Returning seq_cst `atomicrmw volatile umax` directly on the flat pointer %out
; (no offset, no index); the value produced by the atomic is stored to %out2.
; The expected code is the same shape on all three targets (bonaire, tonga,
; gfx900): a compare-and-swap loop (flat_load_dword, v_max_u32,
; flat_atomic_cmpswap, v_cmp_eq_u32, s_cbranch_execnz). After the loop exec is
; restored with s_or_b64 and v0 is written out with flat_store_dword. Only the
; kernel-argument load offsets differ between bonaire and the other two.
; The s_waitcnt vmcnt(0) covering the initial load is expected as the first
; instruction of the loop body, i.e. it is not hoisted into the preheader.
define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umax_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: .LBB13_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: v_max_u32_e32 v0, s2, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB13_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: .LBB13_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: v_max_u32_e32 v0, s2, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB13_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
; GCN3-NEXT: .LBB13_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: v_max_u32_e32 v0, s2, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB13_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile umax ptr %out, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Non-returning seq_cst `atomicrmw volatile umax` through a flat pointer formed
; as %out indexed by the i64 %index (i32 elements), with no additional constant
; offset; the result %val is unused.
; The expected code is the same shape on all three targets (bonaire, tonga,
; gfx900): the index is scaled with s_lshl_b64 ..., 2 and added to the base
; with s_add_u32/s_addc_u32, followed by a compare-and-swap loop
; (flat_load_dword, v_max_u32, flat_atomic_cmpswap, v_cmp_eq_u32,
; s_cbranch_execnz). Only the kernel-argument load offsets differ between
; bonaire and the other two.
; The s_waitcnt vmcnt(0) covering the initial load is expected inside the loop
; body, i.e. it is not hoisted into the preheader.
define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB14_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_max_u32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB14_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB14_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_u32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB14_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s6, s0
; GCN3-NEXT: s_addc_u32 s1, s7, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v1, v[0:1]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB14_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_u32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB14_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile umax ptr %ptr, i32 %in seq_cst
  ret void
}
|
|
|
|
; Unsigned-max atomicrmw on %out[%index]; the value returned by the atomic is
; stored to %out2. All three check prefixes expect a compare-and-swap loop: an
; initial flat_load_dword, then v_max_u32 + flat_atomic_cmpswap repeated until
; the value returned by the cmpswap equals the value that was expected.
define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umax_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB15_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_max_u32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB15_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB15_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_max_u32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB15_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v0, v[0:1]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB15_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: v_max_u32_e32 v0, s8, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB15_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile umax ptr %ptr, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Signed-min atomicrmw on %out + 4 x i32; the result of the atomic is unused.
; The expected code is a compare-and-swap loop (flat_load_dword, then
; v_min_i32 + flat_atomic_cmpswap). The GCN3 checks carry the 16-byte offset on
; the flat instructions (offset:16); the other two prefixes add it to the
; address with s_add_u32/s_addc_u32.
define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_min_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s2, 16
; GCN1-NEXT: s_addc_u32 s1, s3, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB16_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_min_i32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB16_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s2, 16
; GCN2-NEXT: s_addc_u32 s1, s3, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB16_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_i32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB16_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT: .LBB16_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_i32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB16_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Signed-min atomicrmw on %out + 4 x i32; the value returned by the atomic is
; stored to %out2. The expected code is a compare-and-swap loop followed by a
; flat_store_dword of the result. The GCN3 checks carry the 16-byte offset on
; the flat instructions (offset:16); the other two prefixes add it to the
; address with s_add_u32/s_addc_u32.
define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_min_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: .LBB17_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_min_i32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB17_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: .LBB17_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_min_i32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB17_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT: .LBB17_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: v_min_i32_e32 v0, s2, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB17_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Signed-min atomicrmw on %out[%index] + 4 x i32; the result of the atomic is
; unused. The expected code scales %index with s_lshl_b64 by 2, adds it to the
; base pointer, and then runs a compare-and-swap loop. The GCN3 checks carry
; the extra 16-byte offset on the flat instructions (offset:16); the other two
; prefixes add it with a second s_add_u32/s_addc_u32 pair.
define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB18_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_min_i32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB18_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB18_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_i32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB18_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s6, s0
; GCN3-NEXT: s_addc_u32 s1, s7, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB18_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_i32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB18_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Signed-min atomicrmw on %out[%index] + 4 x i32; the value returned by the
; atomic is stored to %out2. The expected code scales %index with s_lshl_b64
; by 2, adds it to the base pointer, runs a compare-and-swap loop, and then
; stores the result with flat_store_dword. The GCN3 checks carry the extra
; 16-byte offset on the flat instructions (offset:16); the other two prefixes
; add it with a second s_add_u32/s_addc_u32 pair.
define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB19_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_min_i32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB19_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB19_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_min_i32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB19_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB19_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: v_min_i32_e32 v0, s8, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB19_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Signed-min atomicrmw directly on %out; the result of the atomic is unused.
; All three check prefixes expect a compare-and-swap loop: an initial
; flat_load_dword, then v_min_i32 + flat_atomic_cmpswap repeated until the
; value returned by the cmpswap equals the value that was expected.
define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_min_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: .LBB20_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_min_i32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB20_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: .LBB20_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_i32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB20_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_load_dword v1, v[0:1]
; GCN3-NEXT: .LBB20_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_i32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB20_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile min ptr %out, i32 %in seq_cst
  ret void
}
|
|
|
|
; Signed-min atomicrmw directly on %out; the value returned by the atomic is
; stored to %out2. All three check prefixes expect a compare-and-swap loop
; (flat_load_dword, then v_min_i32 + flat_atomic_cmpswap) followed by a
; flat_store_dword of the result.
define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_min_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: .LBB21_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: v_min_i32_e32 v0, s2, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB21_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: .LBB21_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: v_min_i32_e32 v0, s2, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB21_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
; GCN3-NEXT: .LBB21_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: v_min_i32_e32 v0, s2, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB21_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile min ptr %out, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
|
|
|
|
; Signed-min atomicrmw on %out[%index]; the result of the atomic is unused.
; All three check prefixes expect %index to be scaled with s_lshl_b64 by 2 and
; added to the base pointer, followed by a compare-and-swap loop
; (flat_load_dword, then v_min_i32 + flat_atomic_cmpswap).
define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB22_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_min_i32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB22_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB22_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_i32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB22_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s6, s0
; GCN3-NEXT: s_addc_u32 s1, s7, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v1, v[0:1]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB22_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_i32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB22_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile min ptr %ptr, i32 %in seq_cst
  ret void
}
|
|
|
|
; Signed `min` atomicrmw (volatile, seq_cst) on the i32 element %out[%index];
; the value produced by the atomicrmw is stored to %out2.
; The checks for all three prefixes expect an initial flat_load_dword, then a
; v_min_i32 + flat_atomic_cmpswap retry loop (%atomicrmw.start) that begins with
; an s_waitcnt vmcnt(0) inside the loop, and a final flat_store_dword.
define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_min_i32_ret_addr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
|
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN1-NEXT: .LBB23_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: v_min_i32_e32 v0, s8, v1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB23_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN1-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_min_i32_ret_addr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN2-NEXT: .LBB23_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: v_min_i32_e32 v0, s8, v1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB23_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN2-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_min_i32_ret_addr64:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN3-NEXT: .LBB23_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN3-NEXT: v_min_i32_e32 v0, s8, v1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB23_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN3-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i32, ptr %out, i64 %index
|
|
%val = atomicrmw volatile min ptr %ptr, i32 %in seq_cst
|
|
store i32 %val, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
; Unsigned `umin` atomicrmw (volatile, seq_cst) on the i32 four elements past
; %out (getelementptr i32 ... 4); the atomicrmw result is unused.
; The checks expect a v_min_u32 + flat_atomic_cmpswap retry loop. For GCN1 and
; GCN2 the 16-byte displacement is added with s_add_u32/s_addc_u32, whereas the
; GCN3 checks carry it as `offset:16` on the flat load and cmpswap.
define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) {
|
|
; GCN1-LABEL: atomic_umin_i32_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s0, s2, 16
|
|
; GCN1-NEXT: s_addc_u32 s1, s3, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN1-NEXT: .LBB24_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_min_u32_e32 v0, s4, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB24_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i32_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s0, s2, 16
|
|
; GCN2-NEXT: s_addc_u32 s1, s3, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN2-NEXT: .LBB24_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_min_u32_e32 v0, s4, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB24_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_umin_i32_offset:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GCN3-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
|
|
; GCN3-NEXT: .LBB24_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_min_u32_e32 v0, s4, v1
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s3
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB24_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr i32, ptr %out, i32 4
|
|
%val = atomicrmw volatile umin ptr %gep, i32 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Unsigned `umin` atomicrmw (volatile, seq_cst) on the i32 four elements past
; %out; the atomicrmw result is stored to %out2.
; The checks expect a v_min_u32 + flat_atomic_cmpswap retry loop followed by a
; flat_store_dword of the result. For GCN1 and GCN2 the 16-byte displacement is
; added with scalar adds; the GCN3 checks use `offset:16` on the flat
; instructions instead.
define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
|
|
; GCN1-LABEL: atomic_umin_i32_ret_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
|
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_add_u32 s0, s4, 16
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN1-NEXT: .LBB25_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: v_min_u32_e32 v0, s8, v1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB25_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN1-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i32_ret_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_add_u32 s0, s4, 16
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN2-NEXT: .LBB25_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: v_min_u32_e32 v0, s8, v1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB25_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN2-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_umin_i32_ret_offset:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
|
|
; GCN3-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
|
|
; GCN3-NEXT: .LBB25_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: v_min_u32_e32 v0, s2, v1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB25_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN3-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr i32, ptr %out, i32 4
|
|
%val = atomicrmw volatile umin ptr %gep, i32 %in seq_cst
|
|
store i32 %val, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
; Unsigned `umin` atomicrmw (volatile, seq_cst) on the i32 at
; %out[%index] plus four further elements; the atomicrmw result is unused.
; The checks expect the index to be scaled with s_lshl_b64 ... 2 and added to
; the base, then a v_min_u32 + flat_atomic_cmpswap retry loop. For GCN1 and GCN2
; the extra 16 bytes are added with a second s_add_u32/s_addc_u32 pair; the
; GCN3 checks use `offset:16` on the flat instructions instead.
define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_umin_i32_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
|
|
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
|
|
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN1-NEXT: s_add_u32 s0, s6, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s7, s1
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN1-NEXT: .LBB26_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_min_u32_e32 v0, s4, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB26_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i32_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
|
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
|
|
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN2-NEXT: s_add_u32 s0, s6, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s7, s1
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN2-NEXT: .LBB26_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_min_u32_e32 v0, s4, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB26_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_umin_i32_addr64_offset:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
|
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
|
|
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN3-NEXT: s_add_u32 s0, s6, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s7, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
|
|
; GCN3-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN3-NEXT: .LBB26_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_min_u32_e32 v0, s4, v1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB26_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i32, ptr %out, i64 %index
|
|
%gep = getelementptr i32, ptr %ptr, i32 4
|
|
%val = atomicrmw volatile umin ptr %gep, i32 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Unsigned `umin` atomicrmw (volatile, seq_cst) on the i32 at
; %out[%index] plus four further elements; the atomicrmw result is stored to
; %out2.
; The checks expect the scaled index to be added to the base, a v_min_u32 +
; flat_atomic_cmpswap retry loop, and a final flat_store_dword. For GCN1 and
; GCN2 the extra 16 bytes are added with scalar adds; the GCN3 checks use
; `offset:16` on the flat load and cmpswap instead.
define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
|
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN1-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN1-NEXT: .LBB27_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: v_min_u32_e32 v0, s8, v1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB27_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN1-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN2-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN2-NEXT: .LBB27_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: v_min_u32_e32 v0, s8, v1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB27_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN2-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
|
|
; GCN3-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN3-NEXT: .LBB27_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN3-NEXT: v_min_u32_e32 v0, s8, v1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB27_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN3-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i32, ptr %out, i64 %index
|
|
%gep = getelementptr i32, ptr %ptr, i32 4
|
|
%val = atomicrmw volatile umin ptr %gep, i32 %in seq_cst
|
|
store i32 %val, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
; Unsigned `umin` atomicrmw (volatile, seq_cst) directly on the i32 at %out,
; with no offset or index; the atomicrmw result is unused.
; The checks are the same shape for all three prefixes (only the kernel-argument
; load offsets differ): a flat_load_dword, then a v_min_u32 +
; flat_atomic_cmpswap retry loop with an s_waitcnt vmcnt(0) inside the loop.
define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) {
|
|
; GCN1-LABEL: atomic_umin_i32:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
|
|
; GCN1-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: .LBB28_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_min_u32_e32 v0, s4, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s3
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB28_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i32:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GCN2-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: .LBB28_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_min_u32_e32 v0, s4, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s3
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB28_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_umin_i32:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GCN3-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN3-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN3-NEXT: .LBB28_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_min_u32_e32 v0, s4, v1
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s3
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB28_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%val = atomicrmw volatile umin ptr %out, i32 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Unsigned `umin` atomicrmw (volatile, seq_cst) directly on the i32 at %out;
; the atomicrmw result is stored to %out2.
; The checks are the same shape for all three prefixes (only the kernel-argument
; load offsets differ): a flat_load_dword, a v_min_u32 + flat_atomic_cmpswap
; retry loop that begins with s_waitcnt vmcnt(0), then a flat_store_dword of
; the result.
define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) {
|
|
; GCN1-LABEL: atomic_umin_i32_ret:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
|
; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
|
|
; GCN1-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN1-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN1-NEXT: .LBB29_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN1-NEXT: v_min_u32_e32 v0, s2, v1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB29_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN1-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i32_ret:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
|
|
; GCN2-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN2-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN2-NEXT: .LBB29_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN2-NEXT: v_min_u32_e32 v0, s2, v1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB29_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN2-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_umin_i32_ret:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
|
|
; GCN3-NEXT: s_mov_b64 s[0:1], 0
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN3-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN3-NEXT: .LBB29_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN3-NEXT: v_min_u32_e32 v0, s2, v1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB29_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN3-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN3-NEXT: s_endpgm
|
|
entry:
|
|
%val = atomicrmw volatile umin ptr %out, i32 %in seq_cst
|
|
store i32 %val, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
; Test: seq_cst volatile `atomicrmw umin` on element %index of the i32 array
; at %out; the value returned by the atomic is not used.
;
; The expectation lines below are autogenerated (see the NOTE at the top of
; the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand. The GCN1 / GCN2 / GCN3 prefixes are the bonaire /
; tonga / gfx900 RUN lines respectively. In this function the three
; expectation blocks differ only in the offsets of the initial s_load_*
; instructions.
;
; Shape of the expected code, as spelled out by the expectations:
;   * the address is formed as %out + (%index << 2) and the current value is
;     fetched once with flat_load_dword before the loop;
;   * the loop (.LBB30_1, %atomicrmw.start) computes v_min_u32 of %in and the
;     last seen value, then issues flat_atomic_cmpswap ... glc;
;   * v_cmp_eq_u32 / s_or_b64 / s_andn2_b64 exec / s_cbranch_execnz repeat
;     the loop until the swap returned the value that was expected.
; Note that the `s_waitcnt vmcnt(0) lgkmcnt(0)` covering the pre-loop
; flat_load_dword is expected inside the loop body (after the loop label),
; not before the loop.
define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_umin_i32_addr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
|
|
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
|
|
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN1-NEXT: s_add_u32 s0, s6, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s7, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_min_u32_e32 v0, s4, v1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB30_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i32_addr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
|
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
|
|
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN2-NEXT: s_add_u32 s0, s6, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s7, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_min_u32_e32 v0, s4, v1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB30_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_umin_i32_addr64:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
|
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
|
|
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN3-NEXT: s_add_u32 s0, s6, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s7, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: flat_load_dword v1, v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_min_u32_e32 v0, s4, v1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB30_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_endpgm
|
|
; IR under test: index %out by %index (i32 elements), then perform the atomic
; unsigned-min with %in. %val is never read, so only the memory update is
; observable from this kernel.
entry:
|
|
%ptr = getelementptr i32, ptr %out, i64 %index
|
|
%val = atomicrmw volatile umin ptr %ptr, i32 %in seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Test: seq_cst volatile `atomicrmw umin` on element %index of the i32 array
; at %out, with the value returned by the atomic stored to %out2.
;
; The expectation lines below are autogenerated (see the NOTE at the top of
; the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand. The GCN1 / GCN2 / GCN3 prefixes are the bonaire /
; tonga / gfx900 RUN lines respectively. In this function the three
; expectation blocks differ only in the offsets of the initial s_load_*
; instructions.
;
; Shape of the expected code, as spelled out by the expectations:
;   * the address is formed as %out + (%index << 2) and the current value is
;     fetched once with flat_load_dword before the loop;
;   * the loop (.LBB31_1, %atomicrmw.start) begins with
;     `s_waitcnt vmcnt(0) lgkmcnt(0)`, copies the last seen value to v1,
;     computes v_min_u32 of %in and that value, then issues
;     flat_atomic_cmpswap ... glc;
;   * v_cmp_eq_u32 / s_or_b64 / s_andn2_b64 exec / s_cbranch_execnz repeat
;     the loop until the swap returned the value that was expected;
;   * after the loop exec is restored with s_or_b64 and v0 (the value the
;     cmpswap returned) is written with flat_store_dword to the address held
;     in s6/s7.
; Note that the wait for the pre-loop flat_load_dword is expected as the first
; instruction inside the loop, not before the loop.
define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
|
|
; GCN1-LABEL: atomic_umin_i32_ret_addr64:
|
|
; GCN1: ; %bb.0: ; %entry
|
|
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
|
|
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
|
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
|
|
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN1-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN1-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN1-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN1-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start
|
|
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN1-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN1-NEXT: v_min_u32_e32 v0, s8, v1
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN1-NEXT: buffer_wbinvl1_vol
|
|
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: s_cbranch_execnz .LBB31_1
|
|
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN1-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN1-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN1-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN1-NEXT: s_endpgm
|
|
;
|
|
; GCN2-LABEL: atomic_umin_i32_ret_addr64:
|
|
; GCN2: ; %bb.0: ; %entry
|
|
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
|
|
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
|
|
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN2-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN2-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN2-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN2-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start
|
|
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN2-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN2-NEXT: v_min_u32_e32 v0, s8, v1
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN2-NEXT: buffer_wbinvl1_vol
|
|
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: s_cbranch_execnz .LBB31_1
|
|
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN2-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN2-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN2-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN2-NEXT: s_endpgm
|
|
;
|
|
; GCN3-LABEL: atomic_umin_i32_ret_addr64:
|
|
; GCN3: ; %bb.0: ; %entry
|
|
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
|
|
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
|
|
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
|
|
; GCN3-NEXT: s_add_u32 s0, s4, s0
|
|
; GCN3-NEXT: s_addc_u32 s1, s5, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN3-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN3-NEXT: s_mov_b64 s[2:3], 0
|
|
; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start
|
|
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN3-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN3-NEXT: v_min_u32_e32 v0, s8, v1
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN3-NEXT: buffer_wbinvl1_vol
|
|
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GCN3-NEXT: s_cbranch_execnz .LBB31_1
|
|
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN3-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN3-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN3-NEXT: flat_store_dword v[1:2], v0
|
|
; GCN3-NEXT: s_endpgm
|
|
; IR under test: index %out by %index (i32 elements), perform the atomic
; unsigned-min with %in, and store the value the atomic returned (%val) to
; %out2, which keeps that value live past the expanded loop.
entry:
|
|
%ptr = getelementptr i32, ptr %out, i64 %index
|
|
%val = atomicrmw volatile umin ptr %ptr, i32 %in seq_cst
|
|
store i32 %val, ptr %out2
|
|
ret void
|
|
}
|