Files
clang-p2996/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll
Austin Kerbow e501ed84aa [AMDGPU] Don't flush vmcnt for loops with use/def pairs
Conditions for hoisting vmcnt with flat instructions should be similar to VMEM.
If there are use/def pairs in a loop body we cannot guarantee that hoisting the
waitcnt will be profitable. Better heuristics are needed to analyse whether
gains from avoiding waitcnt in loop bodies outweigh waiting for loads in the
preheader.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D151126
2023-06-02 22:55:12 -07:00

3253 lines
127 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN1 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN2 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN3 %s
define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_max_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s2, 16
; GCN1-NEXT: s_addc_u32 s1, s3, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_max_i32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB0_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s2, 16
; GCN2-NEXT: s_addc_u32 s1, s3, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_i32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB0_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_i32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB0_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in seq_cst
ret void
}
define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_max_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_max_i32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB1_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_max_i32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB1_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: v_max_i32_e32 v0, s2, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB1_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_max_i32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB2_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_i32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB2_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s6, s0
; GCN3-NEXT: s_addc_u32 s1, s7, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_i32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB2_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in seq_cst
ret void
}
define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_max_i32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB3_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_max_i32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB3_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: v_max_i32_e32 v0, s8, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB3_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_max_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_max_i32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB4_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_i32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB4_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_load_dword v1, v[0:1]
; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_i32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB4_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile max ptr %out, i32 %in seq_cst
ret void
}
define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_max_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: v_max_i32_e32 v0, s2, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB5_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: v_max_i32_e32 v0, s2, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB5_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: v_max_i32_e32 v0, s2, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB5_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile max ptr %out, i32 %in seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_max_i32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB6_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_i32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB6_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s6, s0
; GCN3-NEXT: s_addc_u32 s1, s7, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v1, v[0:1]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_i32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB6_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile max ptr %ptr, i32 %in seq_cst
ret void
}
define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_max_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_max_i32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB7_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_max_i32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB7_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v0, v[0:1]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: v_max_i32_e32 v0, s8, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB7_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile max ptr %ptr, i32 %in seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) {
; Seq_cst atomicrmw umax on a flat pointer at a constant +16-byte offset,
; result unused. Expanded to a flat_load_dword + flat_atomic_cmpswap retry
; loop (v_max_u32 computes the candidate). GCN1/GCN2 materialize the offset
; with s_add_u32/s_addc_u32; GCN3 folds it into the instruction's offset:16
; field and keeps the base pointer in SGPRs across the loop.
; GCN1-LABEL: atomic_umax_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s2, 16
; GCN1-NEXT: s_addc_u32 s1, s3, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB8_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_max_u32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB8_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s2, 16
; GCN2-NEXT: s_addc_u32 s1, s3, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB8_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_u32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB8_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT: .LBB8_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_u32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB8_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile umax ptr %gep, i32 %in seq_cst
ret void
}
define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; Same umax-at-offset expansion as @atomic_umax_i32_offset, but the RMW
; result is live: the loop's shuffle differs (the previous value is copied
; into v1 at the loop top so v0 can carry the returned value out) and after
; %atomicrmw.end the value is written to %out2 with flat_store_dword.
; GCN1-LABEL: atomic_umax_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: .LBB9_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_max_u32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB9_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: .LBB9_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_max_u32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB9_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT: .LBB9_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: v_max_u32_e32 v0, s2, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB9_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile umax ptr %gep, i32 %in seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; umax RMW at a dynamic 64-bit element index plus a constant +16 offset,
; result unused. The index is scaled by 4 with s_lshl_b64 ... 2 and added
; to the base with s_add_u32/s_addc_u32. GCN1/GCN2 then add the constant
; 16 with a second scalar add pair; GCN3 folds it into offset:16 on both
; the preheader load and the cmpswap.
; GCN1-LABEL: atomic_umax_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB10_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_max_u32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB10_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB10_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_u32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB10_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s6, s0
; GCN3-NEXT: s_addc_u32 s1, s7, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB10_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_u32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB10_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile umax ptr %gep, i32 %in seq_cst
ret void
}
define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; Combines the dynamic-index and used-result variants: the address is
; base + %index*4 (+16), the cmpswap loop returns the previous value in
; v0, and after %atomicrmw.end it is stored to %out2. GCN3 again encodes
; the +16 as offset:16 instead of scalar adds.
; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB11_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_max_u32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB11_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB11_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_max_u32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB11_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB11_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: v_max_u32_e32 v0, s8, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB11_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
%val = atomicrmw volatile umax ptr %gep, i32 %in seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) {
; Baseline case: umax RMW directly on %out with no offset and no use of the
; result. All three targets emit the identical load + cmpswap loop shape;
; only the kernel-argument load offsets differ (0x9/0xb on GCN1 vs
; 0x24/0x2c on GCN2/GCN3).
; GCN1-LABEL: atomic_umax_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: .LBB12_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_max_u32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB12_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: .LBB12_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_u32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB12_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_load_dword v1, v[0:1]
; GCN3-NEXT: .LBB12_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_u32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB12_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile umax ptr %out, i32 %in seq_cst
ret void
}
define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) {
; Like @atomic_umax_i32 but the previous value is used: the loop copies the
; prior value into v1 at the top of each iteration, and after
; %atomicrmw.end the result in v0 is stored to %out2. Identical code on
; all three targets apart from kernel-argument load offsets.
; GCN1-LABEL: atomic_umax_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: .LBB13_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: v_max_u32_e32 v0, s2, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB13_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: .LBB13_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: v_max_u32_e32 v0, s2, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB13_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
; GCN3-NEXT: .LBB13_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: v_max_u32_e32 v0, s2, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB13_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
%val = atomicrmw volatile umax ptr %out, i32 %in seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) {
; umax RMW at a dynamic 64-bit element index, no constant offset, result
; unused. Address is base + (%index << 2) computed entirely in scalar
; registers; the cmpswap loop itself is identical on all three targets.
; GCN1-LABEL: atomic_umax_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB14_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_max_u32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB14_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB14_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_u32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB14_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s6, s0
; GCN3-NEXT: s_addc_u32 s1, s7, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v1, v[0:1]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB14_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_u32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB14_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile umax ptr %ptr, i32 %in seq_cst
ret void
}
define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; Dynamic-index umax RMW whose previous value is stored to %out2. Address
; is base + (%index << 2); the loop keeps the returned value in v0 so the
; final flat_store_dword after %atomicrmw.end can write it out.
; GCN1-LABEL: atomic_umax_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB15_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_max_u32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB15_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB15_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_max_u32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB15_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v0, v[0:1]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB15_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: v_max_u32_e32 v0, s8, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB15_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile umax ptr %ptr, i32 %in seq_cst
store i32 %val, ptr %out2
ret void
}
define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) {
; Signed-min counterpart of @atomic_umax_i32_offset: seq_cst atomicrmw min
; at a constant +16 offset, result unused. Only the candidate computation
; changes (v_min_i32_e32 instead of v_max_u32_e32); GCN3 still folds the
; offset into offset:16.
; GCN1-LABEL: atomic_min_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s2, 16
; GCN1-NEXT: s_addc_u32 s1, s3, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB16_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_min_i32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB16_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s2, 16
; GCN2-NEXT: s_addc_u32 s1, s3, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB16_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_i32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB16_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT: .LBB16_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_i32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB16_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile min ptr %gep, i32 %in seq_cst
ret void
}
define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_min_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: .LBB17_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_min_i32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB17_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: .LBB17_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_min_i32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB17_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT: .LBB17_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: v_min_i32_e32 v0, s2, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB17_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
; seq_cst atomicrmw min at offset 16 whose result is used: expands to a
; flat_atomic_cmpswap retry loop, and the pre-op value (v0) is stored to
; %out2 after the loop. GCN3 (gfx900) folds the 16-byte offset into the
; flat instructions; GCN1/GCN2 materialize it with s_add_u32/s_addc_u32.
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB18_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_min_i32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB18_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB18_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_i32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB18_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s6, s0
; GCN3-NEXT: s_addc_u32 s1, s7, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB18_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_i32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB18_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
; seq_cst atomicrmw min on a 64-bit-indexed element (%out + %index*4) plus
; a constant 16-byte offset, result unused: the index is scaled with
; s_lshl_b64 and added with carry, then the op expands to a
; flat_atomic_cmpswap retry loop. GCN3 keeps the constant offset in the
; instruction's offset:16 field.
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in seq_cst
  ret void
}
define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB19_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_min_i32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB19_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB19_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_min_i32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB19_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB19_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: v_min_i32_e32 v0, s8, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB19_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
; Combination of the addr64 and ret variants: 64-bit index plus constant
; 16-byte offset, cmpswap retry loop, and the pre-op value stored to %out2
; after the loop.
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_min_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: .LBB20_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_min_i32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB20_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: .LBB20_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_i32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB20_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_load_dword v1, v[0:1]
; GCN3-NEXT: .LBB20_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_i32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB20_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
; Base case: seq_cst atomicrmw min directly on %out (no offset, no index,
; result unused) still expands to the flat_atomic_cmpswap retry loop on all
; three subtargets.
entry:
  %val = atomicrmw volatile min ptr %out, i32 %in seq_cst
  ret void
}
define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_min_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: .LBB21_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: v_min_i32_e32 v0, s2, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB21_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: .LBB21_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: v_min_i32_e32 v0, s2, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB21_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
; GCN3-NEXT: .LBB21_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: v_min_i32_e32 v0, s2, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB21_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
; Same as atomic_min_i32 but the returned pre-op value is live: after the
; cmpswap loop the exec mask is restored and v0 is stored to %out2.
entry:
  %val = atomicrmw volatile min ptr %out, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB22_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_min_i32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB22_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB22_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_i32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB22_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s6, s0
; GCN3-NEXT: s_addc_u32 s1, s7, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v1, v[0:1]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB22_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_i32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB22_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
; 64-bit dynamic index only (no constant offset), result unused: the
; address is computed with s_lshl_b64 + add-with-carry and the min expands
; to the flat_atomic_cmpswap retry loop; all three subtargets emit the
; same shape here.
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile min ptr %ptr, i32 %in seq_cst
  ret void
}
define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB23_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_min_i32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB23_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB23_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_min_i32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB23_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v0, v[0:1]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB23_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: v_min_i32_e32 v0, s8, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB23_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
; 64-bit dynamic index with the result live: cmpswap retry loop followed by
; a flat_store_dword of the pre-op value to %out2.
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile min ptr %ptr, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umin_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s2, 16
; GCN1-NEXT: s_addc_u32 s1, s3, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB24_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_min_u32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB24_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s2, 16
; GCN2-NEXT: s_addc_u32 s1, s3, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB24_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_u32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB24_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT: .LBB24_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_u32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB24_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
; Unsigned variant of atomic_min_i32_offset: the value computation uses
; v_min_u32_e32 instead of v_min_i32_e32; the surrounding cmpswap retry
; loop and offset handling are otherwise identical.
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile umin ptr %gep, i32 %in seq_cst
  ret void
}
; Unsigned-min RMW at %out + 16 bytes (seq_cst), old value stored to %out2.
; All three targets expand the flat umin into a v_min_u32 + flat_atomic_cmpswap
; retry loop. GCN3 folds the 16-byte displacement into the flat instructions
; (offset:16); GCN1/GCN2 materialize it with s_add_u32/s_addc_u32 instead.
define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umin_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s4, 16
; GCN1-NEXT: s_addc_u32 s1, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: .LBB25_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_min_u32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB25_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: .LBB25_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_min_u32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB25_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT: .LBB25_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: v_min_u32_e32 v0, s2, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB25_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile umin ptr %gep, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
; Unsigned-min RMW at %out + %index*4 + 16 bytes (seq_cst); result unused.
; Address math is done in scalar registers (s_lshl_b64 of the index, then
; s_add_u32/s_addc_u32). The umin expands to a v_min_u32 + flat_atomic_cmpswap
; retry loop; only GCN3 folds the trailing 16-byte offset into the flat ops.
define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB26_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_min_u32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB26_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB26_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_u32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB26_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s6, s0
; GCN3-NEXT: s_addc_u32 s1, s7, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB26_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_u32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB26_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile umin ptr %gep, i32 %in seq_cst
  ret void
}
; Unsigned-min RMW at %out + %index*4 + 16 bytes (seq_cst); the returned old
; value is stored to %out2. Combines the addr64 scalar address computation
; (s_lshl_b64 + s_add/addc) with the cmpswap retry-loop expansion; GCN3 keeps
; the 16-byte displacement as an offset:16 modifier on the flat instructions.
define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB27_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_min_u32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB27_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB27_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_min_u32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB27_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB27_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: v_min_u32_e32 v0, s8, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB27_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile umin ptr %gep, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
; Baseline case: unsigned-min RMW directly on %out (no offset, no index),
; result unused. All targets produce the same v_min_u32 + flat_atomic_cmpswap
; retry loop, differing only in kernarg load offsets (0x9/0xb vs 0x24/0x2c).
define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umin_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: .LBB28_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_min_u32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB28_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: .LBB28_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_u32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB28_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_load_dword v1, v[0:1]
; GCN3-NEXT: .LBB28_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_u32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB28_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile umin ptr %out, i32 %in seq_cst
  ret void
}
; Unsigned-min RMW directly on %out with the old value stored to %out2.
; Same cmpswap retry-loop expansion on all targets; the loop carries the
; previous value in v0/v1 and the final old value is flat-stored after the
; exec mask is restored at %atomicrmw.end.
define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umin_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: .LBB29_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: v_min_u32_e32 v0, s2, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB29_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: .LBB29_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: v_min_u32_e32 v0, s2, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB29_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
; GCN3-NEXT: .LBB29_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: v_mov_b32_e32 v3, s5
; GCN3-NEXT: v_min_u32_e32 v0, s2, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB29_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile umin ptr %out, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}
; Unsigned-min RMW at %out + %index*4 (seq_cst, no extra offset); result
; unused. All three targets emit identical structure: scalar address math
; (s_lshl_b64 + s_add/addc) followed by the v_min_u32 + flat_atomic_cmpswap
; retry loop.
define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s6, s0
; GCN1-NEXT: s_addc_u32 s1, s7, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_min_u32_e32 v0, s4, v1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB30_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_u32_e32 v0, s4, v1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB30_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s6, s0
; GCN3-NEXT: s_addc_u32 s1, s7, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v1, v[0:1]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_u32_e32 v0, s4, v1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB30_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile umin ptr %ptr, i32 %in seq_cst
  ret void
}
; Unsigned-min RMW at %out + %index*4 (seq_cst) with the old value stored to
; %out2. Scalar address math feeds the same v_min_u32 + flat_atomic_cmpswap
; retry loop on all targets; the final old value is flat-stored to %out2 after
; the loop exits and the exec mask is restored.
define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v0, v[0:1]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_min_u32_e32 v0, s8, v1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB31_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN1-NEXT: v_mov_b32_e32 v1, s6
; GCN1-NEXT: v_mov_b32_e32 v2, s7
; GCN1-NEXT: flat_store_dword v[1:2], v0
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v0, v[0:1]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_min_u32_e32 v0, s8, v1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB31_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN2-NEXT: v_mov_b32_e32 v1, s6
; GCN2-NEXT: v_mov_b32_e32 v2, s7
; GCN2-NEXT: flat_store_dword v[1:2], v0
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v0, v[0:1]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v1, v0
; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: v_min_u32_e32 v0, s8, v1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB31_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN3-NEXT: v_mov_b32_e32 v1, s6
; GCN3-NEXT: v_mov_b32_e32 v2, s7
; GCN3-NEXT: flat_store_dword v[1:2], v0
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile umin ptr %ptr, i32 %in seq_cst
  store i32 %val, ptr %out2
  ret void
}